From 0ca03130935940dbc86dc82171e977b5d211b688 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 31 Jan 2022 10:47:31 -0600 Subject: [PATCH 01/32] Add compile-time constants (macros) for the MR and NR block sizes. These are defined in sub-configuration-specific header files, which are only included by reference kernels. --- common.mk | 1 + config/a64fx/bli_kernel_defs_a64fx.h | 52 ++++++++++++++++ config/armsve/bli_kernel_defs_armsve.h | 58 ++++++++++++++++++ config/bgq/bli_kernel_defs_bgq.h | 48 +++++++++++++++ config/bulldozer/bli_kernel_defs_bulldozer.h | 52 ++++++++++++++++ config/cortexa15/bli_kernel_defs_cortexa15.h | 48 +++++++++++++++ config/cortexa53/bli_kernel_defs_cortexa53.h | 48 +++++++++++++++ config/cortexa57/bli_kernel_defs_cortexa57.h | 48 +++++++++++++++ config/cortexa9/bli_kernel_defs_cortexa9.h | 48 +++++++++++++++ config/excavator/bli_kernel_defs_excavator.h | 52 ++++++++++++++++ config/firestorm/bli_kernel_defs_firestorm.h | 48 +++++++++++++++ config/generic/bli_kernel_defs_generic.h | 42 +++++++++++++ config/haswell/bli_kernel_defs_haswell.h | 52 ++++++++++++++++ config/knc/bli_kernel_defs_knc.h | 48 +++++++++++++++ config/knl/bli_kernel_defs_knl.h | 48 +++++++++++++++ config/penryn/bli_kernel_defs_penryn.h | 48 +++++++++++++++ .../piledriver/bli_kernel_defs_piledriver.h | 52 ++++++++++++++++ config/power10/bli_kernel_defs_power10.h | 48 +++++++++++++++ config/power7/bli_kernel_defs_power7.h | 46 ++++++++++++++ config/power9/bli_kernel_defs_power9.h | 46 ++++++++++++++ .../sandybridge/bli_kernel_defs_sandybridge.h | 52 ++++++++++++++++ config/skx/bli_kernel_defs_skx.h | 48 +++++++++++++++ .../steamroller/bli_kernel_defs_steamroller.h | 52 ++++++++++++++++ config/template/bli_kernel_defs_template.h | 60 +++++++++++++++++++ config/thunderx2/bli_kernel_defs_thunderx2.h | 48 +++++++++++++++ config/zen/bli_kernel_defs_zen.h | 52 ++++++++++++++++ config/zen2/bli_kernel_defs_zen2.h | 52 ++++++++++++++++ 27 files changed, 1297 insertions(+) 
create mode 100644 config/a64fx/bli_kernel_defs_a64fx.h create mode 100644 config/armsve/bli_kernel_defs_armsve.h create mode 100644 config/bgq/bli_kernel_defs_bgq.h create mode 100644 config/bulldozer/bli_kernel_defs_bulldozer.h create mode 100644 config/cortexa15/bli_kernel_defs_cortexa15.h create mode 100644 config/cortexa53/bli_kernel_defs_cortexa53.h create mode 100644 config/cortexa57/bli_kernel_defs_cortexa57.h create mode 100644 config/cortexa9/bli_kernel_defs_cortexa9.h create mode 100644 config/excavator/bli_kernel_defs_excavator.h create mode 100644 config/firestorm/bli_kernel_defs_firestorm.h create mode 100644 config/generic/bli_kernel_defs_generic.h create mode 100644 config/haswell/bli_kernel_defs_haswell.h create mode 100644 config/knc/bli_kernel_defs_knc.h create mode 100644 config/knl/bli_kernel_defs_knl.h create mode 100644 config/penryn/bli_kernel_defs_penryn.h create mode 100644 config/piledriver/bli_kernel_defs_piledriver.h create mode 100644 config/power10/bli_kernel_defs_power10.h create mode 100644 config/power7/bli_kernel_defs_power7.h create mode 100644 config/power9/bli_kernel_defs_power9.h create mode 100644 config/sandybridge/bli_kernel_defs_sandybridge.h create mode 100644 config/skx/bli_kernel_defs_skx.h create mode 100644 config/steamroller/bli_kernel_defs_steamroller.h create mode 100644 config/template/bli_kernel_defs_template.h create mode 100644 config/thunderx2/bli_kernel_defs_thunderx2.h create mode 100644 config/zen/bli_kernel_defs_zen.h create mode 100644 config/zen2/bli_kernel_defs_zen2.h diff --git a/common.mk b/common.mk index 5f2d30c9bf..13449bd203 100644 --- a/common.mk +++ b/common.mk @@ -129,6 +129,7 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ diff --git a/config/a64fx/bli_kernel_defs_a64fx.h 
b/config/a64fx/bli_kernel_defs_a64fx.h new file mode 100644 index 0000000000..397c6caa0b --- /dev/null +++ b/config/a64fx/bli_kernel_defs_a64fx.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 32 +#define BLIS_MR_D 16 +#define BLIS_MR_C 16 +#define BLIS_MR_Z 8 + +#define BLIS_NR_S 10 +#define BLIS_NR_D 10 +#define BLIS_NR_C 10 +#define BLIS_NR_Z 10 + +//#endif + diff --git a/config/armsve/bli_kernel_defs_armsve.h b/config/armsve/bli_kernel_defs_armsve.h new file mode 100644 index 0000000000..4baa5028a3 --- /dev/null +++ b/config/armsve/bli_kernel_defs_armsve.h @@ -0,0 +1,58 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +// +// The armsve configuration handles both 256-bit and 512-bit SVE vectors, +// so it is not possible to define specific register block sizes. Thus, +// armsve can't use reference kernels! +// + +#define BLIS_MR_S -1 +#define BLIS_MR_D -1 +#define BLIS_MR_C -1 +#define BLIS_MR_Z -1 + +#define BLIS_NR_S -1 +#define BLIS_NR_D -1 +#define BLIS_NR_C -1 +#define BLIS_NR_Z -1 + +//#endif + diff --git a/config/bgq/bli_kernel_defs_bgq.h b/config/bgq/bli_kernel_defs_bgq.h new file mode 100644 index 0000000000..135ccabb24 --- /dev/null +++ b/config/bgq/bli_kernel_defs_bgq.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_D 8 +#define BLIS_MR_Z 4 + +#define BLIS_NR_D 8 +#define BLIS_NR_Z 4 + +//#endif + diff --git a/config/bulldozer/bli_kernel_defs_bulldozer.h b/config/bulldozer/bli_kernel_defs_bulldozer.h new file mode 100644 index 0000000000..903701ef0b --- /dev/null +++ b/config/bulldozer/bli_kernel_defs_bulldozer.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 8 +#define BLIS_MR_D 4 +#define BLIS_MR_C 8 +#define BLIS_MR_Z 4 + +#define BLIS_NR_S 8 +#define BLIS_NR_D 6 +#define BLIS_NR_C 4 +#define BLIS_NR_Z 4 + +//#endif + diff --git a/config/cortexa15/bli_kernel_defs_cortexa15.h b/config/cortexa15/bli_kernel_defs_cortexa15.h new file mode 100644 index 0000000000..9484c1771c --- /dev/null +++ b/config/cortexa15/bli_kernel_defs_cortexa15.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 4 +#define BLIS_MR_D 4 + +#define BLIS_NR_S 4 +#define BLIS_NR_D 4 + +//#endif + diff --git a/config/cortexa53/bli_kernel_defs_cortexa53.h b/config/cortexa53/bli_kernel_defs_cortexa53.h new file mode 100644 index 0000000000..0d6d98953f --- /dev/null +++ b/config/cortexa53/bli_kernel_defs_cortexa53.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 8 +#define BLIS_MR_D 6 + +#define BLIS_NR_S 12 +#define BLIS_NR_D 8 + +//#endif + diff --git a/config/cortexa57/bli_kernel_defs_cortexa57.h b/config/cortexa57/bli_kernel_defs_cortexa57.h new file mode 100644 index 0000000000..0d6d98953f --- /dev/null +++ b/config/cortexa57/bli_kernel_defs_cortexa57.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 8 +#define BLIS_MR_D 6 + +#define BLIS_NR_S 12 +#define BLIS_NR_D 8 + +//#endif + diff --git a/config/cortexa9/bli_kernel_defs_cortexa9.h b/config/cortexa9/bli_kernel_defs_cortexa9.h new file mode 100644 index 0000000000..9484c1771c --- /dev/null +++ b/config/cortexa9/bli_kernel_defs_cortexa9.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 4 +#define BLIS_MR_D 4 + +#define BLIS_NR_S 4 +#define BLIS_NR_D 4 + +//#endif + diff --git a/config/excavator/bli_kernel_defs_excavator.h b/config/excavator/bli_kernel_defs_excavator.h new file mode 100644 index 0000000000..73181b3dfd --- /dev/null +++ b/config/excavator/bli_kernel_defs_excavator.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 16 +#define BLIS_MR_D 8 +#define BLIS_MR_C 4 +#define BLIS_MR_Z 2 + +#define BLIS_NR_S 3 +#define BLIS_NR_D 3 +#define BLIS_NR_C 2 +#define BLIS_NR_Z 2 + +//#endif + diff --git a/config/firestorm/bli_kernel_defs_firestorm.h b/config/firestorm/bli_kernel_defs_firestorm.h new file mode 100644 index 0000000000..0d6d98953f --- /dev/null +++ b/config/firestorm/bli_kernel_defs_firestorm.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 8 +#define BLIS_MR_D 6 + +#define BLIS_NR_S 12 +#define BLIS_NR_D 8 + +//#endif + diff --git a/config/generic/bli_kernel_defs_generic.h b/config/generic/bli_kernel_defs_generic.h new file mode 100644 index 0000000000..db2f32947b --- /dev/null +++ b/config/generic/bli_kernel_defs_generic.h @@ -0,0 +1,42 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +//#endif + diff --git a/config/haswell/bli_kernel_defs_haswell.h b/config/haswell/bli_kernel_defs_haswell.h new file mode 100644 index 0000000000..7d6b333044 --- /dev/null +++ b/config/haswell/bli_kernel_defs_haswell.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 6 +#define BLIS_MR_D 6 +#define BLIS_MR_C 3 +#define BLIS_MR_Z 3 + +#define BLIS_NR_S 16 +#define BLIS_NR_D 8 +#define BLIS_NR_C 8 +#define BLIS_NR_Z 4 + +//#endif + diff --git a/config/knc/bli_kernel_defs_knc.h b/config/knc/bli_kernel_defs_knc.h new file mode 100644 index 0000000000..88abba4418 --- /dev/null +++ b/config/knc/bli_kernel_defs_knc.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_D 30 + +#define BLIS_NR_D 8 + +#define BLIS_PACKMR_D 32 + +//#endif + diff --git a/config/knl/bli_kernel_defs_knl.h b/config/knl/bli_kernel_defs_knl.h new file mode 100644 index 0000000000..081b2060b9 --- /dev/null +++ b/config/knl/bli_kernel_defs_knl.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 24 +#define BLIS_MR_D 24 + +#define BLIS_NR_S 16 +#define BLIS_NR_D 8 + +//#endif + diff --git a/config/penryn/bli_kernel_defs_penryn.h b/config/penryn/bli_kernel_defs_penryn.h new file mode 100644 index 0000000000..2a3451399e --- /dev/null +++ b/config/penryn/bli_kernel_defs_penryn.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 8 +#define BLIS_MR_D 4 + +#define BLIS_NR_S 4 +#define BLIS_NR_D 4 + +//#endif + diff --git a/config/piledriver/bli_kernel_defs_piledriver.h b/config/piledriver/bli_kernel_defs_piledriver.h new file mode 100644 index 0000000000..73181b3dfd --- /dev/null +++ b/config/piledriver/bli_kernel_defs_piledriver.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 16 +#define BLIS_MR_D 8 +#define BLIS_MR_C 4 +#define BLIS_MR_Z 2 + +#define BLIS_NR_S 3 +#define BLIS_NR_D 3 +#define BLIS_NR_C 2 +#define BLIS_NR_Z 2 + +//#endif + diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h new file mode 100644 index 0000000000..4e4567f639 --- /dev/null +++ b/config/power10/bli_kernel_defs_power10.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 8 +#define BLIS_MR_D 8 + +#define BLIS_NR_S 16 +#define BLIS_NR_D 8 + +//#endif + diff --git a/config/power7/bli_kernel_defs_power7.h b/config/power7/bli_kernel_defs_power7.h new file mode 100644 index 0000000000..0bbe12aad9 --- /dev/null +++ b/config/power7/bli_kernel_defs_power7.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_D 8 + +#define BLIS_NR_D 4 + +//#endif + diff --git a/config/power9/bli_kernel_defs_power9.h b/config/power9/bli_kernel_defs_power9.h new file mode 100644 index 0000000000..c59ddcf8d3 --- /dev/null +++ b/config/power9/bli_kernel_defs_power9.h @@ -0,0 +1,46 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_D 12 + +#define BLIS_NR_D 6 + +//#endif + diff --git a/config/sandybridge/bli_kernel_defs_sandybridge.h b/config/sandybridge/bli_kernel_defs_sandybridge.h new file mode 100644 index 0000000000..a599061650 --- /dev/null +++ b/config/sandybridge/bli_kernel_defs_sandybridge.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 8 +#define BLIS_MR_D 8 +#define BLIS_MR_C 8 +#define BLIS_MR_Z 4 + +#define BLIS_NR_S 8 +#define BLIS_NR_D 4 +#define BLIS_NR_C 4 +#define BLIS_NR_Z 4 + +//#endif + diff --git a/config/skx/bli_kernel_defs_skx.h b/config/skx/bli_kernel_defs_skx.h new file mode 100644 index 0000000000..97062493be --- /dev/null +++ b/config/skx/bli_kernel_defs_skx.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 32 +#define BLIS_MR_D 16 + +#define BLIS_NR_S 12 +#define BLIS_NR_D 14 + +//#endif + diff --git a/config/steamroller/bli_kernel_defs_steamroller.h b/config/steamroller/bli_kernel_defs_steamroller.h new file mode 100644 index 0000000000..73181b3dfd --- /dev/null +++ b/config/steamroller/bli_kernel_defs_steamroller.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 16 +#define BLIS_MR_D 8 +#define BLIS_MR_C 4 +#define BLIS_MR_Z 2 + +#define BLIS_NR_S 3 +#define BLIS_NR_D 3 +#define BLIS_NR_C 2 +#define BLIS_NR_Z 2 + +//#endif + diff --git a/config/template/bli_kernel_defs_template.h b/config/template/bli_kernel_defs_template.h new file mode 100644 index 0000000000..be4ff3f984 --- /dev/null +++ b/config/template/bli_kernel_defs_template.h @@ -0,0 +1,60 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +// +// Only defined for block sizes which are not taken as the default (i.e. when +// an optimized kernel is provided). +// + +#define BLIS_MR_Z 4 + +#define BLIS_NR_Z 4 + +// +// PACKMR/PACKNR do not need to be defined unless they are different from the +// "normal" MR/NR. +// + +//#define BLIS_PACKMR_Z 4 + +//#define BLIS_PACKNR_Z 4 + +//#endif + diff --git a/config/thunderx2/bli_kernel_defs_thunderx2.h b/config/thunderx2/bli_kernel_defs_thunderx2.h new file mode 100644 index 0000000000..0d6d98953f --- /dev/null +++ b/config/thunderx2/bli_kernel_defs_thunderx2.h @@ -0,0 +1,48 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 8 +#define BLIS_MR_D 6 + +#define BLIS_NR_S 12 +#define BLIS_NR_D 8 + +//#endif + diff --git a/config/zen/bli_kernel_defs_zen.h b/config/zen/bli_kernel_defs_zen.h new file mode 100644 index 0000000000..7d6b333044 --- /dev/null +++ b/config/zen/bli_kernel_defs_zen.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 6 +#define BLIS_MR_D 6 +#define BLIS_MR_C 3 +#define BLIS_MR_Z 3 + +#define BLIS_NR_S 16 +#define BLIS_NR_D 8 +#define BLIS_NR_C 8 +#define BLIS_NR_Z 4 + +//#endif + diff --git a/config/zen2/bli_kernel_defs_zen2.h b/config/zen2/bli_kernel_defs_zen2.h new file mode 100644 index 0000000000..7d6b333044 --- /dev/null +++ b/config/zen2/bli_kernel_defs_zen2.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_S 6 +#define BLIS_MR_D 6 +#define BLIS_MR_C 3 +#define BLIS_MR_Z 3 + +#define BLIS_NR_S 16 +#define BLIS_NR_D 8 +#define BLIS_NR_C 8 +#define BLIS_NR_Z 4 + +//#endif + From 88dd4d654c24ff2aa55ef9a9ca6afaae0d2df4a6 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 5 Feb 2022 13:26:29 -0600 Subject: [PATCH 02/32] PoC implementation of reference gemm kernel. The gemm reference kernel now uses the configuration-dependent BLIS_MR_x/BLIS_NR_x macros to control unrolling, rather than fixed values. This fixes #259 and replaces PR #547. 
--- common.mk | 1 + config/a64fx/bli_kernel_defs_a64fx.h | 18 ++--- config/armsve/bli_kernel_defs_armsve.h | 18 ++--- config/bgq/bli_kernel_defs_bgq.h | 8 +-- config/bulldozer/bli_kernel_defs_bulldozer.h | 18 ++--- config/cortexa15/bli_kernel_defs_cortexa15.h | 8 +-- config/cortexa53/bli_kernel_defs_cortexa53.h | 8 +-- config/cortexa57/bli_kernel_defs_cortexa57.h | 8 +-- config/cortexa9/bli_kernel_defs_cortexa9.h | 8 +-- config/excavator/bli_kernel_defs_excavator.h | 18 ++--- config/firestorm/bli_kernel_defs_firestorm.h | 8 +-- config/haswell/bli_kernel_defs_haswell.h | 18 ++--- config/knc/bli_kernel_defs_knc.h | 6 +- config/knl/bli_kernel_defs_knl.h | 8 +-- config/penryn/bli_kernel_defs_penryn.h | 8 +-- .../piledriver/bli_kernel_defs_piledriver.h | 18 ++--- config/power10/bli_kernel_defs_power10.h | 8 +-- config/power7/bli_kernel_defs_power7.h | 4 +- config/power9/bli_kernel_defs_power9.h | 4 +- .../sandybridge/bli_kernel_defs_sandybridge.h | 18 ++--- config/skx/bli_kernel_defs_skx.h | 8 +-- .../steamroller/bli_kernel_defs_steamroller.h | 18 ++--- config/template/bli_kernel_defs_template.h | 8 +-- config/thunderx2/bli_kernel_defs_thunderx2.h | 8 +-- config/zen/bli_kernel_defs_zen.h | 18 ++--- config/zen2/bli_kernel_defs_zen2.h | 18 ++--- frame/include/bli_kernel_macro_defs.h | 68 +++++++++++++++++++ ref_kernels/3/bli_gemm_ref.c | 35 +++++----- 28 files changed, 231 insertions(+), 163 deletions(-) diff --git a/common.mk b/common.mk index 13449bd203..6ce7abc79c 100644 --- a/common.mk +++ b/common.mk @@ -129,6 +129,7 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -DBLIS_IN_KERNEL=1 \ -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) diff --git a/config/a64fx/bli_kernel_defs_a64fx.h b/config/a64fx/bli_kernel_defs_a64fx.h index 397c6caa0b..2c5c972049 100644 --- a/config/a64fx/bli_kernel_defs_a64fx.h +++ b/config/a64fx/bli_kernel_defs_a64fx.h @@ -38,15 +38,15 @@ // -- 
REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 32 -#define BLIS_MR_D 16 -#define BLIS_MR_C 16 -#define BLIS_MR_Z 8 - -#define BLIS_NR_S 10 -#define BLIS_NR_D 10 -#define BLIS_NR_C 10 -#define BLIS_NR_Z 10 +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 +#define BLIS_MR_c 16 +#define BLIS_MR_z 8 + +#define BLIS_NR_s 10 +#define BLIS_NR_d 10 +#define BLIS_NR_c 10 +#define BLIS_NR_z 10 //#endif diff --git a/config/armsve/bli_kernel_defs_armsve.h b/config/armsve/bli_kernel_defs_armsve.h index 4baa5028a3..8496cb0b77 100644 --- a/config/armsve/bli_kernel_defs_armsve.h +++ b/config/armsve/bli_kernel_defs_armsve.h @@ -44,15 +44,15 @@ // armsve can't use reference kernels! // -#define BLIS_MR_S -1 -#define BLIS_MR_D -1 -#define BLIS_MR_C -1 -#define BLIS_MR_Z -1 - -#define BLIS_NR_S -1 -#define BLIS_NR_D -1 -#define BLIS_NR_C -1 -#define BLIS_NR_Z -1 +#define BLIS_MR_s -1 +#define BLIS_MR_d -1 +#define BLIS_MR_c -1 +#define BLIS_MR_z -1 + +#define BLIS_NR_s -1 +#define BLIS_NR_d -1 +#define BLIS_NR_c -1 +#define BLIS_NR_z -1 //#endif diff --git a/config/bgq/bli_kernel_defs_bgq.h b/config/bgq/bli_kernel_defs_bgq.h index 135ccabb24..bd3962e45a 100644 --- a/config/bgq/bli_kernel_defs_bgq.h +++ b/config/bgq/bli_kernel_defs_bgq.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_D 8 -#define BLIS_MR_Z 4 +#define BLIS_MR_d 8 +#define BLIS_MR_z 4 -#define BLIS_NR_D 8 -#define BLIS_NR_Z 4 +#define BLIS_NR_d 8 +#define BLIS_NR_z 4 //#endif diff --git a/config/bulldozer/bli_kernel_defs_bulldozer.h b/config/bulldozer/bli_kernel_defs_bulldozer.h index 903701ef0b..ea1e58e66b 100644 --- a/config/bulldozer/bli_kernel_defs_bulldozer.h +++ b/config/bulldozer/bli_kernel_defs_bulldozer.h @@ -38,15 +38,15 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 8 -#define BLIS_MR_D 4 -#define BLIS_MR_C 8 -#define BLIS_MR_Z 4 - -#define 
BLIS_NR_S 8 -#define BLIS_NR_D 6 -#define BLIS_NR_C 4 -#define BLIS_NR_Z 4 +#define BLIS_MR_s 8 +#define BLIS_MR_d 4 +#define BLIS_MR_c 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_s 8 +#define BLIS_NR_d 6 +#define BLIS_NR_c 4 +#define BLIS_NR_z 4 //#endif diff --git a/config/cortexa15/bli_kernel_defs_cortexa15.h b/config/cortexa15/bli_kernel_defs_cortexa15.h index 9484c1771c..9c413f7f84 100644 --- a/config/cortexa15/bli_kernel_defs_cortexa15.h +++ b/config/cortexa15/bli_kernel_defs_cortexa15.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 4 -#define BLIS_MR_D 4 +#define BLIS_MR_s 4 +#define BLIS_MR_d 4 -#define BLIS_NR_S 4 -#define BLIS_NR_D 4 +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 //#endif diff --git a/config/cortexa53/bli_kernel_defs_cortexa53.h b/config/cortexa53/bli_kernel_defs_cortexa53.h index 0d6d98953f..60292099cc 100644 --- a/config/cortexa53/bli_kernel_defs_cortexa53.h +++ b/config/cortexa53/bli_kernel_defs_cortexa53.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 8 -#define BLIS_MR_D 6 +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 -#define BLIS_NR_S 12 -#define BLIS_NR_D 8 +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 //#endif diff --git a/config/cortexa57/bli_kernel_defs_cortexa57.h b/config/cortexa57/bli_kernel_defs_cortexa57.h index 0d6d98953f..60292099cc 100644 --- a/config/cortexa57/bli_kernel_defs_cortexa57.h +++ b/config/cortexa57/bli_kernel_defs_cortexa57.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 8 -#define BLIS_MR_D 6 +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 -#define BLIS_NR_S 12 -#define BLIS_NR_D 8 +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 //#endif diff --git a/config/cortexa9/bli_kernel_defs_cortexa9.h b/config/cortexa9/bli_kernel_defs_cortexa9.h index 9484c1771c..9c413f7f84 100644 --- 
a/config/cortexa9/bli_kernel_defs_cortexa9.h +++ b/config/cortexa9/bli_kernel_defs_cortexa9.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 4 -#define BLIS_MR_D 4 +#define BLIS_MR_s 4 +#define BLIS_MR_d 4 -#define BLIS_NR_S 4 -#define BLIS_NR_D 4 +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 //#endif diff --git a/config/excavator/bli_kernel_defs_excavator.h b/config/excavator/bli_kernel_defs_excavator.h index 73181b3dfd..df4a8c4118 100644 --- a/config/excavator/bli_kernel_defs_excavator.h +++ b/config/excavator/bli_kernel_defs_excavator.h @@ -38,15 +38,15 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 16 -#define BLIS_MR_D 8 -#define BLIS_MR_C 4 -#define BLIS_MR_Z 2 - -#define BLIS_NR_S 3 -#define BLIS_NR_D 3 -#define BLIS_NR_C 2 -#define BLIS_NR_Z 2 +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 //#endif diff --git a/config/firestorm/bli_kernel_defs_firestorm.h b/config/firestorm/bli_kernel_defs_firestorm.h index 0d6d98953f..60292099cc 100644 --- a/config/firestorm/bli_kernel_defs_firestorm.h +++ b/config/firestorm/bli_kernel_defs_firestorm.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 8 -#define BLIS_MR_D 6 +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 -#define BLIS_NR_S 12 -#define BLIS_NR_D 8 +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 //#endif diff --git a/config/haswell/bli_kernel_defs_haswell.h b/config/haswell/bli_kernel_defs_haswell.h index 7d6b333044..c5bc8d63f3 100644 --- a/config/haswell/bli_kernel_defs_haswell.h +++ b/config/haswell/bli_kernel_defs_haswell.h @@ -38,15 +38,15 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 6 -#define BLIS_MR_D 6 -#define BLIS_MR_C 3 -#define BLIS_MR_Z 3 - 
-#define BLIS_NR_S 16 -#define BLIS_NR_D 8 -#define BLIS_NR_C 8 -#define BLIS_NR_Z 4 +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 //#endif diff --git a/config/knc/bli_kernel_defs_knc.h b/config/knc/bli_kernel_defs_knc.h index 88abba4418..0ae6d1b75c 100644 --- a/config/knc/bli_kernel_defs_knc.h +++ b/config/knc/bli_kernel_defs_knc.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_D 30 +#define BLIS_MR_d 30 -#define BLIS_NR_D 8 +#define BLIS_NR_d 8 -#define BLIS_PACKMR_D 32 +#define BLIS_PACKMR_d 32 //#endif diff --git a/config/knl/bli_kernel_defs_knl.h b/config/knl/bli_kernel_defs_knl.h index 081b2060b9..ce514bb21a 100644 --- a/config/knl/bli_kernel_defs_knl.h +++ b/config/knl/bli_kernel_defs_knl.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 24 -#define BLIS_MR_D 24 +#define BLIS_MR_s 24 +#define BLIS_MR_d 24 -#define BLIS_NR_S 16 -#define BLIS_NR_D 8 +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 //#endif diff --git a/config/penryn/bli_kernel_defs_penryn.h b/config/penryn/bli_kernel_defs_penryn.h index 2a3451399e..f1e483646a 100644 --- a/config/penryn/bli_kernel_defs_penryn.h +++ b/config/penryn/bli_kernel_defs_penryn.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 8 -#define BLIS_MR_D 4 +#define BLIS_MR_s 8 +#define BLIS_MR_d 4 -#define BLIS_NR_S 4 -#define BLIS_NR_D 4 +#define BLIS_NR_s 4 +#define BLIS_NR_d 4 //#endif diff --git a/config/piledriver/bli_kernel_defs_piledriver.h b/config/piledriver/bli_kernel_defs_piledriver.h index 73181b3dfd..df4a8c4118 100644 --- a/config/piledriver/bli_kernel_defs_piledriver.h +++ b/config/piledriver/bli_kernel_defs_piledriver.h @@ -38,15 +38,15 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE 
KERNELS) ---------------------------- -#define BLIS_MR_S 16 -#define BLIS_MR_D 8 -#define BLIS_MR_C 4 -#define BLIS_MR_Z 2 - -#define BLIS_NR_S 3 -#define BLIS_NR_D 3 -#define BLIS_NR_C 2 -#define BLIS_NR_Z 2 +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 //#endif diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h index 4e4567f639..39a2cf3d58 100644 --- a/config/power10/bli_kernel_defs_power10.h +++ b/config/power10/bli_kernel_defs_power10.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 8 -#define BLIS_MR_D 8 +#define BLIS_MR_s 8 +#define BLIS_MR_d 8 -#define BLIS_NR_S 16 -#define BLIS_NR_D 8 +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 //#endif diff --git a/config/power7/bli_kernel_defs_power7.h b/config/power7/bli_kernel_defs_power7.h index 0bbe12aad9..ceec01df3c 100644 --- a/config/power7/bli_kernel_defs_power7.h +++ b/config/power7/bli_kernel_defs_power7.h @@ -38,9 +38,9 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_D 8 +#define BLIS_MR_d 8 -#define BLIS_NR_D 4 +#define BLIS_NR_d 4 //#endif diff --git a/config/power9/bli_kernel_defs_power9.h b/config/power9/bli_kernel_defs_power9.h index c59ddcf8d3..f367fda1dd 100644 --- a/config/power9/bli_kernel_defs_power9.h +++ b/config/power9/bli_kernel_defs_power9.h @@ -38,9 +38,9 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_D 12 +#define BLIS_MR_d 12 -#define BLIS_NR_D 6 +#define BLIS_NR_d 6 //#endif diff --git a/config/sandybridge/bli_kernel_defs_sandybridge.h b/config/sandybridge/bli_kernel_defs_sandybridge.h index a599061650..dc1b843f60 100644 --- a/config/sandybridge/bli_kernel_defs_sandybridge.h +++ b/config/sandybridge/bli_kernel_defs_sandybridge.h @@ -38,15 +38,15 @@ // 
-- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 8 -#define BLIS_MR_D 8 -#define BLIS_MR_C 8 -#define BLIS_MR_Z 4 - -#define BLIS_NR_S 8 -#define BLIS_NR_D 4 -#define BLIS_NR_C 4 -#define BLIS_NR_Z 4 +#define BLIS_MR_s 8 +#define BLIS_MR_d 8 +#define BLIS_MR_c 8 +#define BLIS_MR_z 4 + +#define BLIS_NR_s 8 +#define BLIS_NR_d 4 +#define BLIS_NR_c 4 +#define BLIS_NR_z 4 //#endif diff --git a/config/skx/bli_kernel_defs_skx.h b/config/skx/bli_kernel_defs_skx.h index 97062493be..2aaf477ad5 100644 --- a/config/skx/bli_kernel_defs_skx.h +++ b/config/skx/bli_kernel_defs_skx.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 32 -#define BLIS_MR_D 16 +#define BLIS_MR_s 32 +#define BLIS_MR_d 16 -#define BLIS_NR_S 12 -#define BLIS_NR_D 14 +#define BLIS_NR_s 12 +#define BLIS_NR_d 14 //#endif diff --git a/config/steamroller/bli_kernel_defs_steamroller.h b/config/steamroller/bli_kernel_defs_steamroller.h index 73181b3dfd..df4a8c4118 100644 --- a/config/steamroller/bli_kernel_defs_steamroller.h +++ b/config/steamroller/bli_kernel_defs_steamroller.h @@ -38,15 +38,15 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 16 -#define BLIS_MR_D 8 -#define BLIS_MR_C 4 -#define BLIS_MR_Z 2 - -#define BLIS_NR_S 3 -#define BLIS_NR_D 3 -#define BLIS_NR_C 2 -#define BLIS_NR_Z 2 +#define BLIS_MR_s 16 +#define BLIS_MR_d 8 +#define BLIS_MR_c 4 +#define BLIS_MR_z 2 + +#define BLIS_NR_s 3 +#define BLIS_NR_d 3 +#define BLIS_NR_c 2 +#define BLIS_NR_z 2 //#endif diff --git a/config/template/bli_kernel_defs_template.h b/config/template/bli_kernel_defs_template.h index be4ff3f984..86a33d8d8e 100644 --- a/config/template/bli_kernel_defs_template.h +++ b/config/template/bli_kernel_defs_template.h @@ -43,18 +43,18 @@ // an optimized kernel is provided). 
// -#define BLIS_MR_Z 4 +#define BLIS_MR_z 4 -#define BLIS_NR_Z 4 +#define BLIS_NR_z 4 // // PACKMR/PACKNR do not need to be defined unless they are different from the // "normal" MR/NR. // -//#define BLIS_PACKMR_Z 4 +//#define BLIS_PACKMR_z 4 -//#define BLIS_PACKNR_Z 4 +//#define BLIS_PACKNR_z 4 //#endif diff --git a/config/thunderx2/bli_kernel_defs_thunderx2.h b/config/thunderx2/bli_kernel_defs_thunderx2.h index 0d6d98953f..60292099cc 100644 --- a/config/thunderx2/bli_kernel_defs_thunderx2.h +++ b/config/thunderx2/bli_kernel_defs_thunderx2.h @@ -38,11 +38,11 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 8 -#define BLIS_MR_D 6 +#define BLIS_MR_s 8 +#define BLIS_MR_d 6 -#define BLIS_NR_S 12 -#define BLIS_NR_D 8 +#define BLIS_NR_s 12 +#define BLIS_NR_d 8 //#endif diff --git a/config/zen/bli_kernel_defs_zen.h b/config/zen/bli_kernel_defs_zen.h index 7d6b333044..c5bc8d63f3 100644 --- a/config/zen/bli_kernel_defs_zen.h +++ b/config/zen/bli_kernel_defs_zen.h @@ -38,15 +38,15 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 6 -#define BLIS_MR_D 6 -#define BLIS_MR_C 3 -#define BLIS_MR_Z 3 - -#define BLIS_NR_S 16 -#define BLIS_NR_D 8 -#define BLIS_NR_C 8 -#define BLIS_NR_Z 4 +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 //#endif diff --git a/config/zen2/bli_kernel_defs_zen2.h b/config/zen2/bli_kernel_defs_zen2.h index 7d6b333044..c5bc8d63f3 100644 --- a/config/zen2/bli_kernel_defs_zen2.h +++ b/config/zen2/bli_kernel_defs_zen2.h @@ -38,15 +38,15 @@ // -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- -#define BLIS_MR_S 6 -#define BLIS_MR_D 6 -#define BLIS_MR_C 3 -#define BLIS_MR_Z 3 - -#define BLIS_NR_S 16 -#define BLIS_NR_D 8 -#define BLIS_NR_C 8 -#define BLIS_NR_Z 4 +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define 
BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 //#endif diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index d2487584e7..2769bf6bc7 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -245,7 +245,75 @@ #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif +// -- MR and NR block sizes (only for kernels) -------------------------------- +#ifdef BLIS_IN_KERNEL + +#ifndef BLIS_MR_s +#define BLIS_MR_s 4 +#endif + +#ifndef BLIS_MR_d +#define BLIS_MR_d 4 +#endif + +#ifndef BLIS_MR_c +#define BLIS_MR_c 4 +#endif + +#ifndef BLIS_MR_z +#define BLIS_MR_z 4 +#endif + +#ifndef BLIS_NR_s +#define BLIS_NR_s 16 +#endif + +#ifndef BLIS_NR_d +#define BLIS_NR_d 8 +#endif + +#ifndef BLIS_NR_c +#define BLIS_NR_c 8 +#endif + +#ifndef BLIS_NR_z +#define BLIS_NR_z 4 +#endif + +#ifndef BLIS_PACKMR_s +#define BLIS_PACKMR_s BLIS_MR_s +#endif + +#ifndef BLIS_PACKMR_d +#define BLIS_PACKMR_d BLIS_MR_d +#endif + +#ifndef BLIS_PACKMR_c +#define BLIS_PACKMR_c BLIS_MR_c +#endif + +#ifndef BLIS_PACKMR_z +#define BLIS_PACKMR_z BLIS_MR_z +#endif + +#ifndef BLIS_PACKNR_s +#define BLIS_PACKNR_s BLIS_NR_s +#endif + +#ifndef BLIS_PACKNR_d +#define BLIS_PACKNR_d BLIS_NR_d +#endif + +#ifndef BLIS_PACKNR_c +#define BLIS_PACKNR_c BLIS_NR_c +#endif + +#ifndef BLIS_PACKNR_z +#define BLIS_PACKNR_z BLIS_NR_z +#endif + +#endif #endif diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index 51ff9df4bd..ba3c8bbd1f 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -40,7 +40,7 @@ // instructions via constant loop bounds + #pragma omp simd directives. 
#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, mr, nr ) \ +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -56,14 +56,15 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const dim_t mr = PASTECH(BLIS_MR_,ch); \ + const dim_t nr = PASTECH(BLIS_NR_,ch); \ +\ + ctype ab[ mr * nr ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = nr; \ const inc_t cs_ab = 1; \ \ - const inc_t cs_a = mr; \ - const inc_t rs_b = nr; \ + const inc_t cs_a = PASTECH(BLIS_PACKMR_,ch); \ + const inc_t rs_b = PASTECH(BLIS_PACKNR_,ch); \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ @@ -103,14 +104,15 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Output/accumulate intermediate result ab based on the storage of c and the value of beta. */ \ - if ( cs_c == 1 ) \ + if ( cs_c == 1 && m == mr && n == nr ) \ { \ - /* C is row-stored. */ \ + /* C is row-stored and a full tile. */ \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ - for ( dim_t i = 0; i < m; ++i ) \ - for ( dim_t j = 0; j < n; ++j ) \ + for ( dim_t i = 0; i < mr; ++i ) \ + PRAGMA_SIMD \ + for ( dim_t j = 0; j < nr; ++j ) \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ @@ -119,8 +121,9 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else \ { \ - for ( dim_t i = 0; i < m; ++i ) \ - for ( dim_t j = 0; j < n; ++j ) \ + for ( dim_t i = 0; i < mr; ++i ) \ + PRAGMA_SIMD \ + for ( dim_t j = 0; j < nr; ++j ) \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ @@ -131,7 +134,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else \ { \ - /* C is column-stored or general-stored. */ \ + /* C is column-stored, general-stored, or an edge case.
*/ \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ @@ -157,11 +160,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -//INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) -GENTFUNC( float, s, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 16 ) -GENTFUNC( double, d, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) -GENTFUNC( scomplex, c, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 8 ) -GENTFUNC( dcomplex, z, gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, 4, 4 ) +INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) #else From ac920eb1b8a1f9634b35cac4cecae30cfc229892 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 6 Feb 2022 13:48:27 -0600 Subject: [PATCH 03/32] Simplify cntx_t structure. All kernels have been combined into a single array (level-1v/1f, (un)packm, level-3, and sup), and similarly with preferences (only ukr row-storage preferences for now) and block sizes (which now include sup thresholds and block sizes). These changes are necessary for future support of user-defined kernels. The context initialization functions used by bli_cntx_init_* have also been reworked to use a sentinel instead of an explicit count in order to prevent errors. Note that mostly these changes make the cntx_t code oblivious to BLAS level, but some l3-specific functions remain for compatibility. 
--- addon/gemmd/attic/bao_gemmd_bp_var2.c | 8 +- addon/gemmd/bao_gemmd.c | 2 +- addon/gemmd/bao_gemmd_bp_var1.c | 2 +- addon/gemmd/bao_packm_cxk.c | 6 +- config/a64fx/bli_cntx_init_a64fx.c | 98 +- config/armsve/bli_cntx_init_armsve.c | 105 +- config/bgq/bli_cntx_init_bgq.c | 35 +- config/bulldozer/bli_cntx_init_bulldozer.c | 41 +- config/cortexa15/bli_cntx_init_cortexa15.c | 35 +- config/cortexa53/bli_cntx_init_cortexa53.c | 35 +- config/cortexa57/bli_cntx_init_cortexa57.c | 35 +- config/cortexa9/bli_cntx_init_cortexa9.c | 35 +- config/excavator/bli_cntx_init_excavator.c | 41 +- config/firestorm/bli_cntx_init_firestorm.c | 143 +- config/haswell/bli_cntx_init_haswell.c | 231 +-- config/knc/bli_cntx_init_knc.c | 34 +- config/knl/bli_cntx_init_knl.c | 67 +- config/old/armv7a/bli_cntx_init_armv7a.c | 2 +- config/old/haswellbb/bli_cntx_init_haswell.c | 2 +- config/penryn/bli_cntx_init_penryn.c | 47 +- config/piledriver/bli_cntx_init_piledriver.c | 41 +- config/power10/bli_cntx_init_power10.c | 81 +- config/power7/bli_cntx_init_power7.c | 32 +- config/power9/bli_cntx_init_power9.c | 81 +- .../sandybridge/bli_cntx_init_sandybridge.c | 41 +- config/skx/bli_cntx_init_skx.c | 53 +- .../steamroller/bli_cntx_init_steamroller.c | 41 +- config/template/bli_cntx_init_template.c | 53 +- config/thunderx2/bli_cntx_init_thunderx2.c | 35 +- config/zen/bli_cntx_init_zen.c | 295 +-- config/zen2/bli_cntx_init_zen2.c | 281 ++- config/zen3/bli_cntx_init_zen3.c | 312 ++-- docs/ConfigurationHowTo.md | 24 +- frame/1/bli_l1v_tapi.c | 20 +- frame/1/other/packv/bli_packv_unb_var1.c | 2 +- frame/1/other/unpackv/bli_unpackv_unb_var1.c | 2 +- frame/1d/bli_l1d_tapi.c | 14 +- frame/1f/bli_l1f_tapi.c | 10 +- frame/1m/bli_l1m_unb_var1.c | 8 +- frame/1m/packm/bli_packm_cxk.c | 6 +- frame/1m/packm/bli_packm_cxk_1er.c | 6 +- frame/1m/unpackm/bli_unpackm_cxk.c | 6 +- frame/2/gemv/bli_gemv_unb_var1.c | 2 +- frame/2/gemv/bli_gemv_unb_var2.c | 2 +- frame/2/gemv/bli_gemv_unf_var1.c | 2 +- 
frame/2/gemv/bli_gemv_unf_var2.c | 2 +- frame/2/ger/bli_ger_unb_var1.c | 2 +- frame/2/ger/bli_ger_unb_var2.c | 2 +- frame/2/hemv/bli_hemv_unb_var1.c | 4 +- frame/2/hemv/bli_hemv_unb_var2.c | 2 +- frame/2/hemv/bli_hemv_unb_var3.c | 4 +- frame/2/hemv/bli_hemv_unb_var4.c | 2 +- frame/2/hemv/bli_hemv_unf_var1.c | 2 +- frame/2/hemv/bli_hemv_unf_var1a.c | 2 +- frame/2/hemv/bli_hemv_unf_var3.c | 2 +- frame/2/hemv/bli_hemv_unf_var3a.c | 2 +- frame/2/her/bli_her_unb_var1.c | 2 +- frame/2/her/bli_her_unb_var2.c | 2 +- frame/2/her2/bli_her2_unb_var1.c | 2 +- frame/2/her2/bli_her2_unb_var2.c | 2 +- frame/2/her2/bli_her2_unb_var3.c | 2 +- frame/2/her2/bli_her2_unb_var4.c | 2 +- frame/2/her2/bli_her2_unf_var1.c | 2 +- frame/2/her2/bli_her2_unf_var4.c | 2 +- frame/2/trmv/bli_trmv_unb_var1.c | 2 +- frame/2/trmv/bli_trmv_unb_var2.c | 2 +- frame/2/trmv/bli_trmv_unf_var1.c | 2 +- frame/2/trmv/bli_trmv_unf_var2.c | 2 +- frame/2/trsv/bli_trsv_unb_var1.c | 2 +- frame/2/trsv/bli_trsv_unb_var2.c | 2 +- frame/2/trsv/bli_trsv_unf_var1.c | 2 +- frame/2/trsv/bli_trsv_unf_var2.c | 2 +- frame/3/bli_l3_schema.c | 2 +- frame/3/bli_l3_sup.c | 2 +- frame/3/bli_l3_sup_int.c | 4 +- frame/3/bli_l3_sup_vars.h | 2 +- frame/3/gemm/bli_gemm_front.c | 2 +- frame/3/gemm/bli_gemm_ker_var2.c | 2 +- frame/3/gemm/bli_gemm_md.c | 18 +- frame/3/gemm/bli_gemm_md_c2r_ref.c | 4 +- frame/3/gemm/other/bli_gemm_ker_var2.c | 2 +- frame/3/gemm/other/bli_gemm_ker_var2rr.c | 2 +- frame/3/gemm/other/bli_gemm_ker_var2sl.c | 2 +- frame/3/gemmt/bli_gemmt_front.c | 2 +- frame/3/gemmt/bli_gemmt_l_ker_var2.c | 2 +- frame/3/gemmt/bli_gemmt_u_ker_var2.c | 2 +- frame/3/gemmt/other/bli_gemmt_l_ker_var2.c | 2 +- frame/3/gemmt/other/bli_gemmt_u_ker_var2.c | 2 +- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/symm/bli_symm_front.c | 2 +- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trmm/other/bli_trmm_ll_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c | 4 +- frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c | 2 +- 
frame/3/trmm/other/bli_trmm_lu_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c | 2 +- frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c | 2 +- frame/3/trmm3/bli_trmm3_front.c | 2 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 2 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c | 2 +- frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c | 2 +- frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c | 2 +- frame/3/trsm/other/bli_trsm_rl_ker_var2.c | 2 +- frame/3/trsm/other/bli_trsm_ru_ker_var2.c | 2 +- frame/base/bli_cntx.c | 1644 +++-------------- frame/base/bli_cntx.h | 577 ++---- frame/base/bli_gks.c | 42 +- frame/base/bli_gks.h | 8 +- frame/include/bli_param_macro_defs.h | 18 +- frame/include/bli_type_defs.h | 274 +-- kernels/penryn/1/bli_axpyv_penryn_int.c | 2 +- kernels/penryn/1/bli_dotv_penryn_int.c | 2 +- kernels/penryn/1f/bli_axpy2v_penryn_int.c | 2 +- kernels/penryn/1f/bli_axpyf_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotaxpyv_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c | 4 +- kernels/penryn/1f/bli_dotxf_penryn_int.c | 4 +- kernels/zen/1/bli_scalv_zen_int.c | 4 +- kernels/zen/1/bli_scalv_zen_int10.c | 4 +- kernels/zen/1f/bli_axpyf_zen_int_8.c | 4 +- kernels/zen/1f/bli_dotxf_zen_int_8.c | 12 +- kernels/zen2/1f/bli_axpyf_zen_int_5.c | 12 +- ref_kernels/1/bli_axpbyv_ref.c | 14 +- ref_kernels/1/bli_axpyv_ref.c | 4 +- ref_kernels/1/bli_scal2v_ref.c | 4 +- ref_kernels/1/bli_scalv_ref.c | 2 +- 
ref_kernels/1/bli_xpbyv_ref.c | 4 +- ref_kernels/1f/bli_axpy2v_ref.c | 2 +- ref_kernels/1f/bli_axpyf_ref.c | 2 +- ref_kernels/1f/bli_dotaxpyv_ref.c | 4 +- ref_kernels/1f/bli_dotxaxpyf_ref.c | 4 +- ref_kernels/1f/bli_dotxf_ref.c | 2 +- ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c | 4 +- ref_kernels/3/bb/bli_gemmtrsmbb_ref.c | 4 +- ref_kernels/3/bli_gemm_ref.c | 2 +- ref_kernels/3/bli_gemmtrsm_ref.c | 4 +- ref_kernels/bli_cntx_ref.c | 212 +-- ref_kernels/ind/bli_gemm1m_ref.c | 4 +- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 4 +- sandbox/gemmlike/attic/bls_gemm_bp_var2.c | 8 +- sandbox/gemmlike/bls_gemm.c | 2 +- sandbox/gemmlike/bls_gemm_bp_var1.c | 2 +- sandbox/gemmlike/bls_packm_cxk.c | 6 +- 155 files changed, 2149 insertions(+), 3384 deletions(-) diff --git a/addon/gemmd/attic/bao_gemmd_bp_var2.c b/addon/gemmd/attic/bao_gemmd_bp_var2.c index a0040fec06..9139e89b15 100644 --- a/addon/gemmd/attic/bao_gemmd_bp_var2.c +++ b/addon/gemmd/attic/bao_gemmd_bp_var2.c @@ -164,7 +164,7 @@ void PASTECH2(bao_,ch,varname) \ function pointer type. */ \ /* PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ */ \ \ /* Temporary C buffer for edge cases. Note that the strides of this @@ -175,7 +175,7 @@ void PASTECH2(bao_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ @@ -536,7 +536,7 @@ void PASTECH2(bao_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -545,7 +545,7 @@ void PASTECH2(bao_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c index fadc526918..01185a9d75 100644 --- a/addon/gemmd/bao_gemmd.c +++ b/addon/gemmd/bao_gemmd.c @@ -137,7 +137,7 @@ void bao_gemmd_ex // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index 09e4df09e4..689471367f 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -163,7 +163,7 @@ void PASTECH2(bao_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Compute partitioning step values for each matrix of each loop. 
*/ \ const inc_t jcstep_c = cs_c; \ diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/gemmd/bao_packm_cxk.c index 645f09d798..455ce3fe0d 100644 --- a/addon/gemmd/bao_packm_cxk.c +++ b/addon/gemmd/bao_packm_cxk.c @@ -55,15 +55,15 @@ void PASTECH2(bao_,ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = ( ukr_t )( BLIS_PACKM_0XK_KER + panel_dim_max ); \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index 5132b2824c..f002477b0e 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -38,34 +38,44 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_a64fx_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, - cntx - ); + cntx, - // Set SVE-512 packing routine. - bli_cntx_set_packm_kers - ( - 2, + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, + + // packm BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, // 12xk is not used and disabled for GCC 8-9 compatibility. // BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk, BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, - cntx + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -80,66 +90,18 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - -#if 0 - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 65, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 65, -1, -1 ); - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx + -1 ); - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 10, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 16, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); -#endif - // Set A64FX cache sector sizes for each PE/CMG // SC Fugaku might disable users' setting cache sizes. 
#if !defined(CACHE_SECTOR_SIZE_READONLY) diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index cd07924a71..7ee24351c4 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -37,9 +37,6 @@ void bli_cntx_init_armsve( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; -#if 0 - blksz_t thresh[ BLIS_NUM_THRESH ]; -#endif // Set default kernel blocksizes and functions. bli_cntx_init_armsve_ref( cntx ); @@ -56,34 +53,50 @@ void bli_cntx_init_armsve( cntx_t* cntx ) bli_c_blksz_armsve(&m_r_c, &n_r_c, &k_c_c, &m_c_c, &n_c_c); bli_z_blksz_armsve(&m_r_z, &n_r_z, &k_c_z, &m_c_z, &n_c_z); - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, + cntx, + + // level-3 // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, FALSE, - cntx + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Set VL-specific packing routines if applicable. 
if (m_r_d==16) - bli_cntx_set_packm_kers + bli_cntx_set_ukrs ( - 2, + cntx, BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, - cntx + -1 ); else if (m_r_d==8) - bli_cntx_set_packm_kers + bli_cntx_set_ukrs ( - 1, + cntx, BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, - cntx + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -98,64 +111,16 @@ void bli_cntx_init_armsve( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - -#if 0 - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 101, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 101, -1, -1 ); - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 4, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armsve_10x2v_unindexed, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. 
- // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, n_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, m_r_d, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 120, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 2048, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + -1 ); -#endif } diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c index 782c441b97..03f9fd989e 100644 --- a/config/bgq/bli_cntx_init_bgq.c +++ b/config/bgq/bli_cntx_init_bgq.c @@ -43,14 +43,28 @@ void bli_cntx_init_bgq( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_bgq( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/bulldozer/bli_cntx_init_bulldozer.c b/config/bulldozer/bli_cntx_init_bulldozer.c index 9f6e83d6ba..6d9a230ccc 100644 --- a/config/bulldozer/bli_cntx_init_bulldozer.c +++ b/config/bulldozer/bli_cntx_init_bulldozer.c @@ -43,16 +43,32 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/cortexa15/bli_cntx_init_cortexa15.c b/config/cortexa15/bli_cntx_init_cortexa15.c index 7c6134ff01..928d8fee46 100644 --- a/config/cortexa15/bli_cntx_init_cortexa15.c +++ b/config/cortexa15/bli_cntx_init_cortexa15.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -73,13 +87,16 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/cortexa53/bli_cntx_init_cortexa53.c b/config/cortexa53/bli_cntx_init_cortexa53.c index d7d786f8c6..e0e72c4f36 100644 --- a/config/cortexa53/bli_cntx_init_cortexa53.c +++ b/config/cortexa53/bli_cntx_init_cortexa53.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
 bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/cortexa57/bli_cntx_init_cortexa57.c b/config/cortexa57/bli_cntx_init_cortexa57.c index 57d18792de..8c327d436e 100644 --- a/config/cortexa57/bli_cntx_init_cortexa57.c +++ b/config/cortexa57/bli_cntx_init_cortexa57.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c index d38e12ebbf..4751242e16 100644 --- a/config/cortexa9/bli_cntx_init_cortexa9.c +++ b/config/cortexa9/bli_cntx_init_cortexa9.c @@ -43,14 +43,28 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/excavator/bli_cntx_init_excavator.c b/config/excavator/bli_cntx_init_excavator.c index adae152d50..351b4bc63a 100644 --- a/config/excavator/bli_cntx_init_excavator.c +++ b/config/excavator/bli_cntx_init_excavator.c @@ -43,16 +43,32 @@ void bli_cntx_init_excavator( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_excavator( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index a15ce03448..946aabd433 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -37,32 +37,60 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_firestorm_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + // packm + BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, + BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, + BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, + BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, + 
BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, + + -1 ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs ( - 4, - BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, - BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -73,72 +101,47 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_KC ], 640, 3072, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 3072, 8192, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], -1, 99, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], -1, 99, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], -1, 99, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. 
+ // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MR_SUP ], -1, 6, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], -1, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], -1, 240, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], -1, 1024, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], -1, 3072, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], -1, 99, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], -1, 99, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], -1, 99, -1, -1 ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 8, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_MR ], -1, 6, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], -1, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], -1, 240, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], -1, 1024, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], -1, 3072, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + -1 ); } diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index f2dc900ead..34e3909ff9 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -35,50 +35,42 @@ #include "blis.h" -//GEMMSUP_KER_PROT( double, d, gemmsup_r_haswell_ref ) - void bli_cntx_init_haswell( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. 
bli_cntx_init_haswell_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 8, + cntx, + // gemm #if 1 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, #else - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, FALSE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_16x6, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_8x6, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_8x3, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_4x3, #endif // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, #if 1 - // Update the context 
with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 8, + // packm BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, @@ -87,27 +79,14 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 10, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -137,7 +116,74 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + 
BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm +#if 1 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, +#else + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, +#endif + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + + -1 ); // Initialize level-3 blocksize objects with 
architecture-specific values. @@ -161,97 +207,54 @@ void bli_cntx_init_haswell( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, 8, 8 ); + // ------------------------------------------------------------------------- + + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 201, 201, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 201, 201, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 201, 201, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // gemmsup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 201, 201, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 201, 201, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 201, 201, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, -#if 0 - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 1, - BLIS_GEMM, bli_gemmsup_ref, - cntx - ); -#endif - - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, 
bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. - bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + -1 ); } diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c index 198f08827a..5fe47f8af7 100644 --- a/config/knc/bli_cntx_init_knc.c +++ b/config/knc/bli_cntx_init_knc.c @@ -43,13 +43,26 @@ void bli_cntx_init_knc( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, TRUE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -58,7 +71,7 @@ void bli_cntx_init_knc( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_NR ], 0, 8, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_MC ], 0, 120, 0, 0, 0, 160, 0, 0 ); - bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0, + bli_blksz_init ( &blkszs[ BLIS_KC ], 0, 240, 0, 0, 0, 300, 0, 0 ); bli_blksz_init_easy( &blkszs[ BLIS_NC ], 0, 14400, 0, 0 ); @@ -66,13 +79,16 @@ void bli_cntx_init_knc( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index 6da3b7a3a9..fcc25f023e 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -43,47 +43,33 @@ void bli_cntx_init_knl( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, FALSE, - cntx - ); + cntx, - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 2, + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, + + // packm BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, - cntx - ); - // Update the context with optimized level-1f kernels. 
- bli_cntx_set_l1f_kers - ( - 4, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif + // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, @@ -92,12 +78,15 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif + // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, @@ -106,7 +95,20 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -125,17 +127,20 @@ void bli_cntx_init_knl( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + -1 ); } diff --git a/config/old/armv7a/bli_cntx_init_armv7a.c b/config/old/armv7a/bli_cntx_init_armv7a.c index d4cc9e91d4..acd8e6c182 100644 --- a/config/old/armv7a/bli_cntx_init_armv7a.c +++ b/config/old/armv7a/bli_cntx_init_armv7a.c @@ -66,7 +66,7 @@ void bli_cntx_init_armv7a( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, diff --git a/config/old/haswellbb/bli_cntx_init_haswell.c b/config/old/haswellbb/bli_cntx_init_haswell.c index 9e1d03503a..88bd14a071 100644 --- a/config/old/haswellbb/bli_cntx_init_haswell.c +++ b/config/old/haswellbb/bli_cntx_init_haswell.c @@ -203,7 +203,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + 7, // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c index 1576bf9448..12a36eabb5 100644 --- a/config/penryn/bli_cntx_init_penryn.c +++ b/config/penryn/bli_cntx_init_penryn.c @@ -43,18 +43,36 @@ void bli_cntx_init_penryn( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, FALSE, - //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, FALSE, - //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, FALSE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, FALSE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, + //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, + //BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_penryn_asm_4x4, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + //BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + //BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -69,13 +87,16 @@ void bli_cntx_init_penryn( cntx_t* cntx ) // blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/piledriver/bli_cntx_init_piledriver.c b/config/piledriver/bli_cntx_init_piledriver.c index 4ed15e322b..fe78d51423 100644 --- a/config/piledriver/bli_cntx_init_piledriver.c +++ b/config/piledriver/bli_cntx_init_piledriver.c @@ -43,16 +43,32 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) // blocksizes (and multiples) for native execution.
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 14c940f995..dda9710ee0 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -72,33 +72,24 @@ void bli_cntx_init_power10( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 12, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, TRUE, - - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, FALSE, - - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, TRUE, - - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, FALSE, - cntx - ); - - // Update the context with customized virtual [gemm]trsm micro-kernels. 
- bli_cntx_set_l3_vir_ukrs - ( - 8, + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, + BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, + BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, + BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, + BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, + BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, + BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, + BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, + BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power10_ref, BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power10_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power10_ref, @@ -107,16 +98,34 @@ void bli_cntx_init_power10( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref, BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref, - cntx - ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 2, + // packm BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power10_ref, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref, - cntx + + -1 + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // s d c z @@ -131,14 +140,16 @@ void bli_cntx_init_power10( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c index c9caf62a6d..0c83ba8821 100644 --- a/config/power7/bli_cntx_init_power7.c +++ b/config/power7/bli_cntx_init_power7.c @@ -43,13 +43,26 @@ void bli_cntx_init_power7( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 1, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -64,13 +77,16 @@ void bli_cntx_init_power7( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index 4370ce26c1..75f9b1ffca 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -72,32 +72,24 @@ void bli_cntx_init_power9( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 12, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, FALSE, - - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, FALSE, - - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, FALSE, - cntx - ); - - // Update the context with customized virtual [gemm]trsm micro-kernels. 
- bli_cntx_set_l3_vir_ukrs - ( - 8, + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, + BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, + BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, + BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, + BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, + BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, + BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, + BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, + BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref, BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref, @@ -106,16 +98,34 @@ void bli_cntx_init_power9( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref, BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref, BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref, - cntx - ); - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 2, + // packm BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref, - cntx + + -1 + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); @@ -131,14 +141,15 @@ void bli_cntx_init_power9( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx - ); + -1 + ); } diff --git a/config/sandybridge/bli_cntx_init_sandybridge.c b/config/sandybridge/bli_cntx_init_sandybridge.c index 1ffa5bf8b6..fa92a6d760 100644 --- a/config/sandybridge/bli_cntx_init_sandybridge.c +++ b/config/sandybridge/bli_cntx_init_sandybridge.c @@ -43,16 +43,32 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_sandybridge_asm_8x4, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -67,13 +83,16 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index f18503a7a7..14ffc0680a 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -43,39 +43,29 @@ void bli_cntx_init_skx( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. 
+ bli_cntx_set_ukrs ( - 2, - // gemm - BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, FALSE, - cntx - ); + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT , bli_sgemm_skx_asm_32x12_l2, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_skx_asm_16x14, - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 10, #if 1 // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, #endif + // axpyv #if 0 BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, @@ -84,12 +74,15 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, #endif + // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, BLIS_DOTV_KER, BLIS_DOUBLE, bli_ddotv_zen_int, + // dotxv BLIS_DOTXV_KER, BLIS_FLOAT, bli_sdotxv_zen_int, BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, + // scalv #if 0 BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, @@ -98,7 +91,20 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - cntx + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT , FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -116,17 +122,20 @@ void bli_cntx_init_skx( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx + + -1 ); } diff --git a/config/steamroller/bli_cntx_init_steamroller.c b/config/steamroller/bli_cntx_init_steamroller.c index 13e7f6495b..ce8870d854 100644 --- a/config/steamroller/bli_cntx_init_steamroller.c +++ b/config/steamroller/bli_cntx_init_steamroller.c @@ -43,16 +43,32 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 4, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, FALSE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, FALSE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -67,13 +83,16 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c index f2b1c8d175..fcc0223f56 100644 --- a/config/template/bli_cntx_init_template.c +++ b/config/template/bli_cntx_init_template.c @@ -45,34 +45,44 @@ void bli_cntx_init_template( cntx_t* cntx ) // Update the context with optimized native gemm micro-kernels and // their storage preferences. - bli_cntx_set_l3_nat_ukrs + bli_cntx_set_ukrs ( - 5, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, FALSE, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, FALSE, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, FALSE, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, FALSE, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, FALSE, - cntx - ); + cntx, - // Update the context with optimized level-1f kernels. 
- bli_cntx_set_l1f_kers - ( + // level-3 + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, + BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, + BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, + BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, + BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, + + // level-1f BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt, BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt, BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt, BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt, BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt, - cntx - ); - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( + // level-1v BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt, BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt, - cntx + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -87,13 +97,16 @@ void bli_cntx_init_template( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/thunderx2/bli_cntx_init_thunderx2.c b/config/thunderx2/bli_cntx_init_thunderx2.c index f2b7b633d9..3c58bb1fa7 100644 --- a/config/thunderx2/bli_cntx_init_thunderx2.c +++ b/config/thunderx2/bli_cntx_init_thunderx2.c @@ -43,14 +43,28 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 2, - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, FALSE, - cntx + cntx, + + // level-3 + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // level-3 + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -65,13 +79,16 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + cntx, + + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - cntx + + -1 ); } diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 615a31a043..dde28da64e 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -40,40 +40,78 @@ void bli_cntx_init_zen( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, 
bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#if 0 + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, +#endif -#if 1 - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 8, +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, +#endif + + // packm BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, @@ -82,14 +120,6 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); -#endif - - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, @@ -98,13 +128,6 @@ void bli_cntx_init_zen( cntx_t* cntx ) // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 10, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -151,7 +174,66 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, #endif - cntx + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, TRUE, + +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, TRUE, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, TRUE, +#endif + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -192,41 +274,64 @@ void bli_cntx_init_zen( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 440, 220, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. 
+ // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, -1, -1 ); +#if 0 + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 ); +#endif + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - // ------------------------------------------------------------------------- + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // Initialize sup thresholds with architecture-appropriate values. 
- // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 440, 220, -1, -1 ); + // gemmsup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx + -1 ); + // ------------------------------------------------------------------------- + // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( @@ -235,88 +340,4 @@ void bli_cntx_init_zen( cntx_t* cntx ) //BLIS_GEMMT, bli_gemmtsup_ref, cntx ); - - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, -#if 0 - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, -#endif - -#if 0 - // NOTE: This set of kernels is likely broken and therefore disabled. 
- BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, -#endif - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, -1, -1 ); -#if 0 - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); -#endif - - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx - ); } - diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 0964ce463e..ea0fc42b3b 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -38,40 +38,78 @@ void bli_cntx_init_zen2( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen2_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 8, + cntx, // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, 
BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + + // level-3 sup + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#if 0 + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, +#endif -#if 1 - // Update the context with optimized packm kernels. - bli_cntx_set_packm_kers - ( - 8, +#if 0 + // NOTE: This set of kernels is likely broken and therefore disabled. 
+ BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, +#endif + + // packm BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, @@ -80,14 +118,6 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); -#endif - - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, @@ -96,13 +126,6 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - - // Update the context with optimized level-1v kernels. 
- bli_cntx_set_l1v_kers - ( - 16, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -135,7 +158,49 @@ void bli_cntx_init_zen2( cntx_t* cntx ) //set BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx + + -1 + ); + + // Update the context with storage preferences. + bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // level-3 sup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -155,130 +220,58 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. 
+ // s d c z +#if 1 + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 500, 249, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 500, 249, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 500, 249, -1, -1 ); +#else + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 100000, 100000, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 100000, 100000, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 100000, 100000, -1, -1 ); +#endif + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, + 9, 9, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 168, 72, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 256, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 4080, 4080, -1, -1 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - - // ------------------------------------------------------------------------- - - // Initialize sup thresholds with architecture-appropriate values. - // s d c z -#if 1 - bli_blksz_init_easy( &thresh[ BLIS_MT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 500, 249, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 500, 249, -1, -1 ); -#else - bli_blksz_init_easy( &thresh[ BLIS_MT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 100000, 100000, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 100000, 100000, -1, -1 ); -#endif - - // Initialize the context with the sup thresholds. 
- bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - -#if 0 - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 1, - BLIS_GEMM, bli_gemmsup_ref, - cntx - ); -#endif - // Update the context with optimized small/unpacked gemm kernels. - bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, -#if 0 - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, -#endif + 
// sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, -#if 0 - // NOTE: This set of kernels is likely broken and therefore disabled. - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, -#endif - cntx - ); - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, -1, -1, - 9, 9, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 168, 72, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, -1, -1 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4080, 4080, -1, -1 ); + // level-3 sup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + -1 ); } diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index b5bbb05ed2..e8934d2a46 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -37,49 +37,89 @@ void bli_cntx_init_zen3( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; // Set default kernel blocksizes and functions. bli_cntx_init_zen3_ref( cntx ); // ------------------------------------------------------------------------- - // Update the context with optimized native gemm micro-kernels and - // their storage preferences. - bli_cntx_set_l3_nat_ukrs + // Update the context with optimized native gemm micro-kernels. + bli_cntx_set_ukrs ( - 8, + cntx, + // gemm - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, TRUE, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, TRUE, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, TRUE, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, TRUE, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_haswell_asm_6x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_haswell_asm_6x8, + BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_haswell_asm_3x8, + BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_haswell_asm_3x4, // gemmtrsm_l - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, TRUE, + BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsm_l_haswell_asm_6x16, + BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_haswell_asm_6x8, // gemmtrsm_u - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, TRUE, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, TRUE, - cntx - ); + BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, + BLIS_GEMMTRSM_U_UKR, 
BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, + // gemmsup #if 0 - // AMD: This will be enabled in other PRs. - // packm kernels - bli_cntx_set_packm_kers - ( - 2, + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, + BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, + BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, 
bli_zgemmsup_rv_zen_asm_3x4m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, +#else + BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, + BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, + BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, + BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, + + BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, + BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, + BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, + BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, + BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, +#endif + + // packm +#if 0 + // AMD: This will be enabled in other PRs. BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, - cntx - ); #else - // Update the context with optimized packm kernels. 
- bli_cntx_set_packm_kers - ( - 8, BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, @@ -88,34 +128,20 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, - cntx - ); #endif - // Update the context with optimized level-1f kernels. - bli_cntx_set_l1f_kers - ( - 4, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + // dotxf BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, - cntx - ); - - // Update the context with optimized level-1v kernels. - bli_cntx_set_l1v_kers - ( - 16, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, - // axpyv - // axpyv BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, @@ -144,7 +170,63 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - cntx + -1 + ); + + // Update the context with storage preferences. 
+ bli_cntx_set_ukr_prefs + ( + cntx, + + // gemm + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + + // gemmtrsm_l + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmtrsm_u + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + // gemmsup + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, +#if 0 + // AMD: This should be enabled in the PR which has added these kernels + // Update the context with optimized small/unpacked gemm kernels. 
+ BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, +#endif + + -1 ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -161,138 +243,52 @@ void bli_cntx_init_zen3( cntx_t* cntx ) bli_blksz_init_easy( &blkszs[ BLIS_AF ], 5, 5, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_DF ], 8, 8, -1, -1 ); + // Initialize sup thresholds with architecture-appropriate values. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 512, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 200, 256, -1, -1 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 240, 220, -1, -1 ); + + // Initialize level-3 sup blocksize objects with architecture-specific + // values. + // s d c z + bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, + 9, 9, 3, 3 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC_SUP ], 144, 72, 72, 36 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC_SUP ], 512, 256, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC_SUP ], 8160, 4080, 2040, 1020 ); + // Update the context with the current architecture's register and cache // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 7, + cntx, + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + // level-1f BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - cntx - ); - -// ------------------------------------------------------------------------- - // Initialize sup thresholds with architecture-appropriate values. - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 512, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 200, 256, -1, -1 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 240, 220, -1, -1 ); - - // Initialize the context with the sup thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - -#if 0 - // Initialize the context with the sup handlers. - bli_cntx_set_l3_sup_handlers - ( - 2, - BLIS_GEMM, bli_gemmsup_ref, - BLIS_GEMMT, bli_gemmtsup_ref, - cntx - ); -#endif + // sup thresholds + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, -#if 0 - // AMD: This should be enabled in the PR which has added these kernels - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 28, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_zen_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_zen_asm_6x16n, TRUE, - BLIS_RRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_CRR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8m, TRUE, - BLIS_RCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCR, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_CCC, BLIS_SCOMPLEX, bli_cgemmsup_rv_zen_asm_3x8n, TRUE, - BLIS_RRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_CRR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4m, TRUE, - BLIS_RCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - BLIS_CCC, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, TRUE, - cntx - ); -#else - // Update the context with optimized small/unpacked gemm kernels. 
- bli_cntx_set_l3_sup_kers - ( - 16, - //BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_r_haswell_ref, - BLIS_RRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, TRUE, - BLIS_RCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_RCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CRR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, TRUE, - BLIS_CRC, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8n, TRUE, - BLIS_CCR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - BLIS_CCC, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8n, TRUE, - - BLIS_RRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16m, TRUE, - BLIS_RCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_RCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CRR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16m, TRUE, - BLIS_CRC, BLIS_FLOAT, bli_sgemmsup_rd_haswell_asm_6x16n, TRUE, - BLIS_CCR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - BLIS_CCC, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, TRUE, - cntx - ); - -#endif - - // Initialize level-3 sup blocksize objects with architecture-specific - // values. - // s d c z - bli_blksz_init ( &blkszs[ BLIS_MR ], 6, 6, 3, 3, - 9, 9, 3, 3 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 144, 72, 72, 36 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 512, 256, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 8160, 4080, 2040, 1020 ); + // gemmsup + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - // Update the context with the current architecture's register and cache - // blocksizes for small/unpacked level-3 problems. 
- bli_cntx_set_l3_sup_blkszs - ( - 5, - BLIS_NC, &blkszs[ BLIS_NC ], - BLIS_KC, &blkszs[ BLIS_KC ], - BLIS_MC, &blkszs[ BLIS_MC ], - BLIS_NR, &blkszs[ BLIS_NR ], - BLIS_MR, &blkszs[ BLIS_MR ], - cntx + -1 ); } diff --git a/docs/ConfigurationHowTo.md b/docs/ConfigurationHowTo.md index 08eaf8027e..b629eb6177 100644 --- a/docs/ConfigurationHowTo.md +++ b/docs/ConfigurationHowTo.md @@ -47,7 +47,7 @@ $ ls config/haswell bli_cntx_init_haswell.c bli_family_haswell.h make_defs.mk ``` A sub-configuration (`haswell`, in this case) usually contains just three files: - * `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute. + * `bli_cntx_init_haswell.c`. This file contains the initialization function for a context targeting the hardware in question, in this case, Intel Haswell. A context, or `cntx_t` object, in BLIS encapsulates all of the hardware-specific information--including kernel function pointers and cache and register blocksizes--necessary to support all of the main computational operations in BLIS. 
The initialization function inside this file should be named the same as the filename (excluding `.c` suffix), which should begin with prefix `bli_cntx_init_` and end with the (lowercase) name of the sub-configuration. The context initialization function (in this case, `bli_cntx_init_haswell()`) is used internally by BLIS when setting up the global kernel structure--a mechanism for managing and supporting multiple microarchitectures simultaneously, so that the choice of which context to use can be deferred until the computation is ready to execute. * `bli_family_haswell.h`. This header file is `#included` when the configuration in question, in this case `haswell`, was the target to `./configure`. This is where you would specify certain global parameters and settings. For example, if you wanted to specify custom implementations of `malloc()` and `free()`, this is where you would specify them. The file is oftentimes empty. (In the case of configuration families, the definitions in this file apply to the _entire_ build, and not any specific sub-configuration, but for consistency we support them for all configuration targets, whether they be singleton sub-configurations or configuration families.) * `make_defs.mk`. This makefile fragment defines the compiler and compiler flags to use during compilation. Specifically, the values defined in this file are used whenever compiling source code specific to the sub-configuration (i.e., reference kernels and optimized kernels). If the sub-configuration is the target of `configure`, then these flags are also used to compile general framework code. @@ -127,7 +127,7 @@ void bli_cntx_init_fooarch( cntx_t* cntx ) // blocksizes (and multiples) for native execution. 
bli_cntx_set_blkszs ( - BLIS_NAT, 5, + 5, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, @@ -143,7 +143,7 @@ _**Blocksize object array.**_ The `blkszs` array declaration is needed later in _**Reference initialization.**_ The first function call, `bli_cntx_init_fooarch_ref()`, initializes the context `cntx` with function pointers to reference implementations of all of the kernels supported by BLIS (as well as cache and register blocksizes, and other fields). This function is automatically generated by BLIS for every sub-configuration enabled at configure-time. The function prototype is generated by a preprocessor macro in `frame/include/bli_arch_config.h`. -_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision complex level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. 
Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated. +_**Level-3 microkernels.**_ The second function call is to a variable argument function, `bli_cntx_set_l3_nat_ukrs()`, which updates `cntx` with five optimized double-precision complex level-3 microkernels. The first argument encodes the number of individual kernels being registered into the context. Every subsequent line, except for the last line, is associated with the registration of a single kernel, and each of these lines is independent of one another and can occur in any order, provided that the kernel parameters of each line occur in the same order--kernel ID, followed by datatype, followed by function name, followed by storage preference boolean (i.e., whether the microkernel prefers row storage). The last argument of the function call is the address of the context being updated, `cntx`. Notice that we are registering microkernels written for another type of hardware, `bararch`, because in our hypothetical universe `bararch` is very similar to `fooarch` and so we recycle the code between the two configurations. After the function returns, the context contains pointers to optimized double-precision level-3 real microkernels. Note that the context will still contain reference microkernels for single-precision real and complex, and double-precision complex computation, as those kernels were not updated. _Note:_ Currently, BLIS only allows the kernel developer to signal a preference (row or column) for `gemm` microkernels. The preference of the `gemmtrsm` and `trsm` microkernels can (and must) be set, but are ignored by the framework during execution. 
@@ -233,7 +233,7 @@ _**Memory alignment.**_ BLIS implements memory alignment internally, rather than ``` The value `BLIS_STACK_BUF_ALIGN_SIZE` defines the alignment of stack memory used as temporary internal buffers, such as for output matrices to the microkernel when computing edge cases. (See [implementation notes](KernelsHowTo#implementation-notes-for-gemm) for the `gemm` microkernel for details.) This value defaults to `BLIS_SIMD_ALIGN_SIZE`, which defaults to `BLIS_SIMD_SIZE`. -The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. +The value `BLIS_HEAP_ADDR_ALIGN_SIZE` defines the alignment used when allocating memory via the `malloc()` function defined by `BLIS_MALLOC_USER`. Setting this value to `BLIS_SIMD_ALIGN_SIZE` may speed up certain level-1v and -1f kernels. The value `BLIS_HEAP_STRIDE_ALIGN_SIZE` defines the alignment used for so-called "leading dimensions" (i.e. column strides for column-stored matrices, and row strides for row-stored matrices) when creating BLIS matrices via the object-based API (e.g. `bli_obj_create()`). While setting `BLIS_HEAP_ADDR_ALIGN_SIZE` guarantees alignment for the first column (or row), creating a matrix with certain dimension values (_m_ and _n_) may cause subsequent columns (or rows) to be misaligned. Setting this value to `BLIS_SIMD_ALIGN_SIZE` is usually desirable. Additional alignment may or may not be beneficial. @@ -243,7 +243,7 @@ The value `BLIS_POOL_ADDR_ALIGN_SIZE` defines the alignment used when allocating ### make_defs.mk -The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library. +The `make_defs.mk` file primarily contains compiler and compiler flag definitions used by `make` when building a BLIS library. The format of the file is mostly self-explanatory. 
However, we will expound on the contents here, using the `make_defs.mk` file for the `haswell` configuration as an example: ```make @@ -301,7 +301,7 @@ _**Debugging flags.**_ The `CDBGFLAGS` variable should be assigned to contain fl _**Optimization flags.**_ The `COPTFLAGS` variable should be assigned any flags relating to general compiler optimization. Usually this takes the form of `-O2` or `-O3`, but more specific optimization flags may be included as well, such as `-fomit-frame-pointer`. Note that, as with `CDBGFLAGS`, `COPTFLAGS` is conditionally assigned based on the value of `$(DEBUG_TYPE)`. A separate `CKOPTFLAGS` variable tracks optimizations flags used when compiling kernels. For most configurations, `CKOPTFLAGS` is assigned as a copy of `COPTFLAGS`, but if the kernel developer needs different optimization flags to be applied when compiling kernel source code, `CKOPTFLAGS` should be set accordingly. -_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`. +_**Vectorization flags.**_ The second-to-last block sets the `CVECFLAGS`, which should be assigned any flags that must be given to the compiler in order to enable use of a vector instruction set needed or assumed by the kernel source code. Also, if you wish to enable automatic use of certain instruction sets (e.g. `-mfpmath=sse` for many Intel architectures), this is where you should set those flags. These flags often differ among compiler families, especially between `icc` and `gcc`/`clang`. 
_**Variable storage/renaming.**_ Finally, the last statement commits the variables defined in the file to "storage". That is, they are copied to variable names that contain `THIS_CONFIG` as a suffix. This allows the variables for one configuration to co-exist with variables of another configuration. @@ -403,7 +403,7 @@ Some sub-configurations, for various reasons, do not rely on their own set of ke excavator: excavator/piledriver steamroller: steamroller/piledriver ``` -Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner. +Here, the first line (reading from left-to-right) defines the `excavator` singleton family as containing only itself, the `excavator` sub-configuration, and also specifies that this sub-configuration must have access to the `piledriver` kernel set. The second line defines the `steamroller` singleton family in a similar manner. **Note:** Specifying non-native kernel sets via the `/` character is only allowed when defining singleton configuration families. They may NOT appear in the definitions of umbrella families! When an umbrella family includes a singleton family that is defined to require non-native kernels, this will be accounted for during the parsing of the `config_registry` file. @@ -464,7 +464,7 @@ configure: skx: skx configure: steamroller: steamroller configure: x86_64: haswell sandybridge penryn zen excavator steamroller piledriver bulldozer generic ``` -This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically. +This simply lists the sub-configurations associated with each defined configuration family (singleton or umbrella). Note that they are sorted alphabetically. 
Next, the kernel list (actually, all kernel lists) is printed: ``` @@ -546,7 +546,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the 2. _**Add support within the framework source code.**_ We also need to make a minor update to the framework to support the new kernels--specifically, to pull in the kernels' function prototypes. - **`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file: + **`frame/include/bli_arch_config.h`**. When adding support for the `knl` kernel set to the framework, we must modify this file to `#include` the `bli_kernels_knl.h` header file: ```c #ifdef BLIS_KERNELS_KNL #include "bli_kernels_knl.h" @@ -557,7 +557,7 @@ Adding support for a new set of kernels in BLIS is easy and can be done via the ## Adding a new configuration family -Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set. +Adding support for a new umbrella configuration family in BLIS is fairly straightforward and can be done via the following steps. The hypothetical examples used in these steps assume you are trying to create a new configuration family `intelavx` that supports only Intel microarchitectures that support the Intel AVX instruction set. @@ -633,7 +633,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f ``` THIS_CONFIG := knl ``` - and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. 
Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. + and while we're editing the file, we can make any other changes to compiler flags we wish (if any). Similarly, the `bli_family_knl.h` header file should be updated as needed. Since the number of vector registers and the vector register size on `knl` differ from the defaults, we must explicitly set them. (The role of these parameters was explained in a [previous section](ConfigurationHowTo.md#bli_family_h).) Furthermore, provided that a macro `BLIS_NO_HBWMALLOC` is not set, we use a different implementation of `malloc()` and `free()` and `#include` that implementation's header file. ```c #define BLIS_SIMD_NUM_REGISTERS 32 #define BLIS_SIMD_SIZE 64 @@ -711,7 +711,7 @@ Adding support for a new-subconfiguration to BLIS is similar to adding support f #include "bli_family_knl.h" #endif ``` - As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.) + As before with umbrella families, the `BLIS_FAMILY_KNL` macro is automatically defined by the build system for whatever family was targeted by `configure`. (That is, if the user runs `./configure foobar`, the C preprocessor macro `BLIS_FAMILY_FOOBAR` will be defined.) 
diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 5fdfdb91ef..b1b7dea87d 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -61,7 +61,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -98,7 +98,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -135,7 +135,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -175,7 +175,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) \ cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -215,7 +215,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -257,7 +257,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -295,7 +295,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -329,7 +329,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -365,7 +365,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -400,7 +400,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ diff --git a/frame/1/other/packv/bli_packv_unb_var1.c b/frame/1/other/packv/bli_packv_unb_var1.c index 23b370949d..ca1323b58c 100644 --- a/frame/1/other/packv/bli_packv_unb_var1.c +++ b/frame/1/other/packv/bli_packv_unb_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/frame/1/other/unpackv/bli_unpackv_unb_var1.c b/frame/1/other/unpackv/bli_unpackv_unb_var1.c index 5dc1101b6f..43c9a266cb 100644 --- a/frame/1/other/unpackv/bli_unpackv_unb_var1.c +++ b/frame/1/other/unpackv/bli_unpackv_unb_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ { \ const num_t dt = PASTEMAC(ch,type); \ \ - PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index a8f9e844ac..921534f375 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -101,7 +101,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ @@ -180,7 +180,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. 
*/ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ @@ -239,7 +239,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ @@ -296,7 +296,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ @@ -372,7 +372,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt_r, kerid, cntx ); \ + PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ @@ -430,7 +430,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ @@ -507,7 +507,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ /* Query the context for the operation's kernel address. 
*/ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ f( \ diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index 332ff5af2c..c53d490179 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -65,7 +65,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -109,7 +109,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -154,7 +154,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -204,7 +204,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ @@ -254,7 +254,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ \ - PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_l1f_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,opname,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ f \ ( \ diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c index f2ce3c8d7e..c979f082aa 100644 --- a/frame/1m/bli_l1m_unb_var1.c +++ b/frame/1m/bli_l1m_unb_var1.c @@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -197,7 +197,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -310,7 +310,7 @@ void PASTEMAC(ch,opname) \ if ( bli_is_zeros( uplox_eff ) ) return; \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. */ \ if ( bli_is_dense( uplox_eff ) ) \ @@ -423,7 +423,7 @@ void PASTEMAC(ch,opname) \ conjx = bli_extract_conj( transx ); \ \ /* Query the kernel needed for this operation. */ \ - PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_l1v_ker_dt( dt, kerid, cntx ); \ + PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Handle dense and upper/lower storage cases separately. 
*/ \ if ( bli_is_dense( uplox_eff ) ) \ diff --git a/frame/1m/packm/bli_packm_cxk.c b/frame/1m/packm/bli_packm_cxk.c index ea0418cae5..55d4862cbe 100644 --- a/frame/1m/packm/bli_packm_cxk.c +++ b/frame/1m/packm/bli_packm_cxk.c @@ -54,15 +54,15 @@ void PASTEMAC(ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = ( ukr_t )( BLIS_PACKM_0XK_KER + panel_dim_max ); \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/packm/bli_packm_cxk_1er.c index e583c8a82d..835e476c60 100644 --- a/frame/1m/packm/bli_packm_cxk_1er.c +++ b/frame/1m/packm/bli_packm_cxk_1er.c @@ -54,15 +54,15 @@ void PASTEMAC(ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = ( ukr_t )( BLIS_PACKM_0XK_KER + panel_dim_max ); \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. 
*/ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/unpackm/bli_unpackm_cxk.c index 4423c41a27..bc002c453d 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.c +++ b/frame/1m/unpackm/bli_unpackm_cxk.c @@ -48,15 +48,15 @@ void PASTEMAC(ch,opname) \ cntx_t* cntx \ ) \ { \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = ( ukr_t )( BLIS_UNPACKM_0XK_KER + panel_dim ); \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the unpackm kernel corresponding to the current panel dimension, or kernel id. If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_unpackm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ diff --git a/frame/2/gemv/bli_gemv_unb_var1.c b/frame/2/gemv/bli_gemv_unb_var1.c index 3f5681d2b7..840b96901a 100644 --- a/frame/2/gemv/bli_gemv_unb_var1.c +++ b/frame/2/gemv/bli_gemv_unb_var1.c @@ -70,7 +70,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < n_iter; ++i ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unb_var2.c b/frame/2/gemv/bli_gemv_unb_var2.c index 8166aa4175..7fc4fcfe42 100644 --- a/frame/2/gemv/bli_gemv_unb_var2.c +++ b/frame/2/gemv/bli_gemv_unb_var2.c @@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < n_iter; ++i ) \ { \ diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index e392e830e7..0dceed4cf7 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -71,7 +71,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index fe7702e4c3..4c43657adb 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -100,7 +100,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ for ( i = 0; i < n_iter; i += f ) \ diff --git a/frame/2/ger/bli_ger_unb_var1.c b/frame/2/ger/bli_ger_unb_var1.c index d6cda277e4..d8ddd12471 100644 --- a/frame/2/ger/bli_ger_unb_var1.c +++ b/frame/2/ger/bli_ger_unb_var1.c @@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/ger/bli_ger_unb_var2.c b/frame/2/ger/bli_ger_unb_var2.c index 1590bfe5e9..9c49e336bd 100644 --- a/frame/2/ger/bli_ger_unb_var2.c +++ b/frame/2/ger/bli_ger_unb_var2.c @@ -61,7 +61,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( j = 0; j < n; ++j ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var1.c b/frame/2/hemv/bli_hemv_unb_var1.c index ea5d478be2..71c27a326a 100644 --- a/frame/2/hemv/bli_hemv_unb_var1.c +++ b/frame/2/hemv/bli_hemv_unb_var1.c @@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointers. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var2.c b/frame/2/hemv/bli_hemv_unb_var2.c index 1f73465175..3753c8d3bc 100644 --- a/frame/2/hemv/bli_hemv_unb_var2.c +++ b/frame/2/hemv/bli_hemv_unb_var2.c @@ -123,7 +123,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var3.c b/frame/2/hemv/bli_hemv_unb_var3.c index 6573e59fc8..d592251d5f 100644 --- a/frame/2/hemv/bli_hemv_unb_var3.c +++ b/frame/2/hemv/bli_hemv_unb_var3.c @@ -122,8 +122,8 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointers. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unb_var4.c b/frame/2/hemv/bli_hemv_unb_var4.c index deabc3ab41..10cf953b64 100644 --- a/frame/2/hemv/bli_hemv_unb_var4.c +++ b/frame/2/hemv/bli_hemv_unb_var4.c @@ -122,7 +122,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointers. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unf_var1.c b/frame/2/hemv/bli_hemv_unf_var1.c index d36dc00988..a449909a56 100644 --- a/frame/2/hemv/bli_hemv_unf_var1.c +++ b/frame/2/hemv/bli_hemv_unf_var1.c @@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ \ /* Query the context for the kernel function pointer and fusing factor. 
*/ \ - kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ + kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ \ for ( i = 0; i < m; i += f ) \ diff --git a/frame/2/hemv/bli_hemv_unf_var1a.c b/frame/2/hemv/bli_hemv_unf_var1a.c index 31ab1515f6..d0af57393a 100644 --- a/frame/2/hemv/bli_hemv_unf_var1a.c +++ b/frame/2/hemv/bli_hemv_unf_var1a.c @@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ + kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/hemv/bli_hemv_unf_var3.c b/frame/2/hemv/bli_hemv_unf_var3.c index d8db9bc78a..baaff098d8 100644 --- a/frame/2/hemv/bli_hemv_unf_var3.c +++ b/frame/2/hemv/bli_hemv_unf_var3.c @@ -130,7 +130,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxaxpyf_ker_ft) kfp_xf; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_xf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ + kfp_xf = bli_cntx_get_ukr_dt( dt, BLIS_DOTXAXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_XF, cntx ); \ \ for ( i = 0; i < m; i += f ) \ diff --git a/frame/2/hemv/bli_hemv_unf_var3a.c b/frame/2/hemv/bli_hemv_unf_var3a.c index 54ab0f6ce9..55c1929ffb 100644 --- a/frame/2/hemv/bli_hemv_unf_var3a.c +++ b/frame/2/hemv/bli_hemv_unf_var3a.c @@ -121,7 +121,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotaxpyv_ker_ft) kfp_vf; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_vf = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ + kfp_vf = bli_cntx_get_ukr_dt( dt, BLIS_DOTAXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her/bli_her_unb_var1.c b/frame/2/her/bli_her_unb_var1.c index e7f7186805..8cd6bd3979 100644 --- a/frame/2/her/bli_her_unb_var1.c +++ b/frame/2/her/bli_her_unb_var1.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her/bli_her_unb_var2.c b/frame/2/her/bli_her_unb_var2.c index 4b39e1df00..f68798dce6 100644 --- a/frame/2/her/bli_her_unb_var2.c +++ b/frame/2/her/bli_her_unb_var2.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var1.c b/frame/2/her2/bli_her2_unb_var1.c index 37423bfcb0..b5c182639e 100644 --- a/frame/2/her2/bli_her2_unb_var1.c +++ b/frame/2/her2/bli_her2_unb_var1.c @@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var2.c b/frame/2/her2/bli_her2_unb_var2.c index 22d6de07a1..602e922a8e 100644 --- a/frame/2/her2/bli_her2_unb_var2.c +++ b/frame/2/her2/bli_her2_unb_var2.c @@ -113,7 +113,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var3.c b/frame/2/her2/bli_her2_unb_var3.c index 297b9b702a..1d5872d5d2 100644 --- a/frame/2/her2/bli_her2_unb_var3.c +++ b/frame/2/her2/bli_her2_unb_var3.c @@ -113,7 +113,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unb_var4.c b/frame/2/her2/bli_her2_unb_var4.c index 58adb0e706..922fe7db77 100644 --- a/frame/2/her2/bli_her2_unb_var4.c +++ b/frame/2/her2/bli_her2_unb_var4.c @@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unf_var1.c b/frame/2/her2/bli_her2_unf_var1.c index a0aec48f71..3824880c6b 100644 --- a/frame/2/her2/bli_her2_unf_var1.c +++ b/frame/2/her2/bli_her2_unf_var1.c @@ -106,7 +106,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ \ /* Query the context for the kernel function pointer. 
*/ \ - kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ + kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/her2/bli_her2_unf_var4.c b/frame/2/her2/bli_her2_unf_var4.c index 3dea31d53e..6b2b0e9acc 100644 --- a/frame/2/her2/bli_her2_unf_var4.c +++ b/frame/2/her2/bli_her2_unf_var4.c @@ -114,7 +114,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpy2v_ker_ft) kfp_2v; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_2v = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPY2V_KER, cntx ); \ + kfp_2v = bli_cntx_get_ukr_dt( dt, BLIS_AXPY2V_KER, cntx ); \ \ for ( i = 0; i < m; ++i ) \ { \ diff --git a/frame/2/trmv/bli_trmv_unb_var1.c b/frame/2/trmv/bli_trmv_unb_var1.c index 31bfa6a838..367a34e6c1 100644 --- a/frame/2/trmv/bli_trmv_unb_var1.c +++ b/frame/2/trmv/bli_trmv_unb_var1.c @@ -83,7 +83,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotv_ker_ft) kfp_dv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trmv/bli_trmv_unb_var2.c b/frame/2/trmv/bli_trmv_unb_var2.c index 00d4d95f3b..fa21776b30 100644 --- a/frame/2/trmv/bli_trmv_unb_var2.c +++ b/frame/2/trmv/bli_trmv_unb_var2.c @@ -83,7 +83,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. 
*/ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trmv/bli_trmv_unf_var1.c b/frame/2/trmv/bli_trmv_unf_var1.c index 6dc3cea362..9e576fc777 100644 --- a/frame/2/trmv/bli_trmv_unf_var1.c +++ b/frame/2/trmv/bli_trmv_unf_var1.c @@ -91,7 +91,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trmv/bli_trmv_unf_var2.c b/frame/2/trmv/bli_trmv_unf_var2.c index 8bbd518201..0525959353 100644 --- a/frame/2/trmv/bli_trmv_unf_var2.c +++ b/frame/2/trmv/bli_trmv_unf_var2.c @@ -90,7 +90,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trsv/bli_trsv_unb_var1.c b/frame/2/trsv/bli_trsv_unb_var1.c index c7493e33dc..2f24b10a8d 100644 --- a/frame/2/trsv/bli_trsv_unb_var1.c +++ b/frame/2/trsv/bli_trsv_unb_var1.c @@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotv_ker_ft) kfp_tv; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_tv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + kfp_tv = bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. 
*/ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trsv/bli_trsv_unb_var2.c b/frame/2/trsv/bli_trsv_unb_var2.c index a78e7eef04..1a8e81634a 100644 --- a/frame/2/trsv/bli_trsv_unb_var2.c +++ b/frame/2/trsv/bli_trsv_unb_var2.c @@ -94,7 +94,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyv_ker_ft) kfp_av; \ \ /* Query the context for the kernel function pointer. */ \ - kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ if ( bli_is_upper( uploa_trans ) ) \ diff --git a/frame/2/trsv/bli_trsv_unf_var1.c b/frame/2/trsv/bli_trsv_unf_var1.c index 3b03b43e5d..824f26d151 100644 --- a/frame/2/trsv/bli_trsv_unf_var1.c +++ b/frame/2/trsv/bli_trsv_unf_var1.c @@ -103,7 +103,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + kfp_df = bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. */ \ diff --git a/frame/2/trsv/bli_trsv_unf_var2.c b/frame/2/trsv/bli_trsv_unf_var2.c index 10741d2918..bd1f8e3b04 100644 --- a/frame/2/trsv/bli_trsv_unf_var2.c +++ b/frame/2/trsv/bli_trsv_unf_var2.c @@ -102,7 +102,7 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,axpyf_ker_ft) kfp_af; \ \ /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_af = bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + kfp_af = bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_AF, cntx ); \ \ /* We reduce all of the possible cases down to just lower/upper. 
*/ \ diff --git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c index bde30c5277..1d46087997 100644 --- a/frame/3/bli_l3_schema.c +++ b/frame/3/bli_l3_schema.c @@ -57,7 +57,7 @@ void bli_l3_set_schemas // projection of dt to query the preference of the corresponding native // real-domain microkernel. This is what ultimately determines which // variant of 1m is applicable. - if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ) ) { schema_a = BLIS_PACKED_ROW_PANELS_1E; schema_b = BLIS_PACKED_COL_PANELS_1R; diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index 72ec405ab0..7e37e1f22e 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -63,7 +63,7 @@ err_t bli_gemmsup // Return early if a microkernel preference-induced transposition would // have been performed and shifted the dimensions outside of the space // of sup-handled problems. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( c, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( c, BLIS_GEMM_VIR_UKR, cntx ) ) { const num_t dt = bli_obj_dt( c ); const dim_t m = bli_obj_length( c ); diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index e54e01d7c7..3da3954fa7 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -85,7 +85,7 @@ err_t bli_gemmsup_int const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( stor_id ), cntx ); const bool is_primary = ( row_pref ? 
is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); @@ -259,7 +259,7 @@ err_t bli_gemmtsup_int const bool is_rcc_crc_ccr_ccc = !is_rrr_rrc_rcr_crr; const num_t dt = bli_obj_dt( c ); - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( stor_id ), cntx ); const bool is_primary = ( row_pref ? is_rrr_rrc_rcr_crr : is_rcc_crc_ccr_ccc ); diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index 7c315192d5..ead9925e68 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -127,7 +127,7 @@ BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases cntx_t* cntx ) { - const bool row_pref = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, *eff_id, cntx ); + const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( *eff_id ), cntx ); // Handle row- and column-preferrential kernels separately. if ( row_pref ) diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 4ff45036fe..cd8827bd9c 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -99,7 +99,7 @@ void bli_gemm_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 6de361194d..874a12439c 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -201,7 +201,7 @@ void bli_gemm_ker_var2 // column-stored as well. 
char ct[ BLIS_STACK_BUF_MAX_SIZE ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_UKR, cntx ); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? MR : 1 ); char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index e257cdf287..a16156c157 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -173,7 +173,7 @@ mddm_t bli_gemm_md_ccr // preference. const num_t dt = BLIS_REAL | bli_obj_comp_prec( c ); const bool row_pref - = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx ); + = bli_cntx_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, *cntx ); // We can only perform this case of mixed-domain gemm, C += A*B where // B is real, if the microkernel prefers column output. If it prefers @@ -236,8 +236,8 @@ mddm_t bli_gemm_md_ccr // Use the default pack schemas in the objects. - // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx ); + // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) + func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. @@ -278,7 +278,7 @@ mddm_t bli_gemm_md_crc // preference. const num_t dt = BLIS_REAL | bli_obj_comp_prec( c ); const bool col_pref - = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx ); + = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, *cntx ); // We can only perform this case of mixed-domain gemm, C += A*B where // A is real, if the microkernel prefers row output. If it prefers @@ -341,8 +341,8 @@ mddm_t bli_gemm_md_crc // Use the default pack schemas in the objects. 
- // static func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, *cntx ); + // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) + func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. @@ -430,10 +430,10 @@ mddm_t bli_gemm_md_rcc const num_t dt_complex = bli_obj_dt( a ); cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex ); - func_t* cntx_funcs = bli_cntx_packm_kers_buf( *cntx ); - func_t* cntx_1m_funcs = bli_cntx_packm_kers_buf( cntx_1m ); + func_t* cntx_funcs = bli_cntx_ukrs_buf( *cntx ); + func_t* cntx_1m_funcs = bli_cntx_ukrs_buf( cntx_1m ); - for ( dim_t i = 0; i <= BLIS_PACKM_31XK_KER; ++i ) + for ( dim_t i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) { cntx_funcs[ i ] = cntx_1m_funcs[ i ]; } diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c index bbd9190a9a..a4797ad4fd 100644 --- a/frame/3/gemm/bli_gemm_md_c2r_ref.c +++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c @@ -57,8 +57,8 @@ void PASTEMAC2(ch,opname,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2.c b/frame/3/gemm/other/bli_gemm_ker_var2.c index 62d2a9e04b..c5cf935b83 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2.c @@ -198,7 +198,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ 
BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2rr.c b/frame/3/gemm/other/bli_gemm_ker_var2rr.c index 289e4ddf5b..946e3048c2 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2rr.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2rr.c @@ -199,7 +199,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemm/other/bli_gemm_ker_var2sl.c b/frame/3/gemm/other/bli_gemm_ker_var2sl.c index d75838fb4e..f5159bbb92 100644 --- a/frame/3/gemm/other/bli_gemm_ker_var2sl.c +++ b/frame/3/gemm/other/bli_gemm_ker_var2sl.c @@ -199,7 +199,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index 2a9d91759b..d53838470a 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -86,7 +86,7 @@ void bli_gemmt_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index fea4efec0a..3aedc6e9a0 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index 4b849bbc6d..b3a9fe8a1e 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c index 0bf4b1a0fb..ece351ef76 100644 --- a/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_l_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c index 1655bea555..f00e769b53 100644 --- a/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/other/bli_gemmt_u_ker_var2.c @@ -183,7 +183,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 9835de9c15..15460125da 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -117,7 +117,7 @@ void bli_hemm_front // micro-kernel to access elements of C in its preferred manner. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_toggle_conj( &a_local ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index be94c44c1b..8108b607fc 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -117,7 +117,7 @@ void bli_symm_front // micro-kernel to access elements of C in its preferred manner. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &b_local ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index 1de28958eb..d973b6eb6b 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -135,7 +135,7 @@ void bli_trmm_front // of row- vs. column storage breaks down. //if ( !bli_obj_is_1x1( &c_local ) ) // NOTE: This conditional should NOT // be enabled. See issue #342 comments. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c index 9ab64e470d..706e14d43b 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c index 6fef4e0c96..699892635f 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ @@ -337,7 +337,7 @@ void PASTEMAC(ch,varname) \ dim_t jr_inc; \ \ /* Use round-robin assignment of micropanels to threads in the 2nd loop for - the initial rectangular region of C (if it exists). + the initial rectangular region of C (if it exists). NOTE: Parallelism in the 1st loop is disabled for now. 
*/ \ bli_thread_range_jrir_rr( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ diff --git a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c index e0d9cc75f7..eb55775934 100644 --- a/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_ll_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c index 0abcfd77ae..738711f58d 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c index 8c505f88a7..df53b2011d 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c index 3bb0deaa30..fbcd4f9aa3 100644 --- a/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_lu_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c index 672caaa052..7775d92173 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c index 9d9e3809cd..c1354a962b 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c index 8bac0ec4aa..7cf8eeef0e 100644 --- a/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_rl_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c index fc2991b132..1d0f317083 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2.c @@ -175,7 +175,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c index 00a0dc3f0c..d8ae4f8bbe 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2rr.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c index 889fa49fa7..c05a082d40 100644 --- a/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c +++ b/frame/3/trmm/other/bli_trmm_ru_ker_var2sl.c @@ -178,7 +178,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 3b97539603..9cd04963b4 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -127,7 +127,7 @@ void bli_trmm3_front // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. 
- if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_toggle_side( &side ); bli_obj_induce_trans( &a_local ); diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index b503efa5bf..5f53e6b38b 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 55ceafb91d..9bdc25a32a 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 23d4dd7289..eabbcf8494 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 71381707c4..ce7bd2e19d 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c index dc57eac5f2..26da1b004c 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2.c @@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c index 38768242ec..607b40e548 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2rr.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c index 78ffe17585..3299b5f8e8 100644 --- a/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c +++ b/frame/3/trsm/other/bli_trsm_ll_ker_var2sl.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c index 7c4cea9763..b02ff09553 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2.c @@ -179,7 +179,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c index 8d050c62b0..e78cef4772 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2rr.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c index b49a1144ee..93cac371a1 100644 --- a/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c +++ b/frame/3/trsm/other/bli_trsm_lu_ker_var2sl.c @@ -182,7 +182,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c index a11936389c..1e903c3c1e 100644 --- a/frame/3/trsm/other/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_rl_ker_var2.c @@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c index 7ad1e42714..a44d64f459 100644 --- a/frame/3/trsm/other/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/other/bli_trsm_ru_ker_var2.c @@ -184,7 +184,7 @@ void PASTEMAC(ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 3a698871b1..dfb1dcaa85 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -43,7 +43,7 @@ void bli_cntx_clear( cntx_t* cntx ) // ----------------------------------------------------------------------------- -void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) +void bli_cntx_set_blkszs( cntx_t* cntx, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use @@ -55,1412 +55,308 @@ void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ) void bli_cntx_set_blkszs ( - ind_t method = BLIS_NAT, - dim_t n_bs, + cntx_t* cntx, bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, - ... - cntx_t* cntx - ); - - void bli_cntx_set_blkszs - ( - ind_t method != BLIS_NAT, - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, dim_t def_scalr0, dim_t max_scalr0, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, dim_t def_scalr1, dim_t max_scalr1, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, dim_t def_scalr2, dim_t max_scalr2, - ... 
- cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bmults = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_bs ); - - // Handle native and induced method cases separately. - if ( method == BLIS_NAT ) - { - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, - // - the bszid_t of the multiple we need to associate with - // the blksz_t object. - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - bmults[ i ] = bm_id; - } - } - else // if induced method execution was indicated - { - // Process n_bs tuples. 
- for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object, - // - the bszid_t of the multiple we need to associate with - // the blksz_t object, - // - the scalars we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes (for default - // and maximum blocksizes). - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - double dsclr = ( double )va_arg( args, double ); - double msclr = ( double )va_arg( args, double ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - bmults[ i ] = bm_id; - dsclrs[ i ] = dsclr; - msclrs[ i ] = msclr; - } - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Save the execution type into the context. - bli_cntx_set_method( method, cntx ); - - // Query the context for the addresses of: - // - the blocksize object array - // - the blocksize multiple array - - blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx ); - bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. - - // Handle native and induced method cases separately. - if ( method == BLIS_NAT ) - { - // Process each blocksize id tuple provided. 
- for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; - - blksz_t* blksz = blkszs[ i ]; - - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; - - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. Do the same - // for the blocksize multiple id. - //cntx_blkszs[ bs_id ] = *blksz; - //bli_blksz_copy( blksz, cntx_blksz ); - bli_blksz_copy_if_pos( blksz, cntx_blksz ); - - // Copy the blocksize multiple id into the context. - cntx_bmults[ bs_id ] = bm_id; - } - } - else - { - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t pointer, blocksize - // multiple id, and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - bszid_t bm_id = bmults[ i ]; - double dsclr = dsclrs[ i ]; - double msclr = msclrs[ i ]; - - blksz_t* blksz = blkszs[ i ]; - - blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; - - // Copy the real domain values of the source blksz_t object into - // the context, duplicating into the complex domain fields. - bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_FLOAT, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DOUBLE, cntx_blksz ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz, BLIS_DCOMPLEX, cntx_blksz ); - - // If the default blocksize scalar is non-unit, we need to scale - // the complex domain default blocksizes. - if ( dsclr != 1.0 ) - { - // Scale the complex domain default blocksize values in the - // blocksize object. - bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_def( 1, ( dim_t )dsclr, BLIS_DCOMPLEX, cntx_blksz ); - } - - // Similarly, if the maximum blocksize scalar is non-unit, we need - // to scale the complex domain maximum blocksizes. 
- if ( msclr != 1.0 ) - { - // Scale the complex domain maximum blocksize values in the - // blocksize object. - bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_SCOMPLEX, cntx_blksz ); - bli_blksz_scale_max( 1, ( dim_t )msclr, BLIS_DCOMPLEX, cntx_blksz ); - } - - // Copy the blocksize multiple id into the context. - cntx_bmults[ bs_id ] = bm_id; - } - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( blkszs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bszids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bmults ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( dsclrs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( msclrs ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ) -{ - /* Example prototypes: - - void bli_gks_cntx_set_ind_blkszs - ( - ind_t method != BLIS_NAT, - num_t dt, - dim_t n_bs, - bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0, - bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1, - bszid_t bs2_id, dim_t def_scalr2, dim_t max_scalr2, - ... - cntx_t* cntx - ); - - NOTE: This function modifies an existing context that is presumed - to have been initialized for native execution. - */ - - va_list args; - dim_t i; - err_t r_val; - - // Project the given datatype to the real domain. This will be used later on. - num_t dt_real = bli_dt_proj_to_real( dt ); - - // Return early if called with BLIS_NAT. - if ( method == BLIS_NAT ) return; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - double* dsclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - double* msclrs = bli_malloc_intl( n_bs * sizeof( double ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_bs ); - - { - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the scalars we wish to apply to the real blocksizes to - // come up with the induced complex blocksizes (for default - // and maximum blocksizes). - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - double dsclr = ( double )va_arg( args, double ); - double msclr = ( double )va_arg( args, double ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - dsclrs[ i ] = dsclr; - msclrs[ i ] = msclr; - } - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Save the execution type into the context. - bli_cntx_set_method( method, cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - { - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blocksize multiple id, - // and blocksize scalar. 
- bszid_t bs_id = bszids[ i ]; - double dsclr = dsclrs[ i ]; - double msclr = msclrs[ i ]; - - //blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; - - // Query the context for the blksz_t object assoicated with the - // current blocksize id, and also query the object corresponding - // to the blocksize multiple. - blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); - - // Copy the real domain value of the blksz_t object into the - // corresponding complex domain slot of the same object. - bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz ); - - // If the default blocksize scalar is non-unit, we need to scale - // the complex domain default blocksizes. - if ( dsclr != 1.0 ) - { - // Scale the default blocksize value corresponding to the given - // datatype. - bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz ); - } - - // Similarly, if the maximum blocksize scalar is non-unit, we need - // to scale the complex domain maximum blocksizes. - if ( msclr != 1.0 ) - { - // Scale the maximum blocksize value corresponding to the given - // datatype. - bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz ); - } - } - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( bszids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( dsclrs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_ind_blkszs(): " ); - #endif - bli_free_intl( msclrs ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 microkernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // microkernels across all datatypes. 
- - /* Example prototypes: - - void bli_cntx_set_l3_nat_ukrs - ( - dim_t n_ukrs, - l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, bool pref0, - l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, bool pref1, - l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, bool pref2, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_ukrs ); - - // Process n_ukrs tuples. - for ( i = 0; i < n_ukrs; ++i ) - { - // Here, we query the variable argument list for: - // - the l3ukr_t of the kernel we're about to process, - // - the datatype of the kernel, - // - the kernel function pointer, and - // - the kernel function storage preference - // that we need to store to the context. - - // NOTE: Though bool_t is no longer used, the following comment is - // being kept for historical reasons. - // The type that we pass into the va_arg() macro for the ukr - // preference matters. Using 'bool_t' may cause breakage on 64-bit - // systems that define int as 32 bits and long int and pointers as - // 64 bits. The problem is that TRUE or FALSE are defined as 1 and - // 0, respectively, and when "passed" into the variadic function - // they come with no contextual typecast. 
Thus, default rules of - // argument promotion kick in to treat these integer literals as - // being of type int. Thus, we need to let va_arg() treat the TRUE - // or FALSE value as an int, even if we cast it to and store it - // within a bool_t afterwards. - const l3ukr_t ukr_id = ( l3ukr_t )va_arg( args, l3ukr_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); - const bool ukr_pref = ( bool )va_arg( args, int ); - - // Store the values in our temporary arrays. - ukr_ids[ i ] = ukr_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - ukr_prefs[ i ] = ukr_pref; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 virtual ukernel func_t array - // - the l3 native ukernel func_t array - // - the l3 native ukernel preferences array - func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); - func_t* cntx_l3_nat_ukrs = bli_cntx_l3_nat_ukrs_buf( cntx ); - mbool_t* cntx_l3_nat_ukrs_prefs = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current ukernel id, ukernel datatype, ukernel function - // pointer, and ukernel preference. - const l3ukr_t ukr_id = ukr_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void_fp ukr_fp = ukr_fps[ i ]; - const bool ukr_pref = ukr_prefs[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. 
- func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ]; - func_t* ukrs = &cntx_l3_nat_ukrs[ ukr_id ]; - mbool_t* prefs = &cntx_l3_nat_ukrs_prefs[ ukr_id ]; - - // Store the ukernel function pointer and preference values into - // the context. Notice that we redundantly store the native - // ukernel address in both the native and virtual ukernel slots - // in the context. This is standard practice when creating a - // native context. (Induced method contexts will overwrite the - // virtual function pointer with the address of the appropriate - // virtual ukernel.) - bli_func_set_dt( ukr_fp, ukr_dt, vukrs ); - bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); - } - - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_fps ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_nat_ukrs(): " ); - #endif - bli_free_intl( ukr_prefs ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 virtual microkernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // microkernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l3_vir_ukrs - ( - dim_t n_ukrs, - l3ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, - l3ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, - l3ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, - ... 
- cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - l3ukr_t* ukr_ids = bli_malloc_intl( n_ukrs * sizeof( l3ukr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - void_fp* ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_ukrs ); - - // Process n_ukrs tuples. - for ( i = 0; i < n_ukrs; ++i ) - { - // Here, we query the variable argument list for: - // - the l3ukr_t of the kernel we're about to process, - // - the datatype of the kernel, and - // - the kernel function pointer. - // that we need to store to the context. - const l3ukr_t ukr_id = ( l3ukr_t )va_arg( args, l3ukr_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ukr_ids[ i ] = ukr_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 virtual ukernel func_t array - func_t* cntx_l3_vir_ukrs = bli_cntx_l3_vir_ukrs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. 
- for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current ukernel id, ukernel datatype, ukernel function - // pointer, and ukernel preference. - const l3ukr_t ukr_id = ukr_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void_fp ukr_fp = ukr_fps[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. - func_t* vukrs = &cntx_l3_vir_ukrs[ ukr_id ]; - - // Store the ukernel function pointer and preference values into - // the context. Notice that we redundantly store the native - // ukernel address in both the native and virtual ukernel slots - // in the context. This is standard practice when creating a - // native context. (Induced method contexts will overwrite the - // virtual function pointer with the address of the appropriate - // virtual ukernel.) - bli_func_set_dt( ukr_fp, ukr_dt, vukrs ); - } - - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_vir_ukrs(): " ); - #endif - bli_free_intl( ukr_fps ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default thresholds for small/unpacked matrix handling. It should - // be called after bli_cntx_init_defaults() so that the context begins - // with default thresholds. - - /* Example prototypes: - - void bli_cntx_set_l3_sup_thresh - ( - dim_t n_thresh, - threshid_t th0_id, blksz_t* blksz0, - threshid_t th1_id, blksz_t* blksz1, - ... - cntx_t* cntx - ); - - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - threshid_t* threshids = bli_malloc_intl( n_thresh * sizeof( threshid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - blksz_t** threshs = bli_malloc_intl( n_thresh * sizeof( blksz_t* ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_thresh ); - - // Process n_thresh tuples. - for ( i = 0; i < n_thresh; ++i ) - { - // Here, we query the variable argument list for: - // - the threshid_t of the threshold we're about to process, - // - the address of the blksz_t object, - threshid_t th_id = ( threshid_t )va_arg( args, threshid_t ); - blksz_t* thresh = ( blksz_t* )va_arg( args, blksz_t* ); - - // Store the values in our temporary arrays. - threshids[ i ] = th_id; - threshs[ i ] = thresh; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the threshold array - blksz_t* cntx_threshs = bli_cntx_l3_sup_thresh_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_thresh; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. 
- threshid_t th_id = threshids[ i ]; - blksz_t* thresh = threshs[ i ]; - - blksz_t* cntx_thresh = &cntx_threshs[ th_id ]; - - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. - //cntx_threshs[ th_id ] = *thresh; - //bli_blksz_copy( thresh, cntx_thresh ); - bli_blksz_copy_if_pos( thresh, cntx_thresh ); - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - bli_free_intl( threshs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_thresh(): " ); - #endif - bli_free_intl( threshids ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 operation handler for small/unpacked matrices. It - // should be called after bli_cntx_init_defaults() so that the context - // begins with default sup handlers across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l3_sup_handlers - ( - dim_t n_ops, - opid_t op0_id, void* handler0_fp, - opid_t op1_id, void* handler1_fp, - opid_t op2_id, void* handler2_fp, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - void** op_fps = bli_malloc_intl( n_ops * sizeof( void* ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_ops ); - - // Process n_ukrs tuples. 
- for ( i = 0; i < n_ops; ++i ) - { - // Here, we query the variable argument list for: - // - the opid_t of the operation we're about to process, - // - the sup handler function pointer - // that we need to store to the context. - const opid_t op_id = ( opid_t )va_arg( args, opid_t ); - void* op_fp = ( void* )va_arg( args, void* ); - - // Store the values in our temporary arrays. - op_ids[ i ] = op_id; - op_fps[ i ] = op_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 small/unpacked handlers array - void** cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each operation id tuple provided. - for ( i = 0; i < n_ops; ++i ) - { - // Read the current operation id and handler function pointer. - const opid_t op_id = op_ids[ i ]; - void* op_fp = op_fps[ i ]; - - // Store the sup handler function pointer into the slot for the - // specified operation id. - cntx_l3_sup_handlers[ op_id ] = op_fp; - } - - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - bli_free_intl( op_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - bli_free_intl( op_fps ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default l3 sup blocksizes. 
It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // blocksizes across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_blkszs - ( - dim_t n_bs, - bszid_t bs0_id, blksz_t* blksz0, - bszid_t bs1_id, blksz_t* blksz1, - bszid_t bs2_id, blksz_t* blksz2, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bszid_t* bszids = bli_malloc_intl( n_bs * sizeof( bszid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - blksz_t** blkszs = bli_malloc_intl( n_bs * sizeof( blksz_t* ), &r_val ); - - // -- Begin variable argument section -- - - // Initialize variable argument environment. - va_start( args, n_bs ); - - // Process n_bs tuples. - for ( i = 0; i < n_bs; ++i ) - { - // Here, we query the variable argument list for: - // - the bszid_t of the blocksize we're about to process, - // - the address of the blksz_t object. - bszid_t bs_id = ( bszid_t )va_arg( args, bszid_t ); - blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - - // Store the values in our temporary arrays. - bszids[ i ] = bs_id; - blkszs[ i ] = blksz; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the blocksize object array - blksz_t* cntx_l3_sup_blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. Notice that the blksz_t* pointers were saved, rather than - // the objects themselves, but we copy the contents of the objects - // when copying into the context. 
- - // Process each blocksize id tuple provided. - for ( i = 0; i < n_bs; ++i ) - { - // Read the current blocksize id, blksz_t* pointer, blocksize - // multiple id, and blocksize scalar. - bszid_t bs_id = bszids[ i ]; - blksz_t* blksz = blkszs[ i ]; - - blksz_t* cntx_l3_sup_blksz = &cntx_l3_sup_blkszs[ bs_id ]; - - // Copy the blksz_t object contents into the appropriate - // location within the context's blksz_t array. - //cntx_l3_sup_blkszs[ bs_id ] = *blksz; - //bli_blksz_copy( blksz, cntx_l3_sup_blksz ); - bli_blksz_copy_if_pos( blksz, cntx_l3_sup_blksz ); - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( blkszs ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_blkszs(): " ); - #endif - bli_free_intl( bszids ); -} - -// ----------------------------------------------------------------------------- - -void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ) -{ - // This function can be called from the bli_cntx_init_*() function for - // a particular architecture if the kernel developer wishes to use - // non-default level-3 microkernels for small/unpacked matrices. It - // should be called after bli_cntx_init_defaults() so that the context - // begins with default sup micro/millikernels across all datatypes. - - /* Example prototypes: - - void bli_cntx_set_l3_sup_kers - ( - dim_t n_ukrs, - stor3_t stor_id0, num_t dt0, void* ukr0_fp, bool pref0, - stor3_t stor_id1, num_t dt1, void* ukr1_fp, bool pref1, - stor3_t stor_id2, num_t dt2, void* ukr2_fp, bool pref2, - ... - cntx_t* cntx - ); - */ - - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - stor3_t* st3_ids = bli_malloc_intl( n_ukrs * sizeof( stor3_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - num_t* ukr_dts = bli_malloc_intl( n_ukrs * sizeof( num_t ), &r_val ); + ..., + -1 + ); + */ - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - void** ukr_fps = bli_malloc_intl( n_ukrs * sizeof( void* ), &r_val ); + // Save the execution type into the context. + bli_cntx_set_method( BLIS_NAT, cntx ); - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bool* ukr_prefs = bli_malloc_intl( n_ukrs * sizeof( bool ), &r_val ); + // Query the context for the addresses of: + // - the blocksize object array + // - the blocksize multiple array + blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx ); + bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx ); - // -- Begin variable argument section -- + // Now that we have the context address, we want to copy the values + // from the temporary buffers into the corresponding buffers in the + // context. // Initialize variable argument environment. - va_start( args, n_ukrs ); + va_list args; + va_start( args, cntx ); - // Process n_ukrs tuples. - for ( i = 0; i < n_ukrs; ++i ) + // Process block sizes until we get a -1. + while ( true ) { + int bs_id0 = va_arg( args, int ); + if ( bs_id0 == -1 ) + break; + // Here, we query the variable argument list for: - // - the stor3_t storage case being assigned to the kernel we're - // about to process, - // - the datatype of the kernel, - // - the kernel function pointer, and - // - the kernel function storage preference - // that we need to store to the context. 
- const stor3_t st3_id = ( stor3_t )va_arg( args, stor3_t ); - const num_t ukr_dt = ( num_t )va_arg( args, num_t ); - void* ukr_fp = ( void* )va_arg( args, void* ); - const bool ukr_pref = ( bool )va_arg( args, int ); + // - the bszid_t of the blocksize we're about to process (already done), + // - the address of the blksz_t object, + // - the bszid_t of the multiple we need to associate with + // the blksz_t object. + bszid_t bs_id = ( bszid_t )bs_id0; + blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); + bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); - // Store the values in our temporary arrays. - st3_ids[ i ] = st3_id; - ukr_dts[ i ] = ukr_dt; - ukr_fps[ i ] = ukr_fp; - ukr_prefs[ i ] = ukr_pref; + // Copy the blksz_t object contents into the appropriate + // location within the context's blksz_t array. Do the same + // for the blocksize multiple id. + //cntx_blkszs[ bs_id ] = *blksz; + //bli_blksz_copy( blksz, cntx_blksz ); + blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; + bli_blksz_copy_if_pos( blksz, cntx_blksz ); + + // Copy the blocksize multiple id into the context. + cntx_bmults[ bs_id ] = bm_id; } - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - // Shutdown variable argument environment and clean up stack. va_end( args ); +} - // -- End variable argument section -- +// ----------------------------------------------------------------------------- - // Query the context for the addresses of: - // - the l3 small/unpacked ukernel func_t array - // - the l3 small/unpacked ukernel preferences array - func_t* cntx_l3_sup_kers = bli_cntx_l3_sup_kers_buf( cntx ); - mbool_t* cntx_l3_sup_kers_prefs = bli_cntx_l3_sup_kers_prefs_buf( cntx ); +void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... 
) +{ + /* Example prototypes: + + void bli_gks_cntx_set_ind_blkszs + ( + ind_t method != BLIS_NAT, + num_t dt, + cntx_t* cntx, + bszid_t bs0_id, dim_t def_scalr0, dim_t max_scalr0, + bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1, + bszid_t bs2_id, dim_t def_scalr2, dim_t max_scalr2, + ..., + -1 + ); + + NOTE: This function modifies an existing context that is presumed + to have been initialized for native execution. + */ + + // Project the given datatype to the real domain. This will be used later on. + num_t dt_real = bli_dt_proj_to_real( dt ); + + // Return early if called with BLIS_NAT. + if ( method == BLIS_NAT ) return; + + // Save the execution type into the context. + bli_cntx_set_method( method, cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. -#if 0 - dim_t sup_map[ BLIS_NUM_LEVEL3_SUP_UKRS ][2]; - - // Create the small/unpacked ukernel mappings: - // - rv -> rrr 0, rcr 2 - // - rg -> rrc 1, rcc 3 - // - cv -> ccr 6, ccc 7 - // - cg -> crr 4, crc 5 - // - rd -> rrc 1 - // - cd -> crc 5 - // - rc -> rcc 3 - // - cr -> crr 4 - // - gx -> xxx 8 - // NOTE: We only need to set one slot in the context l3_sup_kers array - // for the general-stride/generic ukernel type, but since the loop below - // needs to be set up to set two slots to accommodate the RV, RG, CV, and - // CG, ukernel types, we will just be okay with the GX ukernel being set - // redundantly. (The RD, CD, CR, and RC ukernel types are set redundantly - // for the same reason.) 
- sup_map[ BLIS_GEMMSUP_RV_UKR ][0] = BLIS_RRR; - sup_map[ BLIS_GEMMSUP_RV_UKR ][1] = BLIS_RCR; - sup_map[ BLIS_GEMMSUP_RG_UKR ][0] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_RG_UKR ][1] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_CV_UKR ][0] = BLIS_CCR; - sup_map[ BLIS_GEMMSUP_CV_UKR ][1] = BLIS_CCC; - sup_map[ BLIS_GEMMSUP_CG_UKR ][0] = BLIS_CRR; - sup_map[ BLIS_GEMMSUP_CG_UKR ][1] = BLIS_CRC; - - sup_map[ BLIS_GEMMSUP_RD_UKR ][0] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_RD_UKR ][1] = BLIS_RRC; - sup_map[ BLIS_GEMMSUP_CD_UKR ][0] = BLIS_CRC; - sup_map[ BLIS_GEMMSUP_CD_UKR ][1] = BLIS_CRC; - - sup_map[ BLIS_GEMMSUP_RC_UKR ][0] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_RC_UKR ][1] = BLIS_RCC; - sup_map[ BLIS_GEMMSUP_CR_UKR ][0] = BLIS_CRR; - sup_map[ BLIS_GEMMSUP_CR_UKR ][1] = BLIS_CRR; - - sup_map[ BLIS_GEMMSUP_GX_UKR ][0] = BLIS_XXX; - sup_map[ BLIS_GEMMSUP_GX_UKR ][1] = BLIS_XXX; -#endif - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_ukrs; ++i ) - { - // Read the current stor3_t id, ukernel datatype, ukernel function - // pointer, and ukernel preference. - const stor3_t st3_id = st3_ids[ i ]; - const num_t ukr_dt = ukr_dts[ i ]; - void* ukr_fp = ukr_fps[ i ]; - const bool ukr_pref = ukr_prefs[ i ]; - - // Index to the func_t and mbool_t for the current stor3_t id - // being processed. - func_t* ukrs = &cntx_l3_sup_kers[ st3_id ]; - mbool_t* prefs = &cntx_l3_sup_kers_prefs[ st3_id ]; - - // Store the ukernel function pointer and preference values into - // the stor3_t location in the context. - bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - bli_mbool_set_dt( ukr_pref, ukr_dt, prefs ); - } + // Initialize variable argument environment. + va_list args; + va_start( args, cntx ); - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( st3_ids ); + // Process block sizes until we get a -1. 
+ while ( true ) + { + int bs_id0 = va_arg( args, int ); + if ( bs_id0 == -1 ) + break; - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_dts ); + // Here, we query the variable argument list for: + // - the bszid_t of the blocksize we're about to process (already done), + // - the scalars we wish to apply to the real blocksizes to + // come up with the induced complex blocksizes (for default + // and maximum blocksizes). + bszid_t bs_id = ( bszid_t )bs_id0; + double dsclr = ( double )va_arg( args, double ); + double msclr = ( double )va_arg( args, double ); + + // Query the context for the blksz_t object assoicated with the + // current blocksize id, and also query the object corresponding + // to the blocksize multiple. + blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); + + // Copy the real domain value of the blksz_t object into the + // corresponding complex domain slot of the same object. + bli_blksz_copy_dt( dt_real, cntx_blksz, dt, cntx_blksz ); + + // If the default blocksize scalar is non-unit, we need to scale + // the complex domain default blocksizes. + if ( dsclr != 1.0 ) + { + // Scale the default blocksize value corresponding to the given + // datatype. + bli_blksz_scale_def( 1, ( dim_t )dsclr, dt, cntx_blksz ); + } - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_fps ); + // Similarly, if the maximum blocksize scalar is non-unit, we need + // to scale the complex domain maximum blocksizes. + if ( msclr != 1.0 ) + { + // Scale the maximum blocksize value corresponding to the given + // datatype. + bli_blksz_scale_max( 1, ( dim_t )msclr, dt, cntx_blksz ); + } + } - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_kers(): " ); - #endif - bli_free_intl( ukr_prefs ); + // Shutdown variable argument environment and clean up stack. 
+ va_end( args ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_l1f_kers( dim_t n_kers, ... ) +void bli_cntx_set_ukrs( cntx_t* cntx , ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use - // non-default level-1f kernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default l1f - // kernels across all datatypes. + // non-default microkernels. It should be called after + // bli_cntx_init_defaults() so that the context begins with default + // microkernels across all datatypes. /* Example prototypes: - void bli_cntx_set_l1f_kers + void bli_cntx_set_ukrs ( - dim_t n_ukrs, - l1fkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1fkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1fkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, - ... - cntx_t* cntx + cntx_t* cntx, + ukr_t ukr0_id, num_t dt0, void_fp ukr0_fp, + ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, + ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, + ..., + -1 ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - l1fkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1fkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- + // Query the context for the address of the ukernel func_t array + func_t* cntx_ukrs = bli_cntx_ukrs_buf( cntx ); // Initialize variable argument environment. - va_start( args, n_kers ); + va_list args; + va_start( args, cntx ); - // Process n_kers tuples. 
- for ( i = 0; i < n_kers; ++i ) + // Process ukernels until -1 is reached. + while ( true ) { + const int ukr_id0 = va_arg( args, int ); + + // If we find a ukr ID of -1, then we are done. + if ( ukr_id0 == -1 ) + break; + // Here, we query the variable argument list for: - // - the l1fkr_t of the kernel we're about to process, + // - the ukr_t of the kernel we're about to process (already done), // - the datatype of the kernel, and // - the kernel function pointer // that we need to store to the context. - const l1fkr_t ker_id = ( l1fkr_t )va_arg( args, l1fkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the address of: - // - the level-1f kernels func_t array - func_t* cntx_l1f_kers = bli_cntx_l1f_kers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) - { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. - const l1fkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; + const ukr_t ukr_id = ( ukr_t )ukr_id0; + const num_t ukr_dt = ( num_t )va_arg( args, num_t ); + void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); // Index into the func_t and mbool_t for the current kernel id // being processed. 
- func_t* kers = &cntx_l1f_kers[ ker_id ]; + func_t* ukrs = &cntx_ukrs[ ukr_id ]; - // Store the ukernel function pointer and preference values into - // the context. - bli_func_set_dt( ker_fp, ker_dt, kers ); - } - - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_ids ); + // Store the ukernel function pointer into the context. + // Notice that we redundantly store the native + // ukernel address in both the native and virtual ukernel slots + // in the context. This is standard practice when creating a + // native context. (Induced method contexts will overwrite the + // virtual function pointer with the address of the appropriate + // virtual ukernel.) + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_dts ); + switch ( ukr_id ) + { + case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; + case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; + case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; + case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; + case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; + default: ukrs = NULL; + }; + + if ( ukrs ) + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); + } - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1f_kers(): " ); - #endif - bli_free_intl( ker_fps ); + // Shutdown variable argument environment and clean up stack. + va_end( args ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_l1v_kers( dim_t n_kers, ... ) +void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use - // non-default level-1v kernels. 
It should be called after - // bli_cntx_init_defaults() so that the context begins with default l1v - // kernels across all datatypes. + // non-default microkernel preferences. It should be called after + // bli_cntx_init_defaults() so that the context begins with default + // preferences across all datatypes. /* Example prototypes: - void bli_cntx_set_l1v_kers + void bli_cntx_set_ukr_prefs ( - dim_t n_ukrs, - l1vkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1vkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1vkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, - ... - cntx_t* cntx + cntx_t* cntx, + ukr_pref_t ukr_pref0_id, num_t dt0, bool ukr_pref0, + ukr_pref_t ukr_pref1_id, num_t dt1, bool ukr_pref1, + ukr_pref_t ukr_pref2_id, num_t dt2, bool ukr_pref2, + ..., + -1 ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - l1vkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1vkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); - - // -- Begin variable argument section -- + // Query the context for the address of the ukernel preference mbool_t array + mbool_t* cntx_ukr_prefs = bli_cntx_ukr_prefs_buf( cntx ); // Initialize variable argument environment. - va_start( args, n_kers ); + va_list args; + va_start( args, cntx ); - // Process n_kers tuples. - for ( i = 0; i < n_kers; ++i ) + // Process ukernel preferences until -1 is reached. + while ( true ) { + const int ukr_pref_id0 = va_arg( args, int ); + + // If we find a ukr pref ID of -1, then we are done. 
+ if ( ukr_pref_id0 == -1 ) + break; + // Here, we query the variable argument list for: - // - the l1vkr_t of the kernel we're about to process, + // - the ukr_t of the kernel we're about to process (already done), // - the datatype of the kernel, and // - the kernel function pointer // that we need to store to the context. - const l1vkr_t ker_id = ( l1vkr_t )va_arg( args, l1vkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); - - // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. - va_end( args ); - - // -- End variable argument section -- - - // Query the context for the address of: - // - the level-1v kernels func_t array - func_t* cntx_l1v_kers = bli_cntx_l1v_kers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) - { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. - const l1vkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; + const ukr_pref_t ukr_pref_id = ( ukr_pref_t )ukr_pref_id0; + const bool ukr_pref_dt = ( num_t )va_arg( args, num_t ); + const bool ukr_pref = ( bool )va_arg( args, int ); // Index into the func_t and mbool_t for the current kernel id // being processed. - func_t* kers = &cntx_l1v_kers[ ker_id ]; + mbool_t* ukr_prefs = &cntx_ukr_prefs[ ukr_pref_id ]; - // Store the ukernel function pointer and preference values into - // the context. - bli_func_set_dt( ker_fp, ker_dt, kers ); + // Store the ukernel preference value into the context. 
+ bli_mbool_set_dt( ukr_pref, ukr_pref_dt, ukr_prefs ); } - // Free the temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_dts ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l1v_kers(): " ); - #endif - bli_free_intl( ker_fps ); + // Shutdown variable argument environment and clean up stack. + va_end( args ); } // ----------------------------------------------------------------------------- -void bli_cntx_set_packm_kers( dim_t n_kers, ... ) +void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use - // non-default packing kernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default packm - // kernels across all datatypes. + // non-default level-3 operation handler for small/unpacked matrices. It + // should be called after bli_cntx_init_defaults() so that the context + // begins with default sup handlers across all datatypes. /* Example prototypes: - void bli_cntx_set_packm_kers + void bli_cntx_set_l3_sup_handlers ( - dim_t n_ukrs, - l1mkr_t ker0_id, num_t ker0_dt, void_fp ker0_fp, - l1mkr_t ker1_id, num_t ker1_dt, void_fp ker1_fp, - l1mkr_t ker2_id, num_t ker2_dt, void_fp ker2_fp, + dim_t n_ops, + opid_t op0_id, void* handler0_fp, + opid_t op1_id, void* handler1_fp, + opid_t op2_id, void* handler2_fp, ... cntx_t* cntx ); @@ -1473,41 +369,33 @@ void bli_cntx_set_packm_kers( dim_t n_kers, ... ) // Allocate some temporary local arrays. 
#ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - l1mkr_t* ker_ids = bli_malloc_intl( n_kers * sizeof( l1mkr_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); + printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif - num_t* ker_dts = bli_malloc_intl( n_kers * sizeof( num_t ), &r_val ); + opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val ); #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); + printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif - void_fp* ker_fps = bli_malloc_intl( n_kers * sizeof( void_fp ), &r_val ); + void** op_fps = bli_malloc_intl( n_ops * sizeof( void* ), &r_val ); // -- Begin variable argument section -- // Initialize variable argument environment. - va_start( args, n_kers ); + va_start( args, n_ops ); - // Process n_kers tuples. - for ( i = 0; i < n_kers; ++i ) + // Process n_ukrs tuples. + for ( i = 0; i < n_ops; ++i ) { // Here, we query the variable argument list for: - // - the l1mkr_t of the kernel we're about to process, - // - the datatype of the kernel, and - // - the kernel function pointer + // - the opid_t of the operation we're about to process, + // - the sup handler function pointer // that we need to store to the context. - const l1mkr_t ker_id = ( l1mkr_t )va_arg( args, l1mkr_t ); - const num_t ker_dt = ( num_t )va_arg( args, num_t ); - void_fp ker_fp = ( void_fp )va_arg( args, void_fp ); + const opid_t op_id = ( opid_t )va_arg( args, opid_t ); + void* op_fp = ( void* )va_arg( args, void* ); // Store the values in our temporary arrays. - ker_ids[ i ] = ker_id; - ker_dts[ i ] = ker_dt; - ker_fps[ i ] = ker_fp; + op_ids[ i ] = op_id; + op_fps[ i ] = op_fp; } // The last argument should be the context pointer. @@ -1518,48 +406,36 @@ void bli_cntx_set_packm_kers( dim_t n_kers, ... 
) // -- End variable argument section -- - // Query the context for the address of: - // - the packm kernels func_t array - func_t* cntx_packm_kers = bli_cntx_packm_kers_buf( cntx ); + // Query the context for the addresses of: + // - the l3 small/unpacked handlers array + void** cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); // Now that we have the context address, we want to copy the values // from the temporary buffers into the corresponding buffers in the // context. - // Process each blocksize id tuple provided. - for ( i = 0; i < n_kers; ++i ) + // Process each operation id tuple provided. + for ( i = 0; i < n_ops; ++i ) { - // Read the current kernel id, kernel datatype, and kernel function - // pointer. - const l1mkr_t ker_id = ker_ids[ i ]; - const num_t ker_dt = ker_dts[ i ]; - void_fp ker_fp = ker_fps[ i ]; - - // Index into the func_t and mbool_t for the current kernel id - // being processed. - func_t* kers = &cntx_packm_kers[ ker_id ]; + // Read the current operation id and handler function pointer. + const opid_t op_id = op_ids[ i ]; + void* op_fp = op_fps[ i ]; - // Store the ukernel function pointer and preference values into - // the context. - bli_func_set_dt( ker_fp, ker_dt, kers ); + // Store the sup handler function pointer into the slot for the + // specified operation id. + cntx_l3_sup_handlers[ op_id ] = op_fp; } // Free the temporary local arrays. 
- - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); - #endif - bli_free_intl( ker_ids ); - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); + printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif - bli_free_intl( ker_dts ); + bli_free_intl( op_ids ); #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_packm_kers(): " ); + printf( "bli_cntx_set_l3_sup_handlers(): " ); #endif - bli_free_intl( ker_fps ); + bli_free_intl( op_fps ); } // ----------------------------------------------------------------------------- @@ -1586,11 +462,11 @@ void bli_cntx_print( cntx_t* cntx ) ); } - for ( i = 0; i < BLIS_NUM_LEVEL3_UKRS; ++i ) + for ( i = 0; i < BLIS_NUM_UKRS; ++i ) { - func_t* ukr = bli_cntx_get_l3_vir_ukrs( i, cntx ); + func_t* ukr = bli_cntx_get_ukrs( i, cntx ); - printf( "l3 vir ukr %2lu: %16p %16p %16p %16p\n", + printf( "ukr %2lu: %16p %16p %16p %16p\n", ( unsigned long )i, bli_func_get_dt( BLIS_FLOAT, ukr ), bli_func_get_dt( BLIS_DOUBLE, ukr ), @@ -1599,42 +475,16 @@ void bli_cntx_print( cntx_t* cntx ) ); } - for ( i = 0; i < BLIS_NUM_3OP_RC_COMBOS; ++i ) - { - func_t* ukr = bli_cntx_get_l3_sup_kers( i, cntx ); - - printf( "l3 sup ukr %2lu: %16p %16p %16p %16p\n", - ( unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ukr ), - bli_func_get_dt( BLIS_DOUBLE, ukr ), - bli_func_get_dt( BLIS_SCOMPLEX, ukr ), - bli_func_get_dt( BLIS_DCOMPLEX, ukr ) - ); - } - - for ( i = 0; i < BLIS_NUM_LEVEL1F_KERS; ++i ) - { - func_t* ker = bli_cntx_get_l1f_kers( i, cntx ); - - printf( "l1f ker %2lu: %16p %16p %16p %16p\n", - ( unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ker ), - bli_func_get_dt( BLIS_DOUBLE, ker ), - bli_func_get_dt( BLIS_SCOMPLEX, ker ), - bli_func_get_dt( BLIS_DCOMPLEX, ker ) - ); - } - - for ( i = 0; i < BLIS_NUM_LEVEL1V_KERS; ++i ) + for ( i = 0; i < BLIS_NUM_UKR_PREFS; ++i ) { - func_t* ker = bli_cntx_get_l1v_kers( i, cntx ); + mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx ); - printf( "l1v ker %2lu: 
%16p %16p %16p %16p\n", + printf( "ukr pref %2lu: %d %d %d %d\n", ( unsigned long )i, - bli_func_get_dt( BLIS_FLOAT, ker ), - bli_func_get_dt( BLIS_DOUBLE, ker ), - bli_func_get_dt( BLIS_SCOMPLEX, ker ), - bli_func_get_dt( BLIS_DCOMPLEX, ker ) + bli_mbool_get_dt( BLIS_FLOAT, ukr_pref ), + bli_mbool_get_dt( BLIS_DOUBLE, ukr_pref ), + bli_mbool_get_dt( BLIS_SCOMPLEX, ukr_pref ), + bli_mbool_get_dt( BLIS_DCOMPLEX, ukr_pref ) ); } diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 76350f6bcf..fb75cf9f89 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -43,24 +43,13 @@ /* typedef struct cntx_s { - blksz_t* blkszs; - bszid_t* bmults; + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + bszid_t bmults[ BLIS_NUM_BLKSZS ]; - func_t* l3_vir_ukrs; - func_t* l3_nat_ukrs; - mbool_t* l3_nat_ukrs_prefs; + func_t ukrs[ BLIS_NUM_UKRS ]; + mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; - blksz_t* l3_sup_thresh; - void** l3_sup_handlers; - blksz_t* l3_sup_blkszs; - func_t* l3_sup_kers; - mbool_t* l3_sup_kers_prefs; - - func_t* l1f_kers; - func_t* l1v_kers; - - func_t* packm_kers; - func_t* unpackm_kers; + void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; ind_t method; @@ -81,54 +70,18 @@ BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) { return cntx->bmults; } -BLIS_INLINE func_t* bli_cntx_l3_vir_ukrs_buf( cntx_t* cntx ) -{ - return cntx->l3_vir_ukrs; -} -BLIS_INLINE func_t* bli_cntx_l3_nat_ukrs_buf( cntx_t* cntx ) +BLIS_INLINE func_t* bli_cntx_ukrs_buf( cntx_t* cntx ) { - return cntx->l3_nat_ukrs; + return cntx->ukrs; } -BLIS_INLINE mbool_t* bli_cntx_l3_nat_ukrs_prefs_buf( cntx_t* cntx ) +BLIS_INLINE mbool_t* bli_cntx_ukr_prefs_buf( cntx_t* cntx ) { - return cntx->l3_nat_ukrs_prefs; -} -BLIS_INLINE blksz_t* bli_cntx_l3_sup_thresh_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_thresh; + return cntx->ukr_prefs; } BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } -BLIS_INLINE blksz_t* bli_cntx_l3_sup_blkszs_buf( cntx_t* cntx 
) -{ - return cntx->l3_sup_blkszs; -} -BLIS_INLINE func_t* bli_cntx_l3_sup_kers_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_kers; -} -BLIS_INLINE mbool_t* bli_cntx_l3_sup_kers_prefs_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_kers_prefs; -} -BLIS_INLINE func_t* bli_cntx_l1f_kers_buf( cntx_t* cntx ) -{ - return cntx->l1f_kers; -} -BLIS_INLINE func_t* bli_cntx_l1v_kers_buf( cntx_t* cntx ) -{ - return cntx->l1v_kers; -} -BLIS_INLINE func_t* bli_cntx_packm_kers_buf( cntx_t* cntx ) -{ - return cntx->packm_kers; -} -BLIS_INLINE func_t* bli_cntx_unpackm_kers_buf( cntx_t* cntx ) -{ - return cntx->unpackm_kers; -} BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) { return cntx->method; @@ -204,78 +157,60 @@ BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) // ----------------------------------------------------------------------------- -BLIS_INLINE func_t* bli_cntx_get_l3_vir_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + func_t* funcs = bli_cntx_ukrs_buf( cntx ); func_t* func = &funcs[ ukr_id ]; return func; } -BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - func_t* func = bli_cntx_get_l3_vir_ukrs( ukr_id, cntx ); + func_t* func = bli_cntx_get_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } -BLIS_INLINE func_t* bli_cntx_get_l3_nat_ukrs( l3ukr_t ukr_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - func_t* func = &funcs[ ukr_id ]; - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l3_nat_ukr_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - func_t* func = bli_cntx_get_l3_nat_ukrs( ukr_id, cntx ); + switch ( ukr_id ) + { + case BLIS_GEMM_UKR: ukr_id = BLIS_GEMM_VIR_UKR; break; + 
case BLIS_TRSM_L_UKR: ukr_id = BLIS_TRSM_L_VIR_UKR; break; + case BLIS_TRSM_U_UKR: ukr_id = BLIS_TRSM_U_VIR_UKR; break; + case BLIS_GEMMTRSM_L_UKR: ukr_id = BLIS_GEMMTRSM_L_VIR_UKR; break; + case BLIS_GEMMTRSM_U_UKR: ukr_id = BLIS_GEMMTRSM_U_VIR_UKR; break; + default: break; + }; - return bli_func_get_dt( dt, func ); + return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } // ----------------------------------------------------------------------------- -BLIS_INLINE mbool_t* bli_cntx_get_l3_nat_ukr_prefs( l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t ukr_id, cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); mbool_t* mbool = &mbools[ ukr_id ]; return mbool; } -BLIS_INLINE bool bli_cntx_get_l3_nat_ukr_prefs_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, cntx_t* cntx ) { - mbool_t* mbool = bli_cntx_get_l3_nat_ukr_prefs( ukr_id, cntx ); + mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- -BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_thresh( threshid_t thresh_id, cntx_t* cntx ) -{ - blksz_t* threshs = bli_cntx_l3_sup_thresh_buf( cntx ); - blksz_t* thresh = &threshs[ thresh_id ]; - - // Return the address of the blksz_t identified by thresh_id. - return thresh; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_thresh_dt( num_t dt, threshid_t thresh_id, cntx_t* cntx ) -{ - blksz_t* threshs = bli_cntx_get_l3_sup_thresh( thresh_id, cntx ); - dim_t thresh_dt = bli_blksz_get_def( dt, threshs ); - - // Return the main (default) threshold value for the datatype given. 
- return thresh_dt; -} - BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) { - if ( m < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_MT, cntx ) ) return TRUE; - if ( n < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_NT, cntx ) ) return TRUE; - if ( k < bli_cntx_get_l3_sup_thresh_dt( dt, BLIS_KT, cntx ) ) return TRUE; + if ( m < bli_cntx_get_blksz_def_dt( dt, BLIS_MT, cntx ) ) return TRUE; + if ( n < bli_cntx_get_blksz_def_dt( dt, BLIS_NT, cntx ) ) return TRUE; + if ( k < bli_cntx_get_blksz_def_dt( dt, BLIS_KT, cntx ) ) return TRUE; return FALSE; } @@ -292,311 +227,71 @@ BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) // ----------------------------------------------------------------------------- -BLIS_INLINE blksz_t* bli_cntx_get_l3_sup_blksz( bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blkszs = bli_cntx_l3_sup_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - // Return the address of the blksz_t identified by bs_id. - return blksz; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_def( dt, blksz ); - - // Return the main (default) blocksize value for the datatype given. - return bs_dt; -} - -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) -{ - blksz_t* blksz = bli_cntx_get_l3_sup_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_max( dt, blksz ); - - // Return the auxiliary (maximum) blocksize value for the datatype given. 
- return bs_dt; -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l3_sup_kers( stor3_t stor_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l3_sup_kers_buf( cntx ); - func_t* func = &funcs[ stor_id ]; - - return func; -} - -BLIS_INLINE void* bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l3_sup_kers( stor_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE mbool_t* bli_cntx_get_l3_sup_ker_prefs( stor3_t stor_id, cntx_t* cntx ) -{ - mbool_t* mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - mbool_t* mbool = &mbools[ stor_id ]; - - return mbool; -} - -BLIS_INLINE bool bli_cntx_get_l3_sup_ker_prefs_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - mbool_t* mbool = bli_cntx_get_l3_sup_ker_prefs( stor_id, cntx ); - - return ( bool )bli_mbool_get_dt( dt, mbool ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l1f_kers( l1fkr_t ker_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); - func_t* func = &funcs[ ker_id ]; - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l1f_ker_dt( num_t dt, l1fkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l1f_kers( ker_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_l1v_kers( l1vkr_t ker_id, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); - func_t* func = &funcs[ ker_id ]; - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_l1v_ker_dt( num_t dt, l1vkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = bli_cntx_get_l1v_kers( ker_id, cntx ); - - return bli_func_get_dt( dt, func ); -} - -// 
----------------------------------------------------------------------------- - -BLIS_INLINE func_t* bli_cntx_get_packm_kers( l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = NULL; - - // Only index to the requested packm func_t if the packm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) - { - func_t* funcs = bli_cntx_packm_kers_buf( cntx ); - - func = &funcs[ ker_id ]; - } - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_packm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - void_fp fp = NULL; - - // Only query the context for the packm func_t (and then extract the - // datatype-specific function pointer) if the packm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_PACKM_KERS ) - { - func_t* func = bli_cntx_get_packm_kers( ker_id, cntx ); - - fp = bli_func_get_dt( dt, func ); - } - - return fp; -} - -BLIS_INLINE func_t* bli_cntx_get_unpackm_kers( l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = NULL; - - // Only index to the requested unpackm func_t if the unpackm kernel being - // requested is one that is explicitly supported. - if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) - { - func_t* funcs = bli_cntx_unpackm_kers_buf( cntx ); - - func = &funcs[ ker_id ]; - } - - return func; -} - -BLIS_INLINE void_fp bli_cntx_get_unpackm_ker_dt( num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - void_fp fp = NULL; - - // Only query the context for the unpackm func_t (and then extract the - // datatype-specific function pointer) if the unpackm kernel being - // requested is one that is explicitly supported. 
- if ( 0 <= ( gint_t )ker_id && - ( gint_t )ker_id < BLIS_NUM_UNPACKM_KERS ) - { - func_t* func = bli_cntx_get_unpackm_kers( ker_id, cntx ); - - fp = bli_func_get_dt( dt, func ); - } - - return fp; -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); - - // A ukernel preference of TRUE means the ukernel prefers row storage. - return ( bool ) - ( prefs == TRUE ); -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_nat_ukr_prefs_dt( dt, ukr_id, cntx ); - - // A ukernel preference of FALSE means the ukernel prefers column storage. - return ( bool ) - ( prefs == FALSE ); -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // Note that we use the computation datatype, which may differ from the - // storage datatype of C (when performing a mixed datatype operation). 
- const num_t dt = bli_obj_comp_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); - bool r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; -} - -BLIS_INLINE bool bli_cntx_l3_nat_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - return ( bool ) - !bli_cntx_l3_nat_ukr_prefers_storage_of( obj, ukr_id, cntx ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_rows_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // For induced methods, return the ukernel storage preferences of the - // corresponding real micro-kernel. - // NOTE: This projection to real domain becomes unnecessary if you - // set the exec_dt for 1m to the real projection of the storage - // datatype. - if ( bli_cntx_method( cntx ) != BLIS_NAT ) - dt = bli_dt_proj_to_real( dt ); - - return bli_cntx_l3_nat_ukr_prefers_rows_dt( dt, ukr_id, cntx ); -} - -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_cols_dt( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // For induced methods, return the ukernel storage preferences of the - // corresponding real micro-kernel. - // NOTE: This projection to real domain becomes unnecessary if you - // set the exec_dt for 1m to the real projection of the storage - // datatype. - if ( bli_cntx_method( cntx ) != BLIS_NAT ) - dt = bli_dt_proj_to_real( dt ); - - return bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, ukr_id, cntx ); -} - -BLIS_INLINE bool bli_cntx_l3_vir_ukr_prefers_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - // Note that we use the computation datatype, which may differ from the - // storage datatype of C (when performing a mixed datatype operation). 
- const num_t dt = bli_obj_comp_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, ukr_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, ukr_id, cntx ); - bool r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; - - return r_val; -} - -BLIS_INLINE bool bli_cntx_l3_vir_ukr_dislikes_storage_of( obj_t* obj, l3ukr_t ukr_id, cntx_t* cntx ) -{ - return ( bool ) - !bli_cntx_l3_vir_ukr_prefers_storage_of( obj, ukr_id, cntx ); -} - -// ----------------------------------------------------------------------------- - -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_rows_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); - - // A ukernel preference of TRUE means the ukernel prefers row storage. - return ( bool ) - ( prefs == TRUE ); -} - -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_cols_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) -{ - const bool prefs = bli_cntx_get_l3_sup_ker_prefs_dt( dt, stor_id, cntx ); - - // A ukernel preference of FALSE means the ukernel prefers column storage. - return ( bool ) - ( prefs == FALSE ); -} - -#if 0 -// NOTE: These static functions aren't needed yet. - -BLIS_INLINE bool bli_cntx_l3_sup_ker_prefers_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) -{ - const num_t dt = bli_obj_dt( obj ); - const bool ukr_prefers_rows - = bli_cntx_l3_sup_ker_prefers_rows_dt( dt, stor_id, cntx ); - const bool ukr_prefers_cols - = bli_cntx_l3_sup_ker_prefers_cols_dt( dt, stor_id, cntx ); - bool r_val = FALSE; - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) r_val = TRUE; - else if ( bli_obj_is_col_stored( obj ) && ukr_prefers_cols ) r_val = TRUE; +BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +{ + // Get the correct preference from the kernel ID. 
+ ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; + switch ( ukr_id ) + { + case BLIS_GEMM_VIR_UKR: // fallthrough + case BLIS_GEMM_UKR: ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; break; + case BLIS_TRSM_L_VIR_UKR: // fallthrough + case BLIS_TRSM_L_UKR: ukr_pref_id = BLIS_TRSM_L_UKR_ROW_PREF; break; + case BLIS_TRSM_U_VIR_UKR: // fallthrough + case BLIS_TRSM_U_UKR: ukr_pref_id = BLIS_TRSM_U_UKR_ROW_PREF; break; + case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_L_UKR: ukr_pref_id = BLIS_GEMMTRSM_L_UKR_ROW_PREF; break; + case BLIS_GEMMTRSM_U_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_U_UKR: ukr_pref_id = BLIS_GEMMTRSM_U_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RRR_UKR: ukr_pref_id = BLIS_GEMMSUP_RRR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RRC_UKR: ukr_pref_id = BLIS_GEMMSUP_RRC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RCR_UKR: ukr_pref_id = BLIS_GEMMSUP_RCR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RCC_UKR: ukr_pref_id = BLIS_GEMMSUP_RCC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CRR_UKR: ukr_pref_id = BLIS_GEMMSUP_CRR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CRC_UKR: ukr_pref_id = BLIS_GEMMSUP_CRC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CCR_UKR: ukr_pref_id = BLIS_GEMMSUP_CCR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CCC_UKR: ukr_pref_id = BLIS_GEMMSUP_CCC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_XXX_UKR: ukr_pref_id = BLIS_GEMMSUP_XXX_UKR_ROW_PREF; break; + default: break; // TODO: should be an error condition + } + + // For virtual ukrs and non-native execution, use the real projection of the datatype. 
+ if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + switch ( ukr_id ) + { + case BLIS_GEMM_VIR_UKR: // fallthrough + case BLIS_TRSM_L_VIR_UKR: // fallthrough + case BLIS_TRSM_U_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_U_VIR_UKR: dt = bli_dt_proj_to_real( dt ); break; + default: break; + } + } + + return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx ); +} + +BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +{ + return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx ); +} + +BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) +{ + const bool ukr_prefers_rows + = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); + + if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) return TRUE; + else if ( bli_obj_is_col_stored( obj ) && !ukr_prefers_rows ) return TRUE; - return r_val; + return FALSE; } -BLIS_INLINE bool bli_cntx_l3_sup_ker_dislikes_storage_of( obj_t* obj, stor3_t stor_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) { - return ( bool ) - !bli_cntx_l3_sup_ker_prefers_storage_of( obj, stor_id, cntx ); + return ! 
bli_cntx_prefers_storage_of( obj, ukr_id, cntx ); } -#endif // ----------------------------------------------------------------------------- @@ -632,67 +327,64 @@ BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, c bli_blksz_set_max( bs, dt, blksz ); } -BLIS_INLINE void bli_cntx_set_l3_vir_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, func_t* func, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + func_t* funcs = bli_cntx_ukrs_buf( cntx ); funcs[ ukr_id ] = *func; } -BLIS_INLINE void bli_cntx_set_l3_nat_ukr( l3ukr_t ukr_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); + func_t* func = bli_cntx_get_ukrs( ker_id, cntx ); - funcs[ ukr_id ] = *func; + bli_func_set_dt( fp, dt, func ); } -BLIS_INLINE void bli_cntx_set_l3_nat_ukr_prefs( l3ukr_t ukr_id, mbool_t* prefs, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); mbools[ ukr_id ] = *prefs; } -BLIS_INLINE void bli_cntx_set_l1f_ker( l1fkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1f_kers_buf( cntx ); - - funcs[ ker_id ] = *func; -} - -BLIS_INLINE void bli_cntx_set_l1v_ker( l1vkr_t ker_id, func_t* func, cntx_t* cntx ) -{ - func_t* funcs = bli_cntx_l1v_kers_buf( cntx ); - - funcs[ ker_id ] = *func; -} - -BLIS_INLINE void bli_cntx_set_packm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_get_packm_kers( ker_id, cntx ); + ukr_t ukr_id = bli_stor3_ukr( stor_id ); - funcs[ ker_id ] = *func; + return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } -BLIS_INLINE void 
bli_cntx_set_packm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { - func_t* func = ( func_t* )bli_cntx_get_packm_kers( ker_id, cntx ); + switch ( bs_id ) + { + case BLIS_MR: bs_id = BLIS_MR_SUP; break; + case BLIS_NR: bs_id = BLIS_NR_SUP; break; + case BLIS_KR: bs_id = BLIS_KR_SUP; break; + case BLIS_MC: bs_id = BLIS_MC_SUP; break; + case BLIS_NC: bs_id = BLIS_NC_SUP; break; + case BLIS_KC: bs_id = BLIS_KC_SUP; break; + default: break; + }; - bli_func_set_dt( fp, dt, func ); + return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ); } -BLIS_INLINE void bli_cntx_set_unpackm_ker( l1mkr_t ker_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { - func_t* funcs = bli_cntx_get_unpackm_kers( ker_id, cntx ); + switch ( bs_id ) + { + case BLIS_MR: bs_id = BLIS_MR_SUP; break; + case BLIS_NR: bs_id = BLIS_NR_SUP; break; + case BLIS_KR: bs_id = BLIS_KR_SUP; break; + case BLIS_MC: bs_id = BLIS_MC_SUP; break; + case BLIS_NC: bs_id = BLIS_NC_SUP; break; + case BLIS_KC: bs_id = BLIS_KC_SUP; break; + default: break; + }; - funcs[ ker_id ] = *func; -} - -BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_id, cntx_t* cntx ) -{ - func_t* func = ( func_t* )bli_cntx_get_unpackm_kers( ker_id, cntx ); - - bli_func_set_dt( fp, dt, func ); + return bli_cntx_get_blksz_max_dt( dt, bs_id, cntx ); } // ----------------------------------------------------------------------------- @@ -701,24 +393,17 @@ BLIS_INLINE void bli_cntx_set_unpackm_ker_dt( void_fp fp, num_t dt, l1mkr_t ker_ BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); -BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( ind_t method, dim_t n_bs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, dim_t n_bs, ... ); - -BLIS_EXPORT_BLIS void bli_cntx_set_l3_nat_ukrs( dim_t n_ukrs, ... 
); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_vir_ukrs( dim_t n_ukrs, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_blkszs( cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_thresh( dim_t n_thresh, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_blkszs( dim_t n_bs, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_kers( dim_t n_ukrs, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l1f_kers( dim_t n_kers, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_l1v_kers( dim_t n_kers, ... ); -BLIS_EXPORT_BLIS void bli_cntx_set_packm_kers( dim_t n_kers, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_ukrs( cntx_t* cntx, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... ); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); +BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); + #endif diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index cc17b33ffb..1c3f49bc4e 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -185,7 +185,7 @@ void bli_gks_init( void ) bli_gks_register_cntx( BLIS_ARCH_POWER10, bli_cntx_init_power10, bli_cntx_init_power10_ref, bli_cntx_init_power10_ind ); -#endif +#endif #ifdef BLIS_CONFIG_POWER9 bli_gks_register_cntx( BLIS_ARCH_POWER9, bli_cntx_init_power9, bli_cntx_init_power9_ref, @@ -267,7 +267,7 @@ void bli_gks_finalize( void ) void bli_gks_init_index( void ) { // This function is called by bli_gks_init(). It simply initializes all - // architecture id elements of the internal arrays to NULL. + // architecture id elements of the internal arrays to NULL. const size_t gks_size = sizeof( cntx_t* ) * BLIS_NUM_ARCHS; const size_t fpa_size = sizeof( void_fp ) * BLIS_NUM_ARCHS; @@ -382,7 +382,7 @@ void bli_gks_register_cntx // functions for reference kernels and induced method execution. 
The // former will be used whenever we need to obtain reference kernels and // latter will be used later on if the user calls a level-3 function - // with induced execution enabled. + // with induced execution enabled. cntx_ref_init[ id ] = ref_fp; cntx_ind_init[ id ] = ind_fp; @@ -582,7 +582,7 @@ cntx_t* bli_gks_query_ind_cntx // function on the newly allocated structure, we must first copy // over the contents of the native context. *gks_id_ind = *gks_id_nat; - + // Use the architecture id to look up the function pointer to the // context initialization function for induced methods. ind_cntx_init_ft f = cntx_ind_init[ id ]; @@ -635,7 +635,7 @@ void bli_gks_init_ref_cntx bool bli_gks_cntx_l3_nat_ukr_is_ref ( num_t dt, - l3ukr_t ukr_id, + ukr_t ukr_id, cntx_t* cntx ) { @@ -647,8 +647,8 @@ bool bli_gks_cntx_l3_nat_ukr_is_ref // Query each context for the micro-kernel function pointer for the // specified datatype. - void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, &ref_cntx ); - void_fp fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr_id, cntx ); + void_fp ref_fp = bli_cntx_get_ukr_dt( dt, ukr_id, &ref_cntx ); + void_fp fp = bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); // Return the result. return fp == ref_fp; @@ -668,7 +668,7 @@ static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = // ----------------------------------------------------------------------------- -char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) +char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) { kimpl_t ki; @@ -676,7 +676,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) // then query the ukernel function pointer for the given datatype from // that context. cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); - void_fp fp = bli_cntx_get_l3_vir_ukr_dt( dt, ukr, cntx ); + void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given // datatype. 
If it is NULL, return the string for not applicable. @@ -691,7 +691,7 @@ char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ) } #if 0 -char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ) +char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ) { opid_t oper; ind_t method; @@ -716,7 +716,7 @@ char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ) } #endif -kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) +kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ) { // If the current available induced method is not native, it // must be virtual. @@ -731,8 +731,6 @@ kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) // method to the typed function pointer within the known // reference ukrs object. - cntx_t ref_cntx_l; - // Query the architecture id. arch_t id = bli_arch_query_id(); @@ -743,23 +741,13 @@ kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ) bli_check_error_code( e_val ); } - // Obtain the function pointer to the context initialization function - // for reference kernels. - ref_cntx_init_ft f = cntx_ref_init[ id ]; - - // Initialize a local context with reference kernels and related values. - f( &ref_cntx_l ); - // Query the native context from the gks. cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); - // Query the native ukernel func_t from both the native and reference - // contexts. 
- void_fp nat_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, nat_cntx ); - void_fp ref_fp = bli_cntx_get_l3_nat_ukr_dt( dt, ukr, &ref_cntx_l ); - - if ( nat_fp == ref_fp ) return BLIS_REFERENCE_UKERNEL; - else return BLIS_OPTIMIZED_UKERNEL; + if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) ) + return BLIS_REFERENCE_UKERNEL; + else + return BLIS_OPTIMIZED_UKERNEL; } } diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index 188dcd5075..93c9c1412a 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -54,12 +54,12 @@ BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); -bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, l3ukr_t ukr_id, cntx_t* cntx ); +bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, cntx_t* cntx ); -BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( l3ukr_t ukr, ind_t method, num_t dt ); -BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( l3ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ); -//char* bli_gks_l3_ukr_avail_impl_string( l3ukr_t ukr, num_t dt ); +//char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ); #endif diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 286e79e2b7..5aaf2771d1 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -754,7 +754,7 @@ BLIS_INLINE void bli_prune_unstored_region_bottom_u( doff_t* diagoff, dim_t* m, *offm_inc = 0; // If the diagonal intersects the right side of the matrix, - // ignore the area below that intersection. + // ignore the area below that intersection. 
if ( *m > -(*diagoff) + *n ) { *m = -(*diagoff) + *n; @@ -858,6 +858,22 @@ BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) #endif } +BLIS_INLINE ukr_t bli_stor3_ukr( stor3_t id ) +{ + switch ( id ) + { + case BLIS_RRR: return BLIS_GEMMSUP_RRR_UKR; + case BLIS_RRC: return BLIS_GEMMSUP_RRC_UKR; + case BLIS_RCR: return BLIS_GEMMSUP_RCR_UKR; + case BLIS_RCC: return BLIS_GEMMSUP_RCC_UKR; + case BLIS_CRR: return BLIS_GEMMSUP_CRR_UKR; + case BLIS_CRC: return BLIS_GEMMSUP_CRC_UKR; + case BLIS_CCR: return BLIS_GEMMSUP_CCR_UKR; + case BLIS_CCC: return BLIS_GEMMSUP_CCC_UKR; + default: return BLIS_GEMMSUP_XXX_UKR; + } +} + BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) { #if 0 diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index c66505bde8..a45c59d47a 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -626,7 +626,8 @@ typedef enum typedef enum { - BLIS_ADDV_KER = 0, + // l1v kernels + BLIS_ADDV_KER, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, BLIS_AXPYV_KER, @@ -639,108 +640,136 @@ typedef enum BLIS_SETV_KER, BLIS_SUBV_KER, BLIS_SWAPV_KER, - BLIS_XPBYV_KER -} l1vkr_t; - -#define BLIS_NUM_LEVEL1V_KERS 14 - - -typedef enum -{ - BLIS_AXPY2V_KER = 0, + BLIS_XPBYV_KER, + BLIS_AXPY2V_KER, BLIS_DOTAXPYV_KER, + + // l1f kernels BLIS_AXPYF_KER, BLIS_DOTXF_KER, - BLIS_DOTXAXPYF_KER -} l1fkr_t; - -#define BLIS_NUM_LEVEL1F_KERS 5 - - -typedef enum -{ - BLIS_PACKM_0XK_KER = 0, - BLIS_PACKM_1XK_KER = 1, - BLIS_PACKM_2XK_KER = 2, - BLIS_PACKM_3XK_KER = 3, - BLIS_PACKM_4XK_KER = 4, - BLIS_PACKM_5XK_KER = 5, - BLIS_PACKM_6XK_KER = 6, - BLIS_PACKM_7XK_KER = 7, - BLIS_PACKM_8XK_KER = 8, - BLIS_PACKM_9XK_KER = 9, - BLIS_PACKM_10XK_KER = 10, - BLIS_PACKM_11XK_KER = 11, - BLIS_PACKM_12XK_KER = 12, - BLIS_PACKM_13XK_KER = 13, - BLIS_PACKM_14XK_KER = 14, - BLIS_PACKM_15XK_KER = 15, - BLIS_PACKM_16XK_KER = 16, - BLIS_PACKM_17XK_KER = 17, - BLIS_PACKM_18XK_KER = 18, - BLIS_PACKM_19XK_KER = 19, - BLIS_PACKM_20XK_KER = 20, - BLIS_PACKM_21XK_KER = 21, 
- BLIS_PACKM_22XK_KER = 22, - BLIS_PACKM_23XK_KER = 23, - BLIS_PACKM_24XK_KER = 24, - BLIS_PACKM_25XK_KER = 25, - BLIS_PACKM_26XK_KER = 26, - BLIS_PACKM_27XK_KER = 27, - BLIS_PACKM_28XK_KER = 28, - BLIS_PACKM_29XK_KER = 29, - BLIS_PACKM_30XK_KER = 30, - BLIS_PACKM_31XK_KER = 31, - - BLIS_UNPACKM_0XK_KER = 0, - BLIS_UNPACKM_1XK_KER = 1, - BLIS_UNPACKM_2XK_KER = 2, - BLIS_UNPACKM_3XK_KER = 3, - BLIS_UNPACKM_4XK_KER = 4, - BLIS_UNPACKM_5XK_KER = 5, - BLIS_UNPACKM_6XK_KER = 6, - BLIS_UNPACKM_7XK_KER = 7, - BLIS_UNPACKM_8XK_KER = 8, - BLIS_UNPACKM_9XK_KER = 9, - BLIS_UNPACKM_10XK_KER = 10, - BLIS_UNPACKM_11XK_KER = 11, - BLIS_UNPACKM_12XK_KER = 12, - BLIS_UNPACKM_13XK_KER = 13, - BLIS_UNPACKM_14XK_KER = 14, - BLIS_UNPACKM_15XK_KER = 15, - BLIS_UNPACKM_16XK_KER = 16, - BLIS_UNPACKM_17XK_KER = 17, - BLIS_UNPACKM_18XK_KER = 18, - BLIS_UNPACKM_19XK_KER = 19, - BLIS_UNPACKM_20XK_KER = 20, - BLIS_UNPACKM_21XK_KER = 21, - BLIS_UNPACKM_22XK_KER = 22, - BLIS_UNPACKM_23XK_KER = 23, - BLIS_UNPACKM_24XK_KER = 24, - BLIS_UNPACKM_25XK_KER = 25, - BLIS_UNPACKM_26XK_KER = 26, - BLIS_UNPACKM_27XK_KER = 27, - BLIS_UNPACKM_28XK_KER = 28, - BLIS_UNPACKM_29XK_KER = 29, - BLIS_UNPACKM_30XK_KER = 30, - BLIS_UNPACKM_31XK_KER = 31 - -} l1mkr_t; - -#define BLIS_NUM_PACKM_KERS 32 -#define BLIS_NUM_UNPACKM_KERS 32 - - -typedef enum -{ - BLIS_GEMM_UKR = 0, + BLIS_DOTXAXPYF_KER, + + // pack kernels + BLIS_PACKM_0XK_KER, + BLIS_PACKM_1XK_KER, + BLIS_PACKM_2XK_KER, + BLIS_PACKM_3XK_KER, + BLIS_PACKM_4XK_KER, + BLIS_PACKM_5XK_KER, + BLIS_PACKM_6XK_KER, + BLIS_PACKM_7XK_KER, + BLIS_PACKM_8XK_KER, + BLIS_PACKM_9XK_KER, + BLIS_PACKM_10XK_KER, + BLIS_PACKM_11XK_KER, + BLIS_PACKM_12XK_KER, + BLIS_PACKM_13XK_KER, + BLIS_PACKM_14XK_KER, + BLIS_PACKM_15XK_KER, + BLIS_PACKM_16XK_KER, + BLIS_PACKM_17XK_KER, + BLIS_PACKM_18XK_KER, + BLIS_PACKM_19XK_KER, + BLIS_PACKM_20XK_KER, + BLIS_PACKM_21XK_KER, + BLIS_PACKM_22XK_KER, + BLIS_PACKM_23XK_KER, + BLIS_PACKM_24XK_KER, + BLIS_PACKM_25XK_KER, + BLIS_PACKM_26XK_KER, + 
BLIS_PACKM_27XK_KER, + BLIS_PACKM_28XK_KER, + BLIS_PACKM_29XK_KER, + BLIS_PACKM_30XK_KER, + BLIS_PACKM_31XK_KER, + + // unpack kernels + BLIS_UNPACKM_0XK_KER, + BLIS_UNPACKM_1XK_KER, + BLIS_UNPACKM_2XK_KER, + BLIS_UNPACKM_3XK_KER, + BLIS_UNPACKM_4XK_KER, + BLIS_UNPACKM_5XK_KER, + BLIS_UNPACKM_6XK_KER, + BLIS_UNPACKM_7XK_KER, + BLIS_UNPACKM_8XK_KER, + BLIS_UNPACKM_9XK_KER, + BLIS_UNPACKM_10XK_KER, + BLIS_UNPACKM_11XK_KER, + BLIS_UNPACKM_12XK_KER, + BLIS_UNPACKM_13XK_KER, + BLIS_UNPACKM_14XK_KER, + BLIS_UNPACKM_15XK_KER, + BLIS_UNPACKM_16XK_KER, + BLIS_UNPACKM_17XK_KER, + BLIS_UNPACKM_18XK_KER, + BLIS_UNPACKM_19XK_KER, + BLIS_UNPACKM_20XK_KER, + BLIS_UNPACKM_21XK_KER, + BLIS_UNPACKM_22XK_KER, + BLIS_UNPACKM_23XK_KER, + BLIS_UNPACKM_24XK_KER, + BLIS_UNPACKM_25XK_KER, + BLIS_UNPACKM_26XK_KER, + BLIS_UNPACKM_27XK_KER, + BLIS_UNPACKM_28XK_KER, + BLIS_UNPACKM_29XK_KER, + BLIS_UNPACKM_30XK_KER, + BLIS_UNPACKM_31XK_KER, + + // l3 nat kernels + BLIS_GEMM_UKR, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, - BLIS_TRSM_U_UKR -} l3ukr_t; + BLIS_TRSM_U_UKR, + + // l3 virt kernels + BLIS_GEMM_VIR_UKR, + BLIS_GEMMTRSM_L_VIR_UKR, + BLIS_GEMMTRSM_U_VIR_UKR, + BLIS_TRSM_L_VIR_UKR, + BLIS_TRSM_U_VIR_UKR, + + // gemmsup kernels + BLIS_GEMMSUP_RRR_UKR, + BLIS_GEMMSUP_RRC_UKR, + BLIS_GEMMSUP_RCR_UKR, + BLIS_GEMMSUP_RCC_UKR, + BLIS_GEMMSUP_CRR_UKR, + BLIS_GEMMSUP_CRC_UKR, + BLIS_GEMMSUP_CCR_UKR, + BLIS_GEMMSUP_CCC_UKR, + BLIS_GEMMSUP_XXX_UKR, + + // BLIS_NUM_UKRS must be last! 
+ BLIS_NUM_UKRS +} ukr_t; -#define BLIS_NUM_LEVEL3_UKRS 5 + +typedef enum +{ + // l3 kernel row preferences + BLIS_GEMM_UKR_ROW_PREF, + BLIS_GEMMTRSM_L_UKR_ROW_PREF, + BLIS_GEMMTRSM_U_UKR_ROW_PREF, + BLIS_TRSM_L_UKR_ROW_PREF, + BLIS_TRSM_U_UKR_ROW_PREF, + + // gemmsup kernel row preferences + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, + BLIS_GEMMSUP_XXX_UKR_ROW_PREF, + + // BLIS_NUM_UKR_PREFS must be last! + BLIS_NUM_UKR_PREFS +} ukr_pref_t; typedef enum @@ -884,39 +913,40 @@ typedef enum // NOTE: the level-3 blocksizes MUST be indexed starting at zero. // At one point, we made this assumption in bli_cntx_set_blkszs() // and friends. - - BLIS_KR = 0, + BLIS_KR, BLIS_MR, BLIS_NR, BLIS_MC, BLIS_KC, BLIS_NC, + // level-2 blocksizes BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension + // level-1f blocksizes BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - BLIS_NO_PART // used as a placeholder when blocksizes are not applicable. + // gemmsup thresholds + BLIS_MT, // level-3 small/unpacked matrix threshold in m dimension + BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension + BLIS_KT, // level-3 small/unpacked matrix threshold in k dimension + + // gemmsup block sizes + BLIS_KR_SUP, + BLIS_MR_SUP, + BLIS_NR_SUP, + BLIS_MC_SUP, + BLIS_KC_SUP, + BLIS_NC_SUP, + + // BLIS_NO_PART (= BLIS_NUM_BLKSZS) must be last! + BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. 
+ BLIS_NUM_BLKSZS = BLIS_NO_PART } bszid_t; -#define BLIS_NUM_BLKSZS 11 - - -// -- Threshold ID type -- - -typedef enum -{ - BLIS_MT = 0, // level-3 small/unpacked matrix threshold in m dimension - BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension - BLIS_KT // level-3 small/unpacked matrix threshold in k dimension - -} threshid_t; - -#define BLIS_NUM_THRESH 3 - // -- Architecture ID type -- @@ -1430,21 +1460,10 @@ typedef struct cntx_s blksz_t blkszs[ BLIS_NUM_BLKSZS ]; bszid_t bmults[ BLIS_NUM_BLKSZS ]; - func_t l3_vir_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - func_t l3_nat_ukrs[ BLIS_NUM_LEVEL3_UKRS ]; - mbool_t l3_nat_ukrs_prefs[ BLIS_NUM_LEVEL3_UKRS ]; + func_t ukrs[ BLIS_NUM_UKRS ]; + mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; - blksz_t l3_sup_thresh[ BLIS_NUM_THRESH ]; void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; - blksz_t l3_sup_blkszs[ BLIS_NUM_BLKSZS ]; - func_t l3_sup_kers[ BLIS_NUM_3OP_RC_COMBOS ]; - mbool_t l3_sup_kers_prefs[ BLIS_NUM_3OP_RC_COMBOS ]; - - func_t l1f_kers[ BLIS_NUM_LEVEL1F_KERS ]; - func_t l1v_kers[ BLIS_NUM_LEVEL1V_KERS ]; - - func_t packm_kers[ BLIS_NUM_PACKM_KERS ]; - func_t unpackm_kers[ BLIS_NUM_UNPACKM_KERS ]; ind_t method; @@ -1577,6 +1596,7 @@ typedef enum // Architecture-related errors BLIS_INVALID_ARCH_ID = (-150), BLIS_UNINITIALIZED_GKS_CNTX = (-151), + BLIS_INVALID_UKR_ID = (-152), // Blocksize-related errors BLIS_MC_DEF_NONMULTIPLE_OF_MR = (-160), diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c index 53904b6452..2dd7c73244 100644 --- a/kernels/penryn/1/bli_axpyv_penryn_int.c +++ b/kernels/penryn/1/bli_axpyv_penryn_int.c @@ -102,7 +102,7 @@ void bli_daxpyv_penryn_int // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); f ( diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c b/kernels/penryn/1/bli_dotv_penryn_int.c index 4d39b3641d..2e88a577a9 100644 --- a/kernels/penryn/1/bli_dotv_penryn_int.c +++ b/kernels/penryn/1/bli_dotv_penryn_int.c @@ -104,7 +104,7 @@ void bli_ddotv_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - ddotv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx ); + ddotv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTV_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c index 5e8a2a9a1f..c809ebb41c 100644 --- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c +++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c @@ -110,7 +110,7 @@ void bli_daxpy2v_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - daxpy2v_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx ); + daxpy2v_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPY2V_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c index 66bb88ec6f..ce4c4f786f 100644 --- a/kernels/penryn/1f/bli_axpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c @@ -115,7 +115,7 @@ void bli_daxpyf_penryn_int // Call the reference implementation if needed. 
if ( use_ref == TRUE ) { - daxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); + daxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYF_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c index 7602a7f282..6b9dab7739 100644 --- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c +++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c @@ -112,7 +112,7 @@ void bli_ddotaxpyv_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - ddotaxpyv_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx ); + ddotaxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTAXPYV_KER, cntx ); f ( diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c index 2deb4a4574..fe102d427b 100644 --- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c @@ -104,7 +104,7 @@ void bli_ddotxaxpyf_penryn_int // If the vector lengths are zero, scale y by beta and return. if ( bli_zero_dim1( m ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( @@ -149,7 +149,7 @@ void bli_ddotxaxpyf_penryn_int if ( use_ref == TRUE ) { - ddotxaxpyf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx ); + ddotxaxpyf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXAXPYF_KER, cntx ); f ( conjat, diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c index ad9dc5fbd1..ac9887d59e 100644 --- a/kernels/penryn/1f/bli_dotxf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c @@ -90,7 +90,7 @@ void bli_ddotxf_penryn_int // If the vector lengths are zero, scale r by beta and return. 
if ( bli_zero_dim1( m ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); f ( @@ -134,7 +134,7 @@ void bli_ddotxf_penryn_int // Call the reference implementation if needed. if ( use_ref == TRUE ) { - ddotxf_ker_ft f = bli_cntx_get_l1f_ker_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx ); + ddotxf_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXF_KER, cntx ); f ( conjat, diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c index 9f76e88e18..fb17dd4b38 100644 --- a/kernels/zen/1/bli_scalv_zen_int.c +++ b/kernels/zen/1/bli_scalv_zen_int.c @@ -83,7 +83,7 @@ void bli_sscalv_zen_int if ( PASTEMAC(s,eq0)( *alpha ) ) { float* zero = bli_s0; - ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); + ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( @@ -182,7 +182,7 @@ void bli_dscalv_zen_int if ( PASTEMAC(d,eq0)( *alpha ) ) { double* zero = bli_d0; - dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); + dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f ( diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index c4096cbbcb..d536ed7c02 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -91,7 +91,7 @@ void bli_sscalv_zen_int10 cntx ); #else - ssetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); + ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( BLIS_NO_CONJUGATE, @@ -291,7 +291,7 @@ void bli_dscalv_zen_int10 cntx ); #else - dsetv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); + dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f ( diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c index b958600ce6..15fdf46514 100644 --- 
a/kernels/zen/1f/bli_axpyf_zen_int_8.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c @@ -104,7 +104,7 @@ void bli_saxpyf_zen_int_8 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -313,7 +313,7 @@ void bli_daxpyf_zen_int_8 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c index e40c785d85..1f4a671b65 100644 --- a/kernels/zen/1f/bli_dotxf_zen_int_8.c +++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c @@ -78,8 +78,8 @@ void bli_sdotxf_zen_int_8 // simplifies to updating y. if ( bli_zero_dim1( m ) || PASTEMAC(s,eq0)( *alpha ) ) { - sscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx ); - + sscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SCALV_KER, cntx ); + f ( BLIS_NO_CONJUGATE, @@ -95,7 +95,7 @@ void bli_sdotxf_zen_int_8 // operation as a loop over dotxv. if ( b_n != fuse_fac ) { - sdotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx ); + sdotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_DOTXV_KER, cntx ); for ( dim_t i = 0; i < b_n; ++i ) { @@ -468,8 +468,8 @@ void bli_ddotxf_zen_int_8 // simplifies to updating y. if ( bli_zero_dim1( m ) || PASTEMAC(d,eq0)( *alpha ) ) { - dscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); - + dscalv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SCALV_KER, cntx ); + f ( BLIS_NO_CONJUGATE, @@ -485,7 +485,7 @@ void bli_ddotxf_zen_int_8 // operation as a loop over dotxv. 
if ( b_n != fuse_fac ) { - ddotxv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx ); + ddotxv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_DOTXV_KER, cntx ); for ( dim_t i = 0; i < b_n; ++i ) { diff --git a/kernels/zen2/1f/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/bli_axpyf_zen_int_5.c index 5a919b622b..f8b04d52d6 100644 --- a/kernels/zen2/1f/bli_axpyf_zen_int_5.c +++ b/kernels/zen2/1f/bli_axpyf_zen_int_5.c @@ -124,7 +124,7 @@ void bli_saxpyf_zen_int_5 } #else - saxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -264,7 +264,7 @@ void bli_saxpyf_zen_int_5 a3 += n_elem_per_reg; a4 += n_elem_per_reg; } - + // If there are leftover iterations, perform them with scalar code. for ( ; (i + 0) < m ; ++i ) { @@ -316,7 +316,7 @@ void bli_saxpyf_zen_int_5 a1 += inca; a2 += inca; a3 += inca; - a4 += inca; + a4 += inca; y0 += incy; } @@ -398,7 +398,7 @@ void bli_daxpyf_zen_int_5 } #else - daxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { @@ -538,7 +538,7 @@ void bli_daxpyf_zen_int_5 a3 += n_elem_per_reg; a4 += n_elem_per_reg; } - + // If there are leftover iterations, perform them with scalar code. for ( ; (i + 0) < m ; ++i ) { @@ -590,7 +590,7 @@ void bli_daxpyf_zen_int_5 a1 += inca; a2 += inca; a3 += inca; - a4 += inca; + a4 += inca; y0 += incy; } diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c index 2e648bbd6a..2da4bc9280 100644 --- a/ref_kernels/1/bli_axpbyv_ref.c +++ b/ref_kernels/1/bli_axpbyv_ref.c @@ -60,7 +60,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ @@ -83,7 +83,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); \ + PASTECH(ch,scalv_ker_ft) scalv_p = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); \ \ scalv_p \ ( \ @@ -105,7 +105,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ @@ -123,7 +123,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ @@ -141,7 +141,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_XPBYV_KER, cntx ); \ + PASTECH(ch,xpbyv_ker_ft) xpbyv_p = bli_cntx_get_ukr_dt( dt, BLIS_XPBYV_KER, cntx ); \ \ xpbyv_p \ ( \ @@ -163,7 +163,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCAL2V_KER, cntx ); \ + PASTECH(ch,scal2v_ker_ft) scal2v_p = bli_cntx_get_ukr_dt( dt, BLIS_SCAL2V_KER, cntx ); \ \ scal2v_p \ ( \ @@ -182,7 +182,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + PASTECH(ch,axpyv_ker_ft) axpyv_p = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ axpyv_p \ ( \ diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c index 31fece0a00..30076ddaf9 100644 --- a/ref_kernels/1/bli_axpyv_ref.c +++ b/ref_kernels/1/bli_axpyv_ref.c @@ -58,7 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ @@ -148,7 +148,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c index 1dcb038397..ba05959908 100644 --- a/ref_kernels/1/bli_scal2v_ref.c +++ b/ref_kernels/1/bli_scal2v_ref.c @@ -57,7 +57,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ @@ -75,7 +75,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c index 4945b637b0..3e6be74928 100644 --- a/ref_kernels/1/bli_scalv_ref.c +++ b/ref_kernels/1/bli_scalv_ref.c @@ -58,7 +58,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_SETV_KER, cntx ); \ + PASTECH(ch,setv_ker_ft) setv_p = bli_cntx_get_ukr_dt( dt, BLIS_SETV_KER, cntx ); \ \ setv_p \ ( \ diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c index 8101023d47..28286a5f8b 100644 --- a/ref_kernels/1/bli_xpbyv_ref.c +++ b/ref_kernels/1/bli_xpbyv_ref.c @@ -54,7 +54,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_COPYV_KER, cntx ); \ + PASTECH(ch,copyv_ker_ft) copyv_p = bli_cntx_get_ukr_dt( dt, BLIS_COPYV_KER, cntx ); \ \ copyv_p \ ( \ @@ -71,7 +71,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ /* Query the context for the kernel function pointer. 
*/ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_l1v_ker_dt( dt, BLIS_ADDV_KER, cntx ); \ + PASTECH(ch,addv_ker_ft) addv_p = bli_cntx_get_ukr_dt( dt, BLIS_ADDV_KER, cntx ); \ \ addv_p \ ( \ diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c index 9c08c96f1e..6439ff8b01 100644 --- a/ref_kernels/1f/bli_axpy2v_ref.c +++ b/ref_kernels/1f/bli_axpy2v_ref.c @@ -110,7 +110,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ kfp_av \ ( \ diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c index f001108e22..5799a03a68 100644 --- a/ref_kernels/1f/bli_axpyf_ref.c +++ b/ref_kernels/1f/bli_axpyf_ref.c @@ -97,7 +97,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ for ( dim_t i = 0; i < b_n; ++i ) \ { \ diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c index faeef5dead..42936c6506 100644 --- a/ref_kernels/1f/bli_dotaxpyv_ref.c +++ b/ref_kernels/1f/bli_dotaxpyv_ref.c @@ -132,10 +132,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotv_ker_ft) kfp_dv \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTV_KER, cntx ); \ PASTECH(ch,axpyv_ker_ft) kfp_av \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ kfp_dv \ ( \ diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c index c612179413..990133621c 100644 --- a/ref_kernels/1f/bli_dotxaxpyf_ref.c +++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c @@ -165,10 +165,10 @@ void 
PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxf_ker_ft) kfp_df \ = \ - bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTXF_KER, cntx ); \ PASTECH(ch,axpyf_ker_ft) kfp_af \ = \ - bli_cntx_get_l1f_ker_dt( dt, BLIS_AXPYF_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_AXPYF_KER, cntx ); \ \ kfp_df \ ( \ diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c index 33f5d1ba5c..86781fd58a 100644 --- a/ref_kernels/1f/bli_dotxf_ref.c +++ b/ref_kernels/1f/bli_dotxf_ref.c @@ -113,7 +113,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt = PASTEMAC(ch,type); \ PASTECH(ch,dotxv_ker_ft) kfp_dv \ = \ - bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ + bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ \ for ( dim_t i = 0; i < b_n; ++i ) \ { \ diff --git a/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c b/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c index cc5852b37c..e07090754f 100644 --- a/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c +++ b/ref_kernels/1f/other/bli_dotxaxpyf_ref_alt.c @@ -67,8 +67,8 @@ void PASTEMAC(ch,varname) \ \ /* Query the context for the kernel function pointer. */ \ const num_t dt = PASTEMAC(ch,type); \ - PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_l1v_ker_dt( dt, BLIS_DOTXV_KER, cntx ); \ - PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_l1v_ker_dt( dt, BLIS_AXPYV_KER, cntx ); \ + PASTECH(ch,dotxv_ft) kfp_dv = bli_cntx_get_ukr_dt( dt, BLIS_DOTXV_KER, cntx ); \ + PASTECH(ch,axpyv_ft) kfp_av = bli_cntx_get_ukr_dt( dt, BLIS_AXPYV_KER, cntx ); \ \ /* A is m x n. 
*/ \ /* y = beta * y + alpha * A^T w; */ \ diff --git a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c index dd4e1f153d..e40b1b4cca 100644 --- a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c +++ b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c @@ -72,9 +72,9 @@ printf( "bli_gemmtrsmbb_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ + trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \ \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index ba3c8bbd1f..70fd025f43 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -110,7 +110,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ - for ( dim_t i = 0; i < mr ++i ) \ + for ( dim_t i = 0; i < mr; ++i ) \ PRAGMA_SIMD \ for ( dim_t j = 0; j < nr; ++j ) \ PASTEMAC(ch,copys) \ diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 2b260c8810..03f343a336 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -62,9 +62,9 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* minus_one = PASTEMAC(ch,m1); \ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, trsmkerid, cntx ); \ + trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \ \ /* lower: b11 = alpha * b11 - a10 * b01; */ \ /* upper: b11 = alpha * b11 - a12 * b21; */ \ diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 33e74ecaa8..8cd4a9703c 100644 --- 
a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -259,7 +259,6 @@ void GENBARNAME(cntx_init) ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - blksz_t thresh[ BLIS_NUM_THRESH ]; func_t* funcs; mbool_t* mbools; dim_t i; @@ -286,11 +285,26 @@ void GENBARNAME(cntx_init) bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); + // -- Set level-3 small/unpacked thresholds -------------------------------- + + // NOTE: The default thresholds are set to zero so that the sup framework + // does not activate by default. Note that the semantic meaning of the + // thresholds is that the sup code path is executed if a dimension is + // strictly less than its corresponding threshold. So actually, the + // thresholds specify the minimum dimension size that will still dispatch + // the non-sup/large code path. This "strictly less than" behavior was + // chosen over "less than or equal to" so that threshold values of 0 would + // effectively disable sup (even for matrix dimensions of 0). + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_MT ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_NT ], 0, 0, 0, 0 ); + bli_blksz_init_easy( &blkszs[ BLIS_KT ], 0, 0, 0, 0 ); + // Initialize the context with the default blocksize objects and their // multiples. 
bli_cntx_set_blkszs ( - BLIS_NAT, 11, + cntx, BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, @@ -302,30 +316,32 @@ void GENBARNAME(cntx_init) BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF, - cntx + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + -1 ); // -- Set level-3 virtual micro-kernels ------------------------------------ - funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + funcs = bli_cntx_ukrs_buf( cntx ); // NOTE: We set the virtual micro-kernel slots to contain the addresses // of the native micro-kernels. In general, the ukernels in the virtual // ukernel slots are always called, and if the function called happens to // be a virtual micro-kernel, it will then know to find its native ukernel // (i.e., in the native ukernel slots). - gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMM_VIR_UKR ], gemm_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm_l_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm_u_ukr_name ); // -- Set level-3 native micro-kernels and preferences --------------------- - funcs = bli_cntx_l3_nat_ukrs_buf( cntx ); - mbools = bli_cntx_l3_nat_ukrs_prefs_buf( cntx ); + mbools = bli_cntx_ukr_prefs_buf( cntx ); gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); @@ -333,108 +349,47 @@ void 
GENBARNAME(cntx_init) gen_func_init( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); gen_func_init( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); - // s d c z - bli_mbool_init( &mbools[ BLIS_GEMM_UKR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR ], FALSE, FALSE, FALSE, FALSE ); - bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR ], FALSE, FALSE, FALSE, FALSE ); - - - // -- Set level-3 small/unpacked thresholds -------------------------------- - - // NOTE: The default thresholds are set to zero so that the sup framework - // does not activate by default. Note that the semantic meaning of the - // thresholds is that the sup code path is executed if a dimension is - // strictly less than its corresponding threshold. So actually, the - // thresholds specify the minimum dimension size that will still dispatch - // the non-sup/large code path. This "strictly less than" behavior was - // chosen over "less than or equal to" so that threshold values of 0 would - // effectively disable sup (even for matrix dimensions of 0). - // s d c z - bli_blksz_init_easy( &thresh[ BLIS_MT ], 0, 0, 0, 0 ); - bli_blksz_init_easy( &thresh[ BLIS_NT ], 0, 0, 0, 0 ); - bli_blksz_init_easy( &thresh[ BLIS_KT ], 0, 0, 0, 0 ); - - // Initialize the context with the default thresholds. - bli_cntx_set_l3_sup_thresh - ( - 3, - BLIS_MT, &thresh[ BLIS_MT ], - BLIS_NT, &thresh[ BLIS_NT ], - BLIS_KT, &thresh[ BLIS_KT ], - cntx - ); - - - // -- Set level-3 small/unpacked handlers ---------------------------------- - - vfuncs = bli_cntx_l3_sup_handlers_buf( cntx ); - - // Initialize all of the function pointers to NULL; - for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL; - - // The level-3 sup handlers are oapi-based, so we only set one slot per - // operation. - - // Set the gemm slot to the default gemm sup handler. 
- vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref; - vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref; + // s d c z + bli_mbool_init( &mbools[ BLIS_GEMM_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMTRSM_L_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_GEMMTRSM_U_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_TRSM_L_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); + bli_mbool_init( &mbools[ BLIS_TRSM_U_UKR_ROW_PREF ], FALSE, FALSE, FALSE, FALSE ); // -- Set level-3 small/unpacked micro-kernels and preferences ------------- - funcs = bli_cntx_l3_sup_kers_buf( cntx ); - mbools = bli_cntx_l3_sup_kers_prefs_buf( cntx ); - -#if 0 - // Adhere to the small/unpacked ukernel mappings: - // - rv -> rrr, rcr - // - rg -> rrc, rcc - // - cv -> ccr, ccc - // - cg -> crr, crc - gen_sup_func_init( &funcs[ BLIS_RRR ], - &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_RRC ], - &funcs[ BLIS_RCC ], gemmsup_rg_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_CCR ], - &funcs[ BLIS_CCC ], gemmsup_cv_ukr_name ); - gen_sup_func_init( &funcs[ BLIS_CRR ], - &funcs[ BLIS_CRC ], gemmsup_cg_ukr_name ); -#endif - gen_func_init( &funcs[ BLIS_RRR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RRC ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RCR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_RCC ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CRR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CRC ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CCR ], gemmsup_rv_ukr_name ); - gen_func_init( &funcs[ BLIS_CCC ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RRR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RRC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RCR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_RCC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ 
BLIS_GEMMSUP_CRR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CRC_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CCR_UKR ], gemmsup_rv_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_CCC_UKR ], gemmsup_rv_ukr_name ); // Register the general-stride/generic ukernel to the "catch-all" slot // associated with the BLIS_XXX enum value. This slot will be queried if // *any* operand is stored with general stride. - gen_func_init( &funcs[ BLIS_XXX ], gemmsup_gx_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMSUP_XXX_UKR ], gemmsup_gx_ukr_name ); // Set the l3 sup ukernel storage preferences. - // s d c z - bli_mbool_init( &mbools[ BLIS_RRR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RRC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RCR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_RCC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CRR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CRC ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CCR ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_CCC ], TRUE, TRUE, TRUE, TRUE ); + // s d c z + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RRR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RRC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RCR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_RCC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CRR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CRC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CCR_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_CCC_UKR_ROW_PREF ], TRUE, TRUE, TRUE, TRUE ); - bli_mbool_init( &mbools[ BLIS_XXX ], TRUE, TRUE, TRUE, TRUE ); + bli_mbool_init( &mbools[ BLIS_GEMMSUP_XXX_UKR_ROW_PREF ], TRUE, 
TRUE, TRUE, TRUE ); // -- Set level-1f kernels ------------------------------------------------- - funcs = bli_cntx_l1f_kers_buf( cntx ); - gen_func_init( &funcs[ BLIS_AXPY2V_KER ], axpy2v_ker_name ); gen_func_init( &funcs[ BLIS_DOTAXPYV_KER ], dotaxpyv_ker_name ); gen_func_init( &funcs[ BLIS_AXPYF_KER ], axpyf_ker_name ); @@ -444,8 +399,6 @@ void GENBARNAME(cntx_init) // -- Set level-1v kernels ------------------------------------------------- - funcs = bli_cntx_l1v_kers_buf( cntx ); - gen_func_init( &funcs[ BLIS_ADDV_KER ], addv_ker_name ); gen_func_init( &funcs[ BLIS_AMAXV_KER ], amaxv_ker_name ); gen_func_init( &funcs[ BLIS_AXPBYV_KER ], axpbyv_ker_name ); @@ -464,8 +417,6 @@ void GENBARNAME(cntx_init) // -- Set level-1m (packm/unpackm) kernels --------------------------------- - funcs = bli_cntx_packm_kers_buf( cntx ); - // Initialize all packm kernel func_t entries to NULL. for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) { @@ -483,8 +434,6 @@ void GENBARNAME(cntx_init) gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name ); gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name ); - funcs = bli_cntx_unpackm_kers_buf( cntx ); - // Initialize all packm kernel func_t entries to NULL. for ( i = BLIS_UNPACKM_0XK_KER; i <= BLIS_UNPACKM_31XK_KER; ++i ) { @@ -501,6 +450,21 @@ void GENBARNAME(cntx_init) gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name ); + // -- Set level-3 small/unpacked handlers ---------------------------------- + + vfuncs = bli_cntx_l3_sup_handlers_buf( cntx ); + + // Initialize all of the function pointers to NULL; + for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL; + + // The level-3 sup handlers are oapi-based, so we only set one slot per + // operation. + + // Set the gemm slot to the default gemm sup handler. 
+ vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref; + vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref; + + // -- Set miscellaneous fields --------------------------------------------- bli_cntx_set_method( BLIS_NAT, cntx ); @@ -525,23 +489,23 @@ void GENBAINAME(cntx_init) // -- Set induced method level-3 virtual micro-kernels --------------------- - funcs = bli_cntx_l3_vir_ukrs_buf( cntx ); + funcs = bli_cntx_ukrs_buf( cntx ); if ( method == BLIS_1M ) { - gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm1m_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm1m_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm1m_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ], gemm1m_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm1m_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm1m_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm1m_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm1m_u_ukr_name ); } else // if ( method == BLIS_NAT ) { - gen_func_init_co( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_UKR ], trsm_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_UKR ], trsm_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ], gemm_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm_l_ukr_name ); + gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm_u_ukr_name ); } // For 1m, we employ an optimization which requires that we copy the native @@ -556,8 +520,8 
@@ void GENBAINAME(cntx_init) // beta has a zero imaginary component and C is either row- or column-stored). if ( method == BLIS_1M ) { - func_t* gemm_nat_ukrs = bli_cntx_get_l3_nat_ukrs( BLIS_GEMM_UKR, cntx ); - func_t* gemm_vir_ukrs = bli_cntx_get_l3_vir_ukrs( BLIS_GEMM_UKR, cntx ); + func_t* gemm_nat_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx ); + func_t* gemm_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx ); bli_func_copy_dt( BLIS_FLOAT, gemm_nat_ukrs, BLIS_FLOAT, gemm_vir_ukrs ); bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs ); @@ -566,8 +530,6 @@ void GENBAINAME(cntx_init) // -- Set induced method packm kernels ------------------------------------- - funcs = bli_cntx_packm_kers_buf( cntx ); - // Initialize all packm kernel func_t entries to NULL. for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) { @@ -640,38 +602,40 @@ void GENBAINAME(cntx_init_blkszs) // kernel. bli_cntx_set_method( method, cntx ); + num_t dt_r = bli_dt_proj_to_real( dt ); + // Initialize the blocksizes according to the micro-kernel preference as // well as the algorithm. - if ( bli_cntx_l3_vir_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) + if ( ! bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) { // This branch is used for algorithm 1m_c_bp. bli_cntx_set_ind_blkszs ( - method, dt, 6, + method, dt, cntx, BLIS_NC, 1.0, 1.0, BLIS_KC, 2.0, 2.0, // halve kc... BLIS_MC, 2.0, 2.0, // halve mc... BLIS_NR, 1.0, 1.0, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) BLIS_KR, 1.0, 1.0, - cntx + -1 ); } - else // if ( bli_cntx_l3_vir_ukr_prefers_rows_dt( dt, BLIS_GEMM_UKR, cntx ) ) + else // if ( bli_cntx_get_ukr_prefs_dt( dt, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) { // This branch is used for algorithm 1m_r_bp. bli_cntx_set_ind_blkszs ( - method, dt, 6, + method, dt, cntx, BLIS_NC, 2.0, 2.0, // halve nc... BLIS_KC, 2.0, 2.0, // halve kc... 
BLIS_MC, 1.0, 1.0, BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) BLIS_MR, 1.0, 1.0, BLIS_KR, 1.0, 1.0, - cntx + -1 ); } } diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c index fbd15d695b..2f08083892 100644 --- a/ref_kernels/ind/bli_gemm1m_ref.c +++ b/ref_kernels/ind/bli_gemm1m_ref.c @@ -55,8 +55,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 96f5a16fed..28acf80fb3 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -54,12 +54,12 @@ void PASTEMAC3(ch,opname,arch,suf) \ const num_t dt_r = PASTEMAC(chr,type); \ \ PASTECH(chr,gemm_ukr_ft) \ - rgemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ \ PASTECH(ch,trsm_ukr_ft) \ ctrsm_vir_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, trsmkerid, cntx ); \ \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ diff --git a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c index 957cd57944..8caccf9232 100644 --- a/sandbox/gemmlike/attic/bls_gemm_bp_var2.c +++ b/sandbox/gemmlike/attic/bls_gemm_bp_var2.c @@ -157,7 
+157,7 @@ void PASTECH2(bls_,ch,varname) \ function pointer type. */ \ /* PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ */ \ \ /* Temporary C buffer for edge cases. Note that the strides of this @@ -168,7 +168,7 @@ void PASTECH2(bls_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? MR : 1 ); \ */ \ @@ -524,7 +524,7 @@ void PASTECH2(bls_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the @@ -533,7 +533,7 @@ void PASTECH2(bls_,ch,varname) \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_l3_nat_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : NR ); \ const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ \ diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index f2f8b7e257..ec5d8d5b1f 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -134,7 +134,7 @@ void bls_gemm_ex // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_l3_vir_ukr_dislikes_storage_of( &c_local, BLIS_GEMM_UKR, cntx ) ) + if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) { bli_obj_swap( &a_local, &b_local ); diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 62dc462d51..1e3e5ea03f 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -156,7 +156,7 @@ void PASTECH2(bls_,ch,varname) \ /* Query the context for the microkernel address and cast it to its function pointer type. */ \ PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_nat_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ + gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ \ /* Compute partitioning step values for each matrix of each loop. */ \ const inc_t jcstep_c = cs_c; \ diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c index ca11c207c0..ceaadbe8ba 100644 --- a/sandbox/gemmlike/bls_packm_cxk.c +++ b/sandbox/gemmlike/bls_packm_cxk.c @@ -54,15 +54,15 @@ void PASTECH2(bls_,ch,opname) \ /* Note that we use panel_dim_max, not panel_dim, to query the packm kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ - num_t dt = PASTEMAC(ch,type); \ - l1mkr_t ker_id = panel_dim_max; \ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = ( ukr_t )( BLIS_PACKM_0XK_KER + panel_dim_max ); \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ /* Query the context for the packm kernel corresponding to the current panel dimension, or kernel id. 
If the id is invalid, the function will return NULL. */ \ - f = bli_cntx_get_packm_ker_dt( dt, ker_id, cntx ); \ + f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* If there exists a kernel implementation for the micro-panel dimension provided, we invoke the implementation. Otherwise, we use scal2m. */ \ From ccf99db5e849a985bc3ad8a03f9087906faffc90 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sun, 6 Feb 2022 14:11:21 -0600 Subject: [PATCH 04/32] Missed `bli_kernel_defs_zen3.h` somehow. --- config/zen3/bli_kernel_defs_zen3.h | 52 ++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 config/zen3/bli_kernel_defs_zen3.h diff --git a/config/zen3/bli_kernel_defs_zen3.h b/config/zen3/bli_kernel_defs_zen3.h new file mode 100644 index 0000000000..c5bc8d63f3 --- /dev/null +++ b/config/zen3/bli_kernel_defs_zen3.h @@ -0,0 +1,52 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2022, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +//#ifndef BLIS_KERNEL_DEFS_H +//#define BLIS_KERNEL_DEFS_H + + +// -- REGISTER BLOCK SIZES (FOR REFERENCE KERNELS) ---------------------------- + +#define BLIS_MR_s 6 +#define BLIS_MR_d 6 +#define BLIS_MR_c 3 +#define BLIS_MR_z 3 + +#define BLIS_NR_s 16 +#define BLIS_NR_d 8 +#define BLIS_NR_c 8 +#define BLIS_NR_z 4 + +//#endif + From eebe527b5af5d2a71e1da0896d55e64029e98bcc Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 7 Feb 2022 20:53:53 +0000 Subject: [PATCH 05/32] Fix two ARM issues. 1. The generic gemm kernel breaks on armsve because there is no compile-time MR/NR. The refernce gemm kernels has been modified to detect this and fallback to a "dumb" version. 2. For some reason, adding an optimization for writing back full microtiles in row-major storage to the reference gemm kernel results in a segfault for armv7a/gcc-9.3. I can't tell if I'm doing something wrong of if there is a compiler bug. This optimization has been removed for the time being. --- ref_kernels/3/bli_gemm_ref.c | 257 ++++++++++++++++++----------------- 1 file changed, 135 insertions(+), 122 deletions(-) diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index 70fd025f43..0462af8539 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -34,10 +34,109 @@ #include "blis.h" -#if 1 +// Completely generic gemm ukr implementation which checks MR/NR at +// runtime. Very slow, but has to be used in certain cases. 
+ +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +\ +static void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, \ + ctype* restrict b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict data, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ +\ + const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ + const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ +\ + const inc_t cs_a = packmr; \ +\ + const inc_t rs_b = packnr; \ +\ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = 1; \ + const inc_t cs_ab = m; \ +\ + dim_t l, j, i; \ +\ + ctype ai; \ + ctype bj; \ +\ +\ + /* Initialize the accumulator elements in ab to zero. */ \ + for ( i = 0; i < m * n; ++i ) \ + { \ + PASTEMAC(ch,set0s)( *(ab + i) ); \ + } \ +\ + /* Perform a series of k rank-1 updates into ab. */ \ + for ( l = 0; l < k; ++l ) \ + { \ + ctype* restrict abij = ab; \ +\ + /* In an optimized implementation, these two loops over MR and NR + are typically fully unrolled. */ \ + for ( j = 0; j < n; ++j ) \ + { \ + bj = *(b + j); \ +\ + for ( i = 0; i < m; ++i ) \ + { \ + ai = *(a + i); \ +\ + PASTEMAC(ch,dots)( ai, bj, *abij ); \ +\ + abij += rs_ab; \ + } \ + } \ +\ + a += cs_a; \ + b += rs_b; \ + } \ +\ + /* Scale the result in ab by alpha. */ \ + for ( i = 0; i < m * n; ++i ) \ + { \ + PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ + } \ +\ + /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, + scale by beta and then add the scaled redult in ab. 
*/ \ + if ( PASTEMAC(ch,eq0)( *beta ) ) \ + { \ + PASTEMAC(ch,copys_mxn)( m, \ + n, \ + ab, rs_ab, cs_ab, \ + c, rs_c, cs_c ); \ + } \ + else \ + { \ + PASTEMAC(ch,xpbys_mxn)( m, \ + n, \ + ab, rs_ab, cs_ab, \ + beta, \ + c, rs_c, cs_c ); \ + } \ +} + +INSERT_GENTFUNC_BASIC2( gemm_unr, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) // An implementation that attempts to facilitate emission of vectorized // instructions via constant loop bounds + #pragma omp simd directives. +// If compile-time MR/NR are not available (indicated by BLIS_[MN]R_x = -1), +// then the non-unrolled version (above) is used. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf ) \ @@ -56,15 +155,36 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - const dim_t mr = PASTECH(BLIS_MR_,ch); \ - const dim_t nr = PASTECH(BLIS_NR_,ch); \ \ - ctype ab[ mr * nr ] __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = nr; \ - const inc_t cs_ab = 1; \ + const dim_t mr = PASTECH(BLIS_MR_,ch); \ + const dim_t nr = PASTECH(BLIS_NR_,ch); \ +\ + if ( mr == -1 || nr == -1 ) \ + { \ + PASTEMAC3(ch,gemm_unr,arch,suf) \ + ( \ + m, \ + n, \ + k, \ + alpha, \ + a, \ + b, \ + beta, \ + c, rs_c, cs_c, \ + data, \ + cntx \ + ); \ + return; \ + } \ +\ + ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ + / sizeof( ctype ) ] \ + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ + const inc_t rs_ab = nr; \ + const inc_t cs_ab = 1; \ \ - const inc_t cs_a = PASTECH(BLIS_PACKMR_,ch); \ - const inc_t rs_b = PASTECH(BLIS_PACKNR_,ch); \ + const inc_t cs_a = PASTECH(BLIS_PACKMR_,ch); \ + const inc_t rs_b = PASTECH(BLIS_PACKNR_,ch); \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ @@ -104,15 +224,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* Output/accumulate intermediate result ab based on the storage of c and the value of beta. */ \ - if ( cs_c == 1 && m == mr && n == nr ) \ + if ( cs_c == 1 ) \ { \ - /* C is row-stored and a full tile. */ \ + /* C is row-stored. 
*/ \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ - for ( dim_t i = 0; i < mr; ++i ) \ - PRAGMA_SIMD \ - for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < m; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ PASTEMAC(ch,copys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ @@ -121,9 +240,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else \ { \ - for ( dim_t i = 0; i < mr; ++i ) \ - PRAGMA_SIMD \ - for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < m; ++i ) \ + for ( dim_t j = 0; j < n; ++j ) \ PASTEMAC(ch,xpbys) \ ( \ ab[ i*rs_ab + j*cs_ab ], \ @@ -134,7 +252,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else \ { \ - /* C is column-stored, general-stored, or an edge case. */ \ + /* C is column-stored or general-stored. */ \ \ if ( PASTEMAC(ch,eq0)( *beta ) ) \ { \ @@ -162,109 +280,4 @@ void PASTEMAC3(ch,opname,arch,suf) \ INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) -#else - -// An implementation that uses variable loop bounds (queried from the context) -// and makes no use of #pragma omp simd. 
- -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - dim_t l, j, i; \ -\ - ctype ai; \ - ctype bj; \ -\ -\ - /* Initialize the accumulator elements in ab to zero. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,set0s)( *(ab + i) ); \ - } \ -\ - /* Perform a series of k rank-1 updates into ab. */ \ - for ( l = 0; l < k; ++l ) \ - { \ - ctype* restrict abij = ab; \ -\ - /* In an optimized implementation, these two loops over MR and NR - are typically fully unrolled. */ \ - for ( j = 0; j < n; ++j ) \ - { \ - bj = *(b + j); \ -\ - for ( i = 0; i < m; ++i ) \ - { \ - ai = *(a + i); \ -\ - PASTEMAC(ch,dots)( ai, bj, *abij ); \ -\ - abij += rs_ab; \ - } \ - } \ -\ - a += cs_a; \ - b += rs_b; \ - } \ -\ - /* Scale the result in ab by alpha. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ - } \ -\ - /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, - scale by beta and then add the scaled redult in ab. 
*/ \ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,copys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - c, rs_c, cs_c ); \ - } \ - else \ - { \ - PASTEMAC(ch,xpbys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - beta, \ - c, rs_c, cs_c ); \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemm, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - -#endif From 84af20dbd16c3ea1012540613bd3df669e475cdc Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 9 Feb 2022 16:55:14 -0600 Subject: [PATCH 06/32] Re-write reference packing kernels to make use of MR/NR values when available as macros. The array of reference packing kernels (0--31) are replaced by exactly two kernels for each config/datatype combination, one to pack MRxK micropanels and one to pack NRxK micropanels. *IMPORTANT*: the "bb" reference kernels have been merged into the "standard" kernels (packm [incl. 1er and unpackm], gemm, trsm, gemmtrsm). This replication factor is controlled by BLIS_BB[MN]_[sdcz] etc. Power9/10 need testing since only a replication factor of 1 has been tested. armsve also needs testing since the MR value isn't available as a macro. 
--- addon/gemmd/bao_packm_cxk.c | 2 +- config/a64fx/bli_cntx_init_a64fx.c | 6 +- config/armsve/bli_cntx_init_armsve.c | 6 +- config/armsve/bli_kernel_defs_armsve.h | 8 +- config/firestorm/bli_cntx_init_firestorm.c | 8 +- config/haswell/bli_cntx_init_haswell.c | 16 +- config/knl/bli_cntx_init_knl.c | 4 +- config/power10/bli_cntx_init_power10.c | 55 +- config/power10/bli_kernel_defs_power10.h | 3 + config/power9/bli_cntx_init_power9.c | 54 +- config/power9/bli_kernel_defs_power9.h | 3 + config/zen/bli_cntx_init_zen.c | 16 +- config/zen2/bli_cntx_init_zen2.c | 16 +- config/zen3/bli_cntx_init_zen3.c | 20 +- frame/1m/bli_l1m_ker.h | 32 +- frame/1m/packm/bli_packm_cxk.c | 2 +- frame/1m/packm/bli_packm_cxk_1er.c | 2 +- frame/1m/unpackm/bli_unpackm_blk_var1.c | 4 + frame/1m/unpackm/bli_unpackm_cxk.c | 3 +- frame/1m/unpackm/bli_unpackm_cxk.h | 1 + frame/3/gemm/bli_gemm_md.c | 6 +- frame/include/bli_gentfunc_macro_defs.h | 7 + frame/include/bli_kernel_macro_defs.h | 48 +- frame/include/bli_type_defs.h | 74 +- ref_kernels/1m/bli_packm_cxk_1er_ref.c | 2187 ++------------------ ref_kernels/1m/bli_packm_cxk_bb_ref.c | 656 ------ ref_kernels/1m/bli_packm_cxk_ref.c | 1628 +-------------- ref_kernels/1m/bli_unpackm_cxk_ref.c | 839 +------- ref_kernels/3/bb/bli_gemmbb_ref.c | 141 -- ref_kernels/3/bb/bli_gemmtrsmbb_ref.c | 140 -- ref_kernels/3/bb/bli_trsmbb_ref.c | 214 -- ref_kernels/3/bli_gemm_ref.c | 16 +- ref_kernels/3/bli_gemmtrsm_ref.c | 34 +- ref_kernels/3/bli_trsm_ref.c | 24 +- ref_kernels/bli_cntx_ref.c | 194 +- sandbox/gemmlike/bls_packm_cxk.c | 2 +- 36 files changed, 473 insertions(+), 5998 deletions(-) delete mode 100644 ref_kernels/1m/bli_packm_cxk_bb_ref.c delete mode 100644 ref_kernels/3/bb/bli_gemmbb_ref.c delete mode 100644 ref_kernels/3/bb/bli_gemmtrsmbb_ref.c delete mode 100644 ref_kernels/3/bb/bli_trsmbb_ref.c diff --git a/addon/gemmd/bao_packm_cxk.c b/addon/gemmd/bao_packm_cxk.c index 455ce3fe0d..8680c53321 100644 --- a/addon/gemmd/bao_packm_cxk.c +++ 
b/addon/gemmd/bao_packm_cxk.c @@ -56,7 +56,7 @@ void PASTECH2(bao_,ch,opname) \ kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = ( ukr_t )( BLIS_PACKM_0XK_KER + panel_dim_max ); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index f002477b0e..d6b95030b3 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -56,10 +56,8 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, // packm - BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - // 12xk is not used and disabled for GCC 8-9 compatibility. - // BLIS_PACKM_12XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_int_12xk, - BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, -1 ); diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index 7ee24351c4..02ab7a35f5 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -87,15 +87,15 @@ void bli_cntx_init_armsve( cntx_t* cntx ) bli_cntx_set_ukrs ( cntx, - BLIS_PACKM_10XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - BLIS_PACKM_16XK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, -1 ); else if (m_r_d==8) bli_cntx_set_ukrs ( cntx, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, -1 ); diff --git a/config/armsve/bli_kernel_defs_armsve.h 
b/config/armsve/bli_kernel_defs_armsve.h index 8496cb0b77..8c9c0b0dd6 100644 --- a/config/armsve/bli_kernel_defs_armsve.h +++ b/config/armsve/bli_kernel_defs_armsve.h @@ -49,10 +49,10 @@ #define BLIS_MR_c -1 #define BLIS_MR_z -1 -#define BLIS_NR_s -1 -#define BLIS_NR_d -1 -#define BLIS_NR_c -1 -#define BLIS_NR_z -1 +#define BLIS_NR_s 10 +#define BLIS_NR_d 10 +#define BLIS_NR_c 10 +#define BLIS_NR_z 10 //#endif diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index 946aabd433..913540955c 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -53,10 +53,10 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, // packm - BLIS_PACKM_8XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, - BLIS_PACKM_12XK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, // gemmsup BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index 34e3909ff9..4e6ca280c4 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -71,14 +71,14 @@ void bli_cntx_init_haswell( cntx_t* cntx ) #if 1 // packm - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, 
BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, #endif // axpyf diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index fcc25f023e..3fa11b4691 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -53,8 +53,8 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, // packm - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, - BLIS_PACKM_24XK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index dda9710ee0..73b9dabc73 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -34,35 +34,6 @@ #include "blis.h" -// Instantiate prototypes for packm kernels. -PACKM_KER_PROT( float, s, packm_6xk_bb4_power10_ref ) -PACKM_KER_PROT( double, d, packm_6xk_bb2_power10_ref ) - -// Instantiate prototypes for level-3 kernels. 
-GEMM_UKR_PROT( float, s, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( float, s, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( float, s, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( double, d, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( double, d, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( double, d, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( scomplex, c, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_u_power10_ref ) - -GEMM_UKR_PROT( dcomplex, z, gemmbb_power10_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power10_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power10_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power10_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power10_ref ) - void bli_cntx_init_power10( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -78,30 +49,8 @@ void bli_cntx_init_power10( cntx_t* cntx ) cntx, // level-3 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power10_ref, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power10_ref, - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power10_ref, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power10_ref, - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power10_ref, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power10_ref, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power10_ref, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power10_ref, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power10_ref, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, 
bli_sgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power10_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power10_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power10_ref, - - // packm - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power10_ref, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power10_ref, + BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, -1 ); diff --git a/config/power10/bli_kernel_defs_power10.h b/config/power10/bli_kernel_defs_power10.h index 39a2cf3d58..4e32f1173b 100644 --- a/config/power10/bli_kernel_defs_power10.h +++ b/config/power10/bli_kernel_defs_power10.h @@ -44,5 +44,8 @@ #define BLIS_NR_s 16 #define BLIS_NR_d 8 +#define BLIS_BBN_s 4 +#define BLIS_BBN_d 2 + //#endif diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index 75f9b1ffca..ef1f947737 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -34,35 +34,6 @@ #include "blis.h" -// Instantiate prototypes for packm kernels. -PACKM_KER_PROT( float, s, packm_6xk_bb4_power9_ref ) -PACKM_KER_PROT( double, d, packm_6xk_bb2_power9_ref ) - -// Instantiate prototypes for level-3 kernels. 
-GEMM_UKR_PROT( float, s, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( float, s, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( float, s, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( float, s, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( double, d, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( double, d, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( double, d, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( double, d, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( scomplex, c, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( scomplex, c, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( scomplex, c, trsmbb_u_power9_ref ) - -GEMM_UKR_PROT( dcomplex, z, gemmbb_power9_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_l_power9_ref ) -GEMMTRSM_UKR_PROT( dcomplex, z, gemmtrsmbb_u_power9_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_l_power9_ref ) -TRSM_UKR_PROT( dcomplex, z, trsmbb_u_power9_ref ) - void bli_cntx_init_power9( cntx_t* cntx ) { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; @@ -78,30 +49,7 @@ void bli_cntx_init_power9( cntx_t* cntx ) cntx, // level-3 - BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemmbb_power9_ref, - BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, - BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemmbb_power9_ref, - BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemmbb_power9_ref, - BLIS_TRSM_L_UKR, BLIS_FLOAT, bli_strsmbb_l_power9_ref, - BLIS_TRSM_U_UKR, BLIS_FLOAT, bli_strsmbb_u_power9_ref, - BLIS_TRSM_L_UKR, BLIS_DOUBLE, bli_dtrsmbb_l_power9_ref, - BLIS_TRSM_U_UKR, BLIS_DOUBLE, bli_dtrsmbb_u_power9_ref, - BLIS_TRSM_L_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_l_power9_ref, - BLIS_TRSM_U_UKR, BLIS_SCOMPLEX, bli_ctrsmbb_u_power9_ref, - BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_l_power9_ref, - BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_FLOAT, bli_sgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, 
BLIS_FLOAT, bli_sgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_SCOMPLEX, bli_cgemmtrsmbb_u_power9_ref, - BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_l_power9_ref, - BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsmbb_u_power9_ref, - - // packm - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_6xk_bb4_power9_ref, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_bb2_power9_ref, + BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, -1 ); diff --git a/config/power9/bli_kernel_defs_power9.h b/config/power9/bli_kernel_defs_power9.h index f367fda1dd..debfeac5fc 100644 --- a/config/power9/bli_kernel_defs_power9.h +++ b/config/power9/bli_kernel_defs_power9.h @@ -42,5 +42,8 @@ #define BLIS_NR_d 6 +#define BLIS_BBN_s 4 +#define BLIS_BBN_d 2 + //#endif diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index dde28da64e..074d952252 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -112,14 +112,14 @@ void bli_cntx_init_zen( cntx_t* cntx ) #endif // packm - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + 
BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index ea0fc42b3b..0e64005ec7 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -110,14 +110,14 @@ void bli_cntx_init_zen2( cntx_t* cntx ) #endif // packm - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, // axpyf BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index e8934d2a46..6771c4e069 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -117,17 +117,17 @@ void bli_cntx_init_zen3( cntx_t* cntx ) // packm #if 0 // AMD: This will be enabled in other PRs. 
- BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_8xk_gen_zen, #else - BLIS_PACKM_6XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, - BLIS_PACKM_16XK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, - BLIS_PACKM_6XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, - BLIS_PACKM_8XK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, - BLIS_PACKM_8XK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, - BLIS_PACKM_3XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, - BLIS_PACKM_4XK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, + BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, + BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, + BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_SCOMPLEX, bli_cpackm_haswell_asm_8xk, + BLIS_PACKM_MRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_3xk, + BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, #endif // axpyf diff --git a/frame/1m/bli_l1m_ker.h b/frame/1m/bli_l1m_ker.h index 76d51af2b0..535217b55f 100644 --- a/frame/1m/bli_l1m_ker.h +++ b/frame/1m/bli_l1m_ker.h @@ -47,16 +47,8 @@ #undef GENTPROT #define GENTPROT PACKM_KER_PROT -INSERT_GENTPROT_BASIC0( packm_2xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_3xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_4xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_6xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_8xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_10xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_12xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_14xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_16xk_ker_name ) -INSERT_GENTPROT_BASIC0( packm_24xk_ker_name ) 
+INSERT_GENTPROT_BASIC0( packm_mrxk_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxk_ker_name ) // native unpackm kernels @@ -64,14 +56,8 @@ INSERT_GENTPROT_BASIC0( packm_24xk_ker_name ) #undef GENTPROT #define GENTPROT UNPACKM_KER_PROT -INSERT_GENTPROT_BASIC0( unpackm_2xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_4xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_6xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_8xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_10xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_12xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_14xk_ker_name ) -INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name ) +INSERT_GENTPROT_BASIC0( unpackm_mrxk_ker_name ) +INSERT_GENTPROT_BASIC0( unpackm_nrxk_ker_name ) // 1e/1r packm kernels @@ -79,12 +65,6 @@ INSERT_GENTPROT_BASIC0( unpackm_16xk_ker_name ) #undef GENTPROT #define GENTPROT PACKM_1ER_KER_PROT -INSERT_GENTPROT_BASIC0( packm_2xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_4xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_6xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_8xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_10xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_12xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_14xk_1er_ker_name ) -INSERT_GENTPROT_BASIC0( packm_16xk_1er_ker_name ) +INSERT_GENTPROT_BASIC0( packm_mrxk_1er_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxk_1er_ker_name ) diff --git a/frame/1m/packm/bli_packm_cxk.c b/frame/1m/packm/bli_packm_cxk.c index 55d4862cbe..8396552c0f 100644 --- a/frame/1m/packm/bli_packm_cxk.c +++ b/frame/1m/packm/bli_packm_cxk.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,opname) \ kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = ( ukr_t )( BLIS_PACKM_0XK_KER + panel_dim_max ); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? 
BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/packm/bli_packm_cxk_1er.c index 835e476c60..82dfb7b2e1 100644 --- a/frame/1m/packm/bli_packm_cxk_1er.c +++ b/frame/1m/packm/bli_packm_cxk_1er.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,opname) \ kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = ( ukr_t )( BLIS_PACKM_0XK_KER + panel_dim_max ); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index b2c8620452..e44fd15e4e 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -164,6 +164,7 @@ void PASTEMAC(ch,varname) \ inc_t ldp; \ dim_t* m_panel_full; \ dim_t* n_panel_full; \ + pack_t schema; \ \ \ /* If c needs a transposition, induce it so that we can more simply @@ -182,6 +183,7 @@ void PASTEMAC(ch,varname) \ if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to unpack from column panels. */ \ + schema = BLIS_PACKED_COL_PANELS; \ iter_dim = n; \ panel_len = m; \ panel_dim_max = pd_p; \ @@ -196,6 +198,7 @@ void PASTEMAC(ch,varname) \ else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to unpack from row panels. 
*/ \ + schema = BLIS_PACKED_ROW_PANELS; \ iter_dim = m; \ panel_len = n; \ panel_dim_max = pd_p; \ @@ -256,6 +259,7 @@ void PASTEMAC(ch,varname) \ PASTEMAC(ch,unpackm_cxk) \ ( \ BLIS_NO_CONJUGATE, \ + schema, \ panel_dim_i, \ panel_len, \ one, \ diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/unpackm/bli_unpackm_cxk.c index bc002c453d..2410c8629e 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.c +++ b/frame/1m/unpackm/bli_unpackm_cxk.c @@ -40,6 +40,7 @@ void PASTEMAC(ch,opname) \ ( \ conj_t conjp, \ + pack_t schema, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ @@ -49,7 +50,7 @@ void PASTEMAC(ch,opname) \ ) \ { \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = ( ukr_t )( BLIS_UNPACKM_0XK_KER + panel_dim ); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER : BLIS_UNPACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ diff --git a/frame/1m/unpackm/bli_unpackm_cxk.h b/frame/1m/unpackm/bli_unpackm_cxk.h index 53c3c0c44a..d828a9b8ef 100644 --- a/frame/1m/unpackm/bli_unpackm_cxk.h +++ b/frame/1m/unpackm/bli_unpackm_cxk.h @@ -39,6 +39,7 @@ void PASTEMAC(ch,varname) \ ( \ conj_t conjp, \ + pack_t schema, \ dim_t panel_dim, \ dim_t panel_len, \ ctype* kappa, \ diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index a16156c157..6202cfffdd 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -433,10 +433,8 @@ mddm_t bli_gemm_md_rcc func_t* cntx_funcs = bli_cntx_ukrs_buf( *cntx ); func_t* cntx_1m_funcs = bli_cntx_ukrs_buf( cntx_1m ); - for ( dim_t i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) - { - cntx_funcs[ i ] = cntx_1m_funcs[ i ]; - } + cntx_funcs[ BLIS_PACKM_MRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_MRXK_KER ]; + cntx_funcs[ BLIS_PACKM_NRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_NRXK_KER ]; // Return the computation and execution domains. 
return doms; diff --git a/frame/include/bli_gentfunc_macro_defs.h b/frame/include/bli_gentfunc_macro_defs.h index 011ebcdfbb..e863f7dcff 100644 --- a/frame/include/bli_gentfunc_macro_defs.h +++ b/frame/include/bli_gentfunc_macro_defs.h @@ -289,6 +289,13 @@ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2 ) GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3 ) \ GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3 ) +// -- (four auxiliary arguments) -- + +#define INSERT_GENTFUNCCO_BASIC4( tfuncname, varname1, varname2, varname3, varname4 ) \ +\ +GENTFUNCCO( scomplex, float, c, s, tfuncname, varname1, varname2, varname3, varname4 ) \ +GENTFUNCCO( dcomplex, double, z, d, tfuncname, varname1, varname2, varname3, varname4 ) + // -- Basic one-operand macro with integer instance -- diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 2769bf6bc7..20bd73afc9 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -281,36 +281,68 @@ #define BLIS_NR_z 4 #endif +#ifndef BLIS_BBM_s +#define BLIS_BBM_s 1 +#endif + +#ifndef BLIS_BBM_d +#define BLIS_BBM_d 1 +#endif + +#ifndef BLIS_BBM_c +#define BLIS_BBM_c 1 +#endif + +#ifndef BLIS_BBM_z +#define BLIS_BBM_z 1 +#endif + +#ifndef BLIS_BBN_s +#define BLIS_BBN_s 1 +#endif + +#ifndef BLIS_BBN_d +#define BLIS_BBN_d 1 +#endif + +#ifndef BLIS_BBN_c +#define BLIS_BBN_c 1 +#endif + +#ifndef BLIS_BBN_z +#define BLIS_BBN_z 1 +#endif + #ifndef BLIS_PACKMR_s -#define BLIS_PACKMR_s BLIS_MR_s +#define BLIS_PACKMR_s (BLIS_MR_s*BLIS_BBM_s) #endif #ifndef BLIS_PACKMR_d -#define BLIS_PACKMR_d BLIS_MR_d +#define BLIS_PACKMR_d (BLIS_MR_d*BLIS_BBM_d) #endif #ifndef BLIS_PACKMR_c -#define BLIS_PACKMR_c BLIS_MR_c +#define BLIS_PACKMR_c (BLIS_MR_c*BLIS_BBM_c) #endif #ifndef BLIS_PACKMR_z -#define BLIS_PACKMR_z BLIS_MR_z +#define BLIS_PACKMR_z (BLIS_MR_z*BLIS_BBM_z) #endif #ifndef BLIS_PACKNR_s -#define 
BLIS_PACKNR_s BLIS_NR_s +#define BLIS_PACKNR_s (BLIS_NR_s*BLIS_BBN_s) #endif #ifndef BLIS_PACKNR_d -#define BLIS_PACKNR_d BLIS_NR_d +#define BLIS_PACKNR_d (BLIS_NR_d*BLIS_BBN_d) #endif #ifndef BLIS_PACKNR_c -#define BLIS_PACKNR_c BLIS_NR_c +#define BLIS_PACKNR_c (BLIS_NR_c*BLIS_BBN_c) #endif #ifndef BLIS_PACKNR_z -#define BLIS_PACKNR_z BLIS_NR_z +#define BLIS_PACKNR_z (BLIS_NR_z*BLIS_BBN_z) #endif #endif diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index a45c59d47a..95707fc26a 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -650,72 +650,14 @@ typedef enum BLIS_DOTXAXPYF_KER, // pack kernels - BLIS_PACKM_0XK_KER, - BLIS_PACKM_1XK_KER, - BLIS_PACKM_2XK_KER, - BLIS_PACKM_3XK_KER, - BLIS_PACKM_4XK_KER, - BLIS_PACKM_5XK_KER, - BLIS_PACKM_6XK_KER, - BLIS_PACKM_7XK_KER, - BLIS_PACKM_8XK_KER, - BLIS_PACKM_9XK_KER, - BLIS_PACKM_10XK_KER, - BLIS_PACKM_11XK_KER, - BLIS_PACKM_12XK_KER, - BLIS_PACKM_13XK_KER, - BLIS_PACKM_14XK_KER, - BLIS_PACKM_15XK_KER, - BLIS_PACKM_16XK_KER, - BLIS_PACKM_17XK_KER, - BLIS_PACKM_18XK_KER, - BLIS_PACKM_19XK_KER, - BLIS_PACKM_20XK_KER, - BLIS_PACKM_21XK_KER, - BLIS_PACKM_22XK_KER, - BLIS_PACKM_23XK_KER, - BLIS_PACKM_24XK_KER, - BLIS_PACKM_25XK_KER, - BLIS_PACKM_26XK_KER, - BLIS_PACKM_27XK_KER, - BLIS_PACKM_28XK_KER, - BLIS_PACKM_29XK_KER, - BLIS_PACKM_30XK_KER, - BLIS_PACKM_31XK_KER, + BLIS_PACKM_MRXK_KER, + BLIS_PACKM_NRXK_KER, + BLIS_PACKM_MRXK_1ER_KER, + BLIS_PACKM_NRXK_1ER_KER, // unpack kernels - BLIS_UNPACKM_0XK_KER, - BLIS_UNPACKM_1XK_KER, - BLIS_UNPACKM_2XK_KER, - BLIS_UNPACKM_3XK_KER, - BLIS_UNPACKM_4XK_KER, - BLIS_UNPACKM_5XK_KER, - BLIS_UNPACKM_6XK_KER, - BLIS_UNPACKM_7XK_KER, - BLIS_UNPACKM_8XK_KER, - BLIS_UNPACKM_9XK_KER, - BLIS_UNPACKM_10XK_KER, - BLIS_UNPACKM_11XK_KER, - BLIS_UNPACKM_12XK_KER, - BLIS_UNPACKM_13XK_KER, - BLIS_UNPACKM_14XK_KER, - BLIS_UNPACKM_15XK_KER, - BLIS_UNPACKM_16XK_KER, - BLIS_UNPACKM_17XK_KER, - BLIS_UNPACKM_18XK_KER, - BLIS_UNPACKM_19XK_KER, - 
BLIS_UNPACKM_20XK_KER, - BLIS_UNPACKM_21XK_KER, - BLIS_UNPACKM_22XK_KER, - BLIS_UNPACKM_23XK_KER, - BLIS_UNPACKM_24XK_KER, - BLIS_UNPACKM_25XK_KER, - BLIS_UNPACKM_26XK_KER, - BLIS_UNPACKM_27XK_KER, - BLIS_UNPACKM_28XK_KER, - BLIS_UNPACKM_29XK_KER, - BLIS_UNPACKM_30XK_KER, - BLIS_UNPACKM_31XK_KER, + BLIS_UNPACKM_MRXK_KER, + BLIS_UNPACKM_NRXK_KER, // l3 nat kernels BLIS_GEMM_UKR, @@ -920,6 +862,10 @@ typedef enum BLIS_KC, BLIS_NC, + // broadcast factors for packing + BLIS_BBM, + BLIS_BBN, + // level-2 blocksizes BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index 03ec46d147..bb23e7e47b 100644 --- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ +#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -50,1926 +50,61 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), 
*(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - 
PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_2xk_1er, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - 
-#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), 
*(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; 
--k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - 
kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_4xk_1er, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - 
PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - 
PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( 
*(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( 
*kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_6xk_1er, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( 
PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, 
*(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict 
pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += 
lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), 
*(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_8xk_1er, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = 
( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri 
+= lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 
7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += 
lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - 
PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) 
); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_10xk_1er, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - 
PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), 
*(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - 
PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ 
- PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ 
- alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ + const dim_t mnr = PASTECH2(mnr0, _, ch); \ + const dim_t dfac = PASTECH2(bb0, _, ch); \ \ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), 
*(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ 
- p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_12xk_1er, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ + if ( cdim == mnr && mnr != -1 ) \ { \ if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ -\ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) 
); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2j1es)( 
*kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 
5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ -\ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ - } \ - } \ - } \ - } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), 
*(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), 
*(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - 
PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); 
\ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ -\ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNCCO_BASIC3( packm_14xk_1er, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNCCO -#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - if ( cdim == mnr ) \ - { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca1 = inca; \ - const inc_t lda1 = lda; \ - const inc_t ldp1 = ldp; \ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + 
ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p; \ + ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ \ - ctype* restrict kappa_cast = ( ctype* )kappa; \ - ctype* restrict alpha1_ri = ( ctype* )a; \ - ctype* restrict pi1_ri = ( ctype* )p; \ - ctype* restrict pi1_ir = ( ctype* )p + ldp1/2; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( inca == 1 ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,copyj1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,scal2jris)( kappa_r, kappa_i, 
*(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ + *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ + PASTEMAC(ch,scal2jris)( -kappa_i, kappa_r, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ + *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ + } \ \ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ + alpha1 += lda2; \ + pi1_ri += ldp2; \ + pi1_ir += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,copy1es)( *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,scal2ris)( kappa_r, kappa_i, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ + *(pi1_ri + (mn*2 + 
0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ + PASTEMAC(ch,scal2ris)( -kappa_i, kappa_r, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ + *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ + } \ \ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ + alpha1 += lda2; \ + pi1_ri += ldp2; \ + pi1_ir += ldp2; \ } \ } \ } \ @@ -1979,124 +114,84 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,scal2j1es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), 
*(pi1_ir +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,scal2jris)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ + PASTEMAC(ch,scal2jris)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ + } \ \ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ + alpha1 += lda2; \ + pi1_ri += ldp2; \ + pi1_ir += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 0*inca1), *(pi1_ri + 0), *(pi1_ir + 0) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 1*inca1), *(pi1_ri + 1), *(pi1_ir + 1) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 2*inca1), *(pi1_ri + 2), *(pi1_ir + 2) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 3*inca1), *(pi1_ri + 3), *(pi1_ir + 3) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 4*inca1), *(pi1_ri + 4), *(pi1_ir + 4) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 5*inca1), *(pi1_ri + 5), *(pi1_ir + 5) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 6*inca1), *(pi1_ri + 6), *(pi1_ir + 6) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 7*inca1), *(pi1_ri + 7), *(pi1_ir + 7) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 8*inca1), *(pi1_ri + 8), *(pi1_ir + 8) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri + 9*inca1), *(pi1_ri + 9), *(pi1_ir + 9) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +10*inca1), *(pi1_ri +10), *(pi1_ir +10) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +11*inca1), *(pi1_ri +11), *(pi1_ir +11) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +12*inca1), *(pi1_ri +12), *(pi1_ir +12) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +13*inca1), *(pi1_ri +13), *(pi1_ir +13) ); \ - 
PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +14*inca1), *(pi1_ri +14), *(pi1_ir +14) ); \ - PASTEMAC(ch,scal21es)( *kappa_cast, *(alpha1_ri +15*inca1), *(pi1_ri +15), *(pi1_ir +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,scal2ris)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ + PASTEMAC(ch,scal2ris)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ + } \ \ - alpha1_ri += lda1; \ - pi1_ri += ldp1; \ - pi1_ir += ldp1; \ + alpha1 += lda2; \ + pi1_ri += ldp2; \ + pi1_ir += ldp2; \ } \ } \ } \ } \ - else /* if ( bli_is_1r_packed( schema ) ) */ \ + else \ { \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ const inc_t ldp2 = 2 * ldp; \ \ - ctype* kappa_cast = kappa; \ - ctype_r* restrict kappa_r = ( ctype_r* )kappa; \ - ctype_r* restrict kappa_i = ( ctype_r* )kappa + 1; \ - ctype_r* restrict alpha1_r = ( ctype_r* )a; \ - ctype_r* restrict alpha1_i = ( ctype_r* )a + 1; \ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( inca == 1 ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 
4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,copyjris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2jris)( kappa_r, kappa_i, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ + *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ \ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ + alpha1 += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,copyris)( *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 3*inca2), 
*(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,copyris)( *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2ris)( kappa_r, kappa_i, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ + *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ \ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ + alpha1 += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ } \ } \ } \ @@ -2106,54 +201,30 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r 
+ 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), *(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,scal2jris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2jris)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + 
d) ); \ \ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ + alpha1 += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ } \ } \ else \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 0*inca2), *(alpha1_i + 0*inca2), *(pi1_r + 0), *(pi1_i + 0) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 1*inca2), *(alpha1_i + 1*inca2), *(pi1_r + 1), *(pi1_i + 1) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 2*inca2), *(alpha1_i + 2*inca2), *(pi1_r + 2), *(pi1_i + 2) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 3*inca2), *(alpha1_i + 3*inca2), *(pi1_r + 3), *(pi1_i + 3) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 4*inca2), *(alpha1_i + 4*inca2), *(pi1_r + 4), *(pi1_i + 4) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 5*inca2), *(alpha1_i + 5*inca2), *(pi1_r + 5), *(pi1_i + 5) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 6*inca2), *(alpha1_i + 6*inca2), *(pi1_r + 6), *(pi1_i + 6) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 7*inca2), *(alpha1_i + 7*inca2), *(pi1_r + 7), *(pi1_i + 7) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 8*inca2), *(alpha1_i + 8*inca2), *(pi1_r + 8), *(pi1_i + 8) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r + 9*inca2), *(alpha1_i + 9*inca2), *(pi1_r + 9), *(pi1_i + 9) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +10*inca2), *(alpha1_i +10*inca2), *(pi1_r +10), *(pi1_i +10) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +11*inca2), *(alpha1_i +11*inca2), *(pi1_r +11), *(pi1_i +11) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +12*inca2), *(alpha1_i +12*inca2), *(pi1_r +12), *(pi1_i +12) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +13*inca2), *(alpha1_i +13*inca2), *(pi1_r +13), *(pi1_i +13) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +14*inca2), 
*(alpha1_i +14*inca2), *(pi1_r +14), *(pi1_i +14) ); \ - PASTEMAC(ch,scal2ris)( *kappa_r, *kappa_i, *(alpha1_r +15*inca2), *(alpha1_i +15*inca2), *(pi1_r +15), *(pi1_i +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2ris)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ \ - alpha1_r += lda2; \ - alpha1_i += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ + alpha1 += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ } \ } \ } \ @@ -2172,47 +243,43 @@ void PASTEMAC3(ch,opname,arch,suf) \ p, 1, ldp, ldp \ ); \ \ - /* if ( cdim < mnr ) */ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = cdim; \ - const dim_t offn = 0; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ -\ - PASTEMAC(ch,set1ms_mxn) \ - ( \ - schema, \ - offm, \ - offn, \ - m_edge, \ - n_edge, \ - zero, \ - p, 1, ldp, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - const dim_t offm = 0; \ - const dim_t offn = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ -\ - PASTEMAC(ch,set1ms_mxn) \ + const dim_t i = cdim; \ + const dim_t erfac = bli_is_1e_packed( schema ) ? 
2 : 1; \ + /* use ldp instead of mnr, in case the latter is -1 \ + this may write extra zeros, but not too many \ + this also automatically accounts for dfac when \ + using set0s_mxn instead of set0bbs_mxn */ \ + const dim_t m_edge = ldp - cdim*dfac*erfac; \ + const dim_t n_edge = 2*n_max; \ + ctype_r* restrict p_cast = ( ctype_r* )p; \ + ctype_r* restrict p_edge = p_cast + (i )*dfac*erfac; \ +\ + PASTEMAC(chr,set0s_mxn) \ ( \ - schema, \ - offm, \ - offn, \ m_edge, \ n_edge, \ - zero, \ - p, 1, ldp, ldp \ + p_edge, 1, ldp \ ); \ } \ +\ + const dim_t i = n; \ + /* use ldp instead of mnr, in case the latter is -1 \ + this may write extra zeros, but not too many \ + this also automatically accounts for dfac when \ + using set0s_mxn instead of set0bbs_mxn */ \ + const dim_t m_edge = ldp; \ + const dim_t n_edge = 2*(n_max-i); \ + ctype_r* restrict p_cast = ( ctype_r* )p; \ + ctype_r* restrict p_edge = p_cast + (i )*ldp*2; \ +\ + PASTEMAC(chr,set0s_mxn) \ + ( \ + m_edge, \ + n_edge, \ + p_edge, 1, ldp \ + ); \ } -INSERT_GENTFUNCCO_BASIC3( packm_16xk_1er, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_mrxk_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_nrxk_1er, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_packm_cxk_bb_ref.c b/ref_kernels/1m/bli_packm_cxk_bb_ref.c deleted file mode 100644 index e7498a735d..0000000000 --- a/ref_kernels/1m/bli_packm_cxk_bb_ref.c +++ /dev/null @@ -1,656 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// -- 6xk, duplication factor 2 ------------------------------------------------ - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - const dim_t dfac = 2; \ -\ - /* Handle the packing of B (column panel schemas) separately from packing - of A (row panel schemas). 
*/ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, 
*(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2bbs_mxn) \ - ( \ - conja, \ 
- cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, dfac, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*dfac; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 
0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2s_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk_bb2, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - -// -- 6xk, duplication factor 4 ------------------------------------------------ - -#undef GENTFUNC -#define GENTFUNC( ctype, 
ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - const dim_t dfac = 4; \ -\ - /* Handle the packing of B (column panel schemas) separately from packing - of A (row panel schemas). */ \ - if ( bli_is_col_packed( schema ) ) \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 20) ); 
\ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) 
); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 1) ); \ - 
PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 15) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 16) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 17) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 18) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 19) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 20) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 21) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 22) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2bbs_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, dfac, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict 
p_edge = p_cast + (i )*dfac; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0bbs_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, dfac, ldp \ - ); \ - } \ - } \ - else /* if ( bli_is_row_packed( schema ) ) */ \ - { \ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else /* if ( !PASTEMAC(ch,eq1)( *kappa_cast ) ) */ \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - 
PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else /* if ( bli_is_noconj( conja ) ) */ \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal2s_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk_bb4, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c index c98f1b2503..acbd00e893 100644 --- a/ref_kernels/1m/bli_packm_cxk_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_ref.c @@ -35,7 +35,7 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ 
\ void PASTEMAC3(ch,opname,arch,suf) \ ( \ @@ -50,1322 +50,25 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 4; \ - dim_t n_left = n % 4; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \ -\ - alpha1 += 4*lda; \ - pi1 += 4*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) 
); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_2xk, 2, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 4; \ - dim_t n_left = n % 4; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; 
n_iter != 0; --n_iter ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 2*lda), *(pi1 + 0 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 2*lda), *(pi1 + 1 + 2*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 2*lda), *(pi1 + 2 + 2*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 3*lda), *(pi1 + 0 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 3*lda), *(pi1 + 1 + 3*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 3*lda), *(pi1 + 2 + 3*ldp) ); \ -\ - alpha1 += 4*lda; \ - pi1 += 4*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ -\ - 
alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_3xk, 3, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 2; \ - dim_t n_left = n % 2; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter 
) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 0*lda), *(pi1 + 3 + 0*ldp) ); \ -\ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \ -\ - alpha1 += 2*lda; \ - pi1 += 2*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC2(ch,ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - 
PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_4xk, 4, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 
0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - 
m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_6xk, 6, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - dim_t n_iter = n / 2; \ - dim_t n_left = n % 2; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( ; n_iter != 0; --n_iter ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca + 0*lda), *(pi1 + 0 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca + 0*lda), *(pi1 + 1 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca + 0*lda), *(pi1 + 2 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca + 0*lda), *(pi1 + 3 + 0*ldp) ); \ - PASTEMAC(ch,copys)( 
*(alpha1 + 4*inca + 0*lda), *(pi1 + 4 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca + 0*lda), *(pi1 + 5 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca + 0*lda), *(pi1 + 6 + 0*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca + 0*lda), *(pi1 + 7 + 0*ldp) ); \ -\ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca + 1*lda), *(pi1 + 0 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca + 1*lda), *(pi1 + 1 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca + 1*lda), *(pi1 + 2 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca + 1*lda), *(pi1 + 3 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca + 1*lda), *(pi1 + 4 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca + 1*lda), *(pi1 + 5 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca + 1*lda), *(pi1 + 6 + 1*ldp) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca + 1*lda), *(pi1 + 7 + 1*ldp) ); \ -\ - alpha1 += 2*lda; \ - pi1 += 2*ldp; \ - } \ -\ - for ( ; n_left != 0; --n_left ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), 
*(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_8xk, 8, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - 
dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 
+ 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, 
\ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_10xk, 10, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( 
*(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ + const dim_t mnr = PASTECH2(mnr0, _, ch); \ + const dim_t dfac = PASTECH2(bb0, _, ch); \ \ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ 
- PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_12xk, 12, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* 
restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ 
-\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); 
\ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC3( packm_14xk, 14, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ + ctype kappa_cast = *( ctype* )kappa; \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ - if ( cdim == mnr ) \ + if ( cdim == mnr && mnr != -1 ) \ { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + if ( inca == 1 ) \ { \ if ( bli_is_conj( conja ) ) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,copyjs)( 
*(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mn*1), *(pi1 + mn*dfac + d) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -1375,22 +78,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - 
PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mn*1), *(pi1 + mn*dfac + d) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -1403,22 +94,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -1428,22 +107,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ for ( dim_t k = n; k != 0; --k ) \ { \ - PASTEMAC(ch,scal2s)( 
*kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \ \ alpha1 += lda; \ pi1 += ldp; \ @@ -1453,45 +120,25 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ else /* if ( cdim < mnr ) */ \ { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ + PASTEMAC(ch,scal2bbs_mxn) \ ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ + conja, \ cdim, \ n, \ kappa, \ a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ + p, dfac, ldp \ ); \ \ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ -\ - 
if ( n < n_max ) \ - { \ - const dim_t j = n; \ - const dim_t m_edge = mnr; \ - const dim_t n_edge = n_max - n; \ + const dim_t i = cdim; \ + /* use ldp instead of mnr, in case the latter is -1 \ + this may write extra zeros, but not too many \ + this also automatically accounts for dfac when \ + using set0s_mxn instead of set0bbs_mxn */ \ + const dim_t m_edge = ldp - cdim*dfac; \ + const dim_t n_edge = n_max; \ ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ + ctype* restrict p_edge = p_cast + (i )*dfac; \ \ PASTEMAC(ch,set0s_mxn) \ ( \ @@ -1500,211 +147,15 @@ void PASTEMAC3(ch,opname,arch,suf) \ p_edge, 1, ldp \ ); \ } \ -} - -INSERT_GENTFUNC_BASIC3( packm_16xk, 16, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, mnr, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conja, \ - pack_t schema, \ - dim_t cdim, \ - dim_t n, \ - dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ - ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict alpha1 = a; \ - ctype* restrict pi1 = p; \ -\ - if ( cdim == mnr ) \ - { \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copyjs)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +10*inca), *(pi1 
+10) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,copyjs)( *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,copys)( *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,copys)( *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,copys)( *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,copys)( *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,copys)( *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,copys)( *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,copys)( *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,copys)( *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,copys)( *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,copys)( *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,copys)( *(alpha1 +18*inca), *(pi1 +18) ); \ - 
PASTEMAC(ch,copys)( *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,copys)( *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,copys)( *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,copys)( *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,copys)( *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,scal2js)( 
*kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 0*inca), *(pi1 + 0) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 1*inca), *(pi1 + 1) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 2*inca), *(pi1 + 2) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 3*inca), *(pi1 + 3) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 4*inca), *(pi1 + 4) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 5*inca), *(pi1 + 5) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 6*inca), *(pi1 + 6) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 7*inca), *(pi1 + 7) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 8*inca), *(pi1 + 8) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 + 9*inca), *(pi1 + 9) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +10*inca), *(pi1 +10) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +11*inca), *(pi1 +11) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +12*inca), *(pi1 +12) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +13*inca), *(pi1 +13) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +14*inca), *(pi1 +14) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +15*inca), *(pi1 +15) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +16*inca), *(pi1 +16) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +17*inca), *(pi1 +17) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +18*inca), *(pi1 +18) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +19*inca), *(pi1 +19) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +20*inca), *(pi1 +20) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +21*inca), *(pi1 +21) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(alpha1 +22*inca), *(pi1 +22) ); \ - PASTEMAC(ch,scal2s)( *kappa_cast, 
*(alpha1 +23*inca), *(pi1 +23) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC2(ch,scal2m,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - ( trans_t )conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* if ( cdim < mnr ) */ \ - { \ - const dim_t i = cdim; \ - const dim_t m_edge = mnr - cdim; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*1; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ - } \ \ if ( n < n_max ) \ { \ const dim_t j = n; \ - const dim_t m_edge = mnr; \ + /* use ldp instead of mnr, in case the latter is -1 \ + this may write extra zeros, but not too many \ + this also automatically accounts for dfac when \ + using set0s_mxn instead of set0bbs_mxn */ \ + const dim_t m_edge = ldp; \ const dim_t n_edge = n_max - n; \ ctype* restrict p_cast = p; \ ctype* restrict p_edge = p_cast + (j )*ldp; \ @@ -1718,5 +169,6 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNC_BASIC3( packm_24xk, 24, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c index 00dc02eb4d..a914e07660 100644 --- a/ref_kernels/1m/bli_unpackm_cxk_ref.c +++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c @@ -35,815 +35,100 @@ #include "blis.h" #undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ \ void PASTEMAC3(ch,opname,arch,suf) \ ( \ - conj_t conjp, \ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, 
\ - cntx_t* restrict cntx \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + ctype* restrict p, inc_t ldp, \ + cntx_t* restrict cntx \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ + const dim_t mnr = PASTECH2(mnr0, _, ch); \ + /* It's not clear if unpack needs to care about BB storage... 
*/ \ + const dim_t dfac = PASTECH2(bb0, _, ch); \ \ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_2xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 
3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_4xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ \ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ + if ( 
cdim == mnr && mnr != -1 ) \ { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ + if ( inca == 1 ) \ { \ - for ( ; n != 0; --n ) \ + if ( bli_is_conj( conja ) ) \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; mn++ ) \ + PASTEMAC(ch,scal2js)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*1) ); \ \ - pi1 += ldp; \ - alpha1 += lda; \ + alpha1 += lda; \ + pi1 += ldp; \ + } \ } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_6xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { 
\ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ + else \ { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; mn++ ) \ + PASTEMAC(ch,scal2s)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*1) ); \ \ - pi1 += ldp; \ - alpha1 += lda; \ + alpha1 += lda; \ + pi1 += ldp; \ + } \ } \ } \ else \ { \ - for ( ; n != 0; --n ) \ + if ( bli_is_conj( conja ) ) \ { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; mn++ ) \ + PASTEMAC(ch,scal2js)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \ \ - pi1 += ldp; \ - alpha1 += lda; \ + alpha1 += lda; \ + pi1 += ldp; \ + } \ } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 
2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ + else \ { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + PRAGMA_SIMD \ + for ( dim_t mn = 0; mn < mnr; mn++ ) \ + PASTEMAC(ch,scal2s)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \ \ - pi1 += ldp; \ - alpha1 += lda; \ + alpha1 += lda; \ + pi1 += ldp; \ + } \ } \ } \ } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_8xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ + else /* if ( cdim < mnr ) */ \ { \ - if ( bli_is_conj( 
conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), 
*(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_10xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 
1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, 
*(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_12xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ 
- void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - 
PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 
+ 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC2( unpackm_14xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - conj_t conjp, \ - dim_t n, \ - void* restrict kappa, \ - void* restrict p, inc_t ldp, \ - void* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ - ) \ -{ \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict pi1 = p; \ - ctype* restrict alpha1 = a; \ -\ - if ( PASTEMAC(ch,eq1)( *kappa_cast ) ) \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 10), 
*(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC2(ch,ch,copyjs)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC2(ch,ch,copys)( *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conjp ) ) \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - 
PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 12), *(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2js)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ - else \ - { \ - for ( ; n != 0; --n ) \ - { \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 0), *(alpha1 + 0*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 1), *(alpha1 + 1*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 2), *(alpha1 + 2*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 3), *(alpha1 + 3*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 4), *(alpha1 + 4*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 5), *(alpha1 + 5*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 6), *(alpha1 + 6*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 7), *(alpha1 + 7*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 8), *(alpha1 + 8*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 9), *(alpha1 + 9*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 10), *(alpha1 + 10*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 11), *(alpha1 + 11*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 12), 
*(alpha1 + 12*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 13), *(alpha1 + 13*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 14), *(alpha1 + 14*inca) ); \ - PASTEMAC3(ch,ch,ch,scal2s)( *kappa_cast, *(pi1 + 15), *(alpha1 + 15*inca) ); \ -\ - pi1 += ldp; \ - alpha1 += lda; \ - } \ - } \ + PASTEMAC(ch,scal2s_mxn) \ + ( \ + conja, \ + cdim, \ + n, \ + kappa, \ + p, dfac, ldp, \ + a, inca, lda \ + ); \ } \ } -INSERT_GENTFUNC_BASIC2( unpackm_16xk, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( unpackm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( unpackm_nrxk, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/3/bb/bli_gemmbb_ref.c b/ref_kernels/3/bb/bli_gemmbb_ref.c deleted file mode 100644 index 4c75c064ce..0000000000 --- a/ref_kernels/3/bb/bli_gemmbb_ref.c +++ /dev/null @@ -1,141 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ - const inc_t cs_b = packnr / nr; \ -\ - ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const inc_t rs_ab = 1; \ - const inc_t cs_ab = mr; \ -\ - dim_t l, j, i; \ -\ - ctype ai; \ - ctype bj; \ -\ -\ - /* Initialize the accumulator elements in ab to zero. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,set0s)( *(ab + i) ); \ - } \ -\ - /* Perform a series of k rank-1 updates into ab. */ \ - for ( l = 0; l < k; ++l ) \ - { \ - ctype* restrict abij = ab; \ -\ - /* In an optimized implementation, these two loops over MR and NR - are typically fully unrolled. */ \ - for ( j = 0; j < n; ++j ) \ - { \ - bj = *(b + j*cs_b); \ -\ - for ( i = 0; i < m; ++i ) \ - { \ - ai = *(a + i); \ -\ - PASTEMAC(ch,dots)( ai, bj, *abij ); \ -\ - abij += rs_ab; \ - } \ - } \ -\ - a += cs_a; \ - b += rs_b; \ - } \ -\ - /* Scale the result in ab by alpha. */ \ - for ( i = 0; i < m * n; ++i ) \ - { \ - PASTEMAC(ch,scals)( *alpha, *(ab + i) ); \ - } \ -\ - /* If beta is zero, overwrite c with the scaled result in ab. Otherwise, - scale by beta and then add the scaled redult in ab. */ \ - if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - PASTEMAC(ch,copys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - c, rs_c, cs_c ); \ - } \ - else \ - { \ - PASTEMAC(ch,xpbys_mxn)( m, \ - n, \ - ab, rs_ab, cs_ab, \ - beta, \ - c, rs_c, cs_c ); \ - } \ -} - -INSERT_GENTFUNC_BASIC2( gemmbb, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) - diff --git a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c b/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c deleted file mode 100644 index e40b1b4cca..0000000000 --- a/ref_kernels/3/bb/bli_gemmtrsmbb_ref.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. 
- -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a1x, \ - ctype* restrict a11, \ - ctype* restrict bx1, \ - ctype* restrict b11, \ - ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. */ \ - const inc_t cs_b = packnr / nr; \ -/* -printf( "bli_gemmtrsmbb_ref(): cs_b = %d\n", (int)cs_b ); \ -printf( "bli_gemmtrsmbb_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ -*/ \ -\ - ctype* minus_one = PASTEMAC(ch,m1); \ -\ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ - PASTECH(ch,trsm_ukr_ft) \ - trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \ -\ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ - (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \ -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ -*/ \ -\ - /* lower: b11 = alpha * b11 - a10 * b01; */ \ - /* upper: b11 = alpha * b11 - a12 * b21; */ \ - gemm_ukr \ - ( \ - mr, \ - nr, \ - k, \ - minus_one, \ - a1x, \ - bx1, \ - alpha, \ - b11, rs_b, cs_b, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ -*/ \ -\ - /* b11 = inv(a11) * b11; - c11 = b11; */ \ - trsm_ukr \ - ( \ - a11, \ - b11, \ - c11, rs_c, cs_c, \ - data, \ - cntx \ - ); \ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \ - (double*)b11, rs_b, 1, "%5.2f", "" ); \ -*/ \ -\ - /* 
Broadcast the elements of the updated b11 submatrix to their - duplicated neighbors. */ \ - PASTEMAC(ch,bcastbbs_mxn) \ - ( \ - mr, \ - nr, \ - b11, rs_b, cs_b \ - ); \ -\ -/* -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \ - ( double* )b01, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ -PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_i after", k+3, 8, \ - ( double* )b01 + 1, 2*PASTEMAC(ch,packnr), 2, "%4.1f", "" ); \ -*/ \ -} - -INSERT_GENTFUNC_BASIC3( gemmtrsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_L_UKR ) -INSERT_GENTFUNC_BASIC3( gemmtrsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, BLIS_TRSM_U_UKR ) - diff --git a/ref_kernels/3/bb/bli_trsmbb_ref.c b/ref_kernels/3/bb/bli_trsmbb_ref.c deleted file mode 100644 index e3f5500ccb..0000000000 --- a/ref_kernels/3/bb/bli_trsmbb_ref.c +++ /dev/null @@ -1,214 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -// An implementation that indexes through B with the assumption that all -// elements were broadcast (duplicated) by a factor of NP/NR. - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ - const inc_t cs_b = packnr / nr; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = iter; \ - n_behind = i; \ -\ - ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ - ctype* restrict a10t = a + (i )*rs_a + (0 )*cs_a; \ - ctype* restrict B0 = b + (0 )*rs_b + (0 )*cs_b; \ - ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a10t * B0; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype* restrict b01 = B0 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype beta11c = *beta11; \ - ctype rho11; \ -\ - /* beta11 = beta11 - a10t * b01; */ \ - PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype* restrict alpha10 = a10t + (l )*cs_a; \ - ctype* restrict beta01 = b01 + (l )*rs_b; \ -\ - PASTEMAC(ch,axpys)( *alpha10, *beta01, rho11 ); \ - } \ - PASTEMAC(ch,subs)( rho11, beta11c ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: When preinversion is enabled, the INVERSE of alpha11 - (1.0/alpha11) is stored during packing instead alpha11 so we - can multiply rather than divide. When preinversion is disabled, - alpha11 is stored and division happens below explicitly. */ \ - PASTEMAC(ch,scals)( *alpha11, beta11c ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ -\ - /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ - } \ - } \ -} - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION -INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) -#else -INSERT_GENTFUNC_BASIC3( trsmbb_l, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) -#endif - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ -\ -void PASTEMAC3(ch,opname,arch,suf) \ - ( \ - ctype* restrict a, \ - ctype* restrict b, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ -\ - const dim_t m = mr; \ - const dim_t n = nr; \ -\ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ -\ - const inc_t rs_b = packnr; \ -\ - /* Assume that the degree of duplication is equal to packnr / nr. 
*/ \ - const inc_t cs_b = packnr / nr; \ -\ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ - { \ - i = m - iter - 1; \ - n_behind = iter; \ -\ - ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ - ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ - ctype* restrict b1 = b + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B2 = b + (i+1)*rs_b + (0 )*cs_b; \ -\ - /* b1 = b1 - a12t * B2; */ \ - /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ - { \ - ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype beta11c = *beta11; \ - ctype rho11; \ -\ - /* beta11 = beta11 - a12t * b21; */ \ - PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ - { \ - ctype* restrict alpha12 = a12t + (l )*cs_a; \ - ctype* restrict beta21 = b21 + (l )*rs_b; \ -\ - PASTEMAC(ch,axpys)( *alpha12, *beta21, rho11 ); \ - } \ - PASTEMAC(ch,subs)( rho11, beta11c ); \ -\ - /* beta11 = beta11 / alpha11; */ \ - /* NOTE: When preinversion is enabled, the INVERSE of alpha11 - (1.0/alpha11) is stored during packing instead alpha11 so we - can multiply rather than divide. When preinversion is disabled, - alpha11 is stored and division happens below explicitly. */ \ - PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ -\ - /* Output final result to matrix c. */ \ - PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ -\ - /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ - } \ - } \ -} - -#ifdef BLIS_ENABLE_TRSM_PREINVERSION -INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) -#else -INSERT_GENTFUNC_BASIC3( trsmbb_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) -#endif - diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index 0462af8539..968ca39979 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -59,9 +59,11 @@ static void PASTEMAC3(ch,opname,arch,suf) \ const inc_t packmr = bli_cntx_get_blksz_max_dt( dt, BLIS_MR, cntx ); \ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ @@ -90,11 +92,11 @@ static void PASTEMAC3(ch,opname,arch,suf) \ are typically fully unrolled. */ \ for ( j = 0; j < n; ++j ) \ { \ - bj = *(b + j); \ + bj = *(b + j*cs_b); \ \ for ( i = 0; i < m; ++i ) \ { \ - ai = *(a + i); \ + ai = *(a + i*rs_a); \ \ PASTEMAC(ch,dots)( ai, bj, *abij ); \ \ @@ -131,7 +133,7 @@ static void PASTEMAC3(ch,opname,arch,suf) \ } \ } -INSERT_GENTFUNC_BASIC2( gemm_unr, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC2( gemm_gen, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) // An implementation that attempts to facilitate emission of vectorized // instructions via constant loop bounds + #pragma omp simd directives. 
@@ -161,7 +163,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ if ( mr == -1 || nr == -1 ) \ { \ - PASTEMAC3(ch,gemm_unr,arch,suf) \ + PASTEMAC3(ch,gemm_gen,arch,suf) \ ( \ m, \ n, \ @@ -183,8 +185,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ const inc_t rs_ab = nr; \ const inc_t cs_ab = 1; \ \ + const inc_t rs_a = PASTECH(BLIS_BBM_,ch); \ const inc_t cs_a = PASTECH(BLIS_PACKMR_,ch); \ const inc_t rs_b = PASTECH(BLIS_PACKNR_,ch); \ + const inc_t cs_b = PASTECH(BLIS_BBN_,ch); \ \ \ /* Initialize the accumulator elements in ab to zero. */ \ @@ -204,8 +208,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ PASTEMAC(ch,dots) \ ( \ - a[ i ], \ - b[ j ], \ + a[ i*rs_a ], \ + b[ j*cs_b ], \ ab[ i*rs_ab + j*cs_ab ] \ ); \ } \ diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 03f343a336..481a350cb6 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -34,6 +34,9 @@ #include "blis.h" +// An implementation that indexes through B with the assumption that all +// elements were broadcast (duplicated) by a factor of NP/NR. 
+ #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, trsmkerid ) \ \ @@ -54,10 +57,15 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ const inc_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const inc_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ +\ const inc_t packnr = bli_cntx_get_blksz_max_dt( dt, BLIS_NR, cntx ); \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ +/* +printf( "bli_gemmtrsm_ref(): cs_b = %d\n", (int)cs_b ); \ +printf( "bli_gemmtrsm_ref(): k nr = %d %d\n", (int)k, (int)nr ); \ +*/ \ \ ctype* minus_one = PASTEMAC(ch,m1); \ \ @@ -65,6 +73,13 @@ void PASTEMAC3(ch,opname,arch,suf) \ gemm_ukr = bli_cntx_get_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ PASTECH(ch,trsm_ukr_ft) \ trsm_ukr = bli_cntx_get_ukr_dt( dt, trsmkerid, cntx ); \ +\ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b01", k, nr, \ + (double*)bx1, rs_b, cs_b, "%5.2f", "" ); \ +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ \ /* lower: b11 = alpha * b11 - a10 * b01; */ \ /* upper: b11 = alpha * b11 - a12 * b21; */ \ @@ -81,6 +96,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ data, \ cntx \ ); \ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ \ /* b11 = inv(a11) * b11; c11 = b11; */ \ @@ -92,6 +111,19 @@ void PASTEMAC3(ch,opname,arch,suf) \ data, \ cntx \ ); \ +/* +PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \ + (double*)b11, rs_b, 1, "%5.2f", "" ); \ +*/ \ +\ + /* Broadcast the elements of the updated b11 submatrix to their + duplicated neighbors. 
*/ \ + PASTEMAC(ch,bcastbbs_mxn) \ + ( \ + mr, \ + nr, \ + b11, rs_b, cs_b \ + ); \ \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \ diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c index 786f1129d0..cf80b2e19b 100644 --- a/ref_kernels/3/bli_trsm_ref.c +++ b/ref_kernels/3/bli_trsm_ref.c @@ -34,17 +34,8 @@ #include "blis.h" -#if 0 - -// An implementation that attempts to facilitate emission of vectorized -// instructions via constant loop bounds + #pragma omp simd directives. - -// (Deleted. See 'old' directory.) - -#else - -// An implementation that uses variable loop bounds (queried from the context) -// and makes no use of #pragma omp simd. +// An implementation that indexes through B with the assumption that all +// elements were broadcast (duplicated) by a factor of NP/NR. #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, arch, suf, diagop ) \ @@ -69,11 +60,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ @@ -114,7 +105,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ (1.0/alpha11) is stored during packing instead alpha11 so we can multiply rather than divide. When preinversion is disabled, alpha11 is stored and division happens below explicitly. */ \ - PASTEMAC(ch,diagop)( *alpha11, beta11c ); \ + PASTEMAC(ch,scals)( *alpha11, beta11c ); \ \ /* Output final result to matrix c. 
*/ \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ @@ -155,11 +146,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt, BLIS_BBM, cntx ); \ const inc_t cs_a = packmr; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ dim_t iter, i, j, l; \ dim_t n_behind; \ @@ -217,4 +208,3 @@ INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, scals ) INSERT_GENTFUNC_BASIC3( trsm_u, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX, invscals ) #endif -#endif diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 8cd4a9703c..53b9aa551f 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -107,60 +107,20 @@ // -- Level-1m (packm/unpackm) kernel prototype redefinitions ------------------ -#undef packm_2xk_ker_name -#define packm_2xk_ker_name GENARNAME(packm_2xk) -#undef packm_3xk_ker_name -#define packm_3xk_ker_name GENARNAME(packm_3xk) -#undef packm_4xk_ker_name -#define packm_4xk_ker_name GENARNAME(packm_4xk) -#undef packm_6xk_ker_name -#define packm_6xk_ker_name GENARNAME(packm_6xk) -#undef packm_8xk_ker_name -#define packm_8xk_ker_name GENARNAME(packm_8xk) -#undef packm_10xk_ker_name -#define packm_10xk_ker_name GENARNAME(packm_10xk) -#undef packm_12xk_ker_name -#define packm_12xk_ker_name GENARNAME(packm_12xk) -#undef packm_14xk_ker_name -#define packm_14xk_ker_name GENARNAME(packm_14xk) -#undef packm_16xk_ker_name -#define packm_16xk_ker_name GENARNAME(packm_16xk) -#undef packm_24xk_ker_name -#define packm_24xk_ker_name GENARNAME(packm_24xk) - -#undef unpackm_2xk_ker_name -#define unpackm_2xk_ker_name GENARNAME(unpackm_2xk) -#undef unpackm_4xk_ker_name -#define unpackm_4xk_ker_name GENARNAME(unpackm_4xk) -#undef unpackm_6xk_ker_name -#define unpackm_6xk_ker_name GENARNAME(unpackm_6xk) -#undef unpackm_8xk_ker_name -#define unpackm_8xk_ker_name 
GENARNAME(unpackm_8xk) -#undef unpackm_10xk_ker_name -#define unpackm_10xk_ker_name GENARNAME(unpackm_10xk) -#undef unpackm_12xk_ker_name -#define unpackm_12xk_ker_name GENARNAME(unpackm_12xk) -#undef unpackm_14xk_ker_name -#define unpackm_14xk_ker_name GENARNAME(unpackm_14xk) -#undef unpackm_16xk_ker_name -#define unpackm_16xk_ker_name GENARNAME(unpackm_16xk) - -#undef packm_2xk_1er_ker_name -#define packm_2xk_1er_ker_name GENARNAME(packm_2xk_1er) -#undef packm_4xk_1er_ker_name -#define packm_4xk_1er_ker_name GENARNAME(packm_4xk_1er) -#undef packm_6xk_1er_ker_name -#define packm_6xk_1er_ker_name GENARNAME(packm_6xk_1er) -#undef packm_8xk_1er_ker_name -#define packm_8xk_1er_ker_name GENARNAME(packm_8xk_1er) -#undef packm_10xk_1er_ker_name -#define packm_10xk_1er_ker_name GENARNAME(packm_10xk_1er) -#undef packm_12xk_1er_ker_name -#define packm_12xk_1er_ker_name GENARNAME(packm_12xk_1er) -#undef packm_14xk_1er_ker_name -#define packm_14xk_1er_ker_name GENARNAME(packm_14xk_1er) -#undef packm_16xk_1er_ker_name -#define packm_16xk_1er_ker_name GENARNAME(packm_16xk_1er) +#undef packm_mrxk_ker_name +#define packm_mrxk_ker_name GENARNAME(packm_mrxk) +#undef packm_nrxk_ker_name +#define packm_nrxk_ker_name GENARNAME(packm_nrxk) + +#undef packm_mrxk_1er_ker_name +#define packm_mrxk_1er_ker_name GENARNAME(packm_mrxk_1er) +#undef packm_nrxk_1er_ker_name +#define packm_nrxk_1er_ker_name GENARNAME(packm_nrxk_1er) + +#undef unpackm_mrxk_ker_name +#define unpackm_mrxk_ker_name GENARNAME(unpackm_mrxk) +#undef unpackm_nrxk_ker_name +#define unpackm_nrxk_ker_name GENARNAME(unpackm_nrxk) // Instantiate prototypes for above functions via the level-1m kernel API // template. 
@@ -272,18 +232,20 @@ void GENBARNAME(cntx_init) // -- Set blocksizes ------------------------------------------------------- - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 4, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); - bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 ); - bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 ); - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); - bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 ); + bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); + bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 ); + bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); + bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBM ], 1, 1, 1, 1 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBN ], 1, 1, 1, 1 ); // -- Set level-3 small/unpacked thresholds -------------------------------- @@ -305,20 +267,22 @@ void GENBARNAME(cntx_init) bli_cntx_set_blkszs ( cntx, - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - BLIS_KR, &blkszs[ 
BLIS_KR ], BLIS_KR, - BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2, - BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2, - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF, - BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, - BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, - BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, + BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, + BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, + BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR, + BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2, + BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2, + BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF, + BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, + BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, + BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, + BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, -1 ); @@ -417,37 +381,14 @@ void GENBARNAME(cntx_init) // -- Set level-1m (packm/unpackm) kernels --------------------------------- - // Initialize all packm kernel func_t entries to NULL. 
- for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } + gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ], packm_3xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name ); - - // Initialize all packm kernel func_t entries to NULL. 
- for ( i = BLIS_UNPACKM_0XK_KER; i <= BLIS_UNPACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ], packm_nrxk_1er_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_2XK_KER ], unpackm_2xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_4XK_KER ], unpackm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_6XK_KER ], unpackm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_8XK_KER ], unpackm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_10XK_KER ], unpackm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_12XK_KER ], unpackm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_14XK_KER ], unpackm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_16XK_KER ], unpackm_16xk_ker_name ); + gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ], unpackm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ], unpackm_nrxk_ker_name ); // -- Set level-3 small/unpacked handlers ---------------------------------- @@ -479,7 +420,6 @@ void GENBAINAME(cntx_init) ) { func_t* funcs; - dim_t i; // This function is designed to modify a copy of an existing native // context to enable computation via an induced method for complex @@ -530,37 +470,23 @@ void GENBAINAME(cntx_init) // -- Set induced method packm kernels ------------------------------------- - // Initialize all packm kernel func_t entries to NULL. 
- for ( i = BLIS_PACKM_0XK_KER; i <= BLIS_PACKM_31XK_KER; ++i ) - { - bli_func_init_null( &funcs[ i ] ); - } - if ( method == BLIS_1M ) { - gen_func_init_co( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_1er_ker_name ); } else // if ( method == BLIS_NAT ) { - gen_func_init( &funcs[ BLIS_PACKM_2XK_KER ], packm_2xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_3XK_KER ], packm_3xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_4XK_KER ], packm_4xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_6XK_KER ], packm_6xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_8XK_KER ], packm_8xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_10XK_KER ], packm_10xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_12XK_KER ], packm_12xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_14XK_KER ], packm_14xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_16XK_KER ], packm_16xk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_24XK_KER ], packm_24xk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); } + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ], packm_nrxk_1er_ker_name ); + + gen_func_init( &funcs[ 
BLIS_UNPACKM_MRXK_KER ], unpackm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ], unpackm_nrxk_ker_name ); + // -- Set induced method cache and register blocksizes --------------------- diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c index ceaadbe8ba..ee3d57dfee 100644 --- a/sandbox/gemmlike/bls_packm_cxk.c +++ b/sandbox/gemmlike/bls_packm_cxk.c @@ -55,7 +55,7 @@ void PASTECH2(bls_,ch,opname) \ kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = ( ukr_t )( BLIS_PACKM_0XK_KER + panel_dim_max ); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ From 68ffb78a94ac43d54da70a01d92f3725c13bc1d9 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 11 Feb 2022 09:16:55 -0600 Subject: [PATCH 07/32] Fix missing `_ROW_PREF`s. --- config/cortexa57/bli_cntx_init_cortexa57.c | 4 +- config/zen/bli_cntx_init_zen.c | 60 +++++++++++----------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/config/cortexa57/bli_cntx_init_cortexa57.c b/config/cortexa57/bli_cntx_init_cortexa57.c index 8c327d436e..676fc5ef5b 100644 --- a/config/cortexa57/bli_cntx_init_cortexa57.c +++ b/config/cortexa57/bli_cntx_init_cortexa57.c @@ -61,8 +61,8 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) cntx, // level-3 - BLIS_GEMM_UKR, BLIS_FLOAT, FALSE, - BLIS_GEMM_UKR, BLIS_DOUBLE, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, + BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, -1 ); diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 074d952252..5fcb03e84a 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -198,39 +198,39 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, // gemmsup - BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, TRUE, - BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, TRUE, - 
BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, TRUE, - BLIS_GEMMSUP_RCC_UKR, BLIS_DOUBLE, TRUE, - BLIS_GEMMSUP_CRR_UKR, BLIS_DOUBLE, TRUE, - BLIS_GEMMSUP_CRC_UKR, BLIS_DOUBLE, TRUE, - BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, TRUE, - BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, TRUE, - - BLIS_GEMMSUP_RRR_UKR, BLIS_FLOAT, TRUE, - BLIS_GEMMSUP_RRC_UKR, BLIS_FLOAT, TRUE, - BLIS_GEMMSUP_RCR_UKR, BLIS_FLOAT, TRUE, - BLIS_GEMMSUP_RCC_UKR, BLIS_FLOAT, TRUE, - BLIS_GEMMSUP_CRR_UKR, BLIS_FLOAT, TRUE, - BLIS_GEMMSUP_CRC_UKR, BLIS_FLOAT, TRUE, - BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, TRUE, - BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, #if 0 // NOTE: This set of kernels is likely broken and therefore disabled. 
- BLIS_GEMMSUP_RRR_UKR, BLIS_SCOMPLEX, TRUE, - BLIS_GEMMSUP_RCR_UKR, BLIS_SCOMPLEX, TRUE, - BLIS_GEMMSUP_CRR_UKR, BLIS_SCOMPLEX, TRUE, - BLIS_GEMMSUP_RCC_UKR, BLIS_SCOMPLEX, TRUE, - BLIS_GEMMSUP_CCR_UKR, BLIS_SCOMPLEX, TRUE, - BLIS_GEMMSUP_CCC_UKR, BLIS_SCOMPLEX, TRUE, - - BLIS_GEMMSUP_RRR_UKR, BLIS_DCOMPLEX, TRUE, - BLIS_GEMMSUP_RCR_UKR, BLIS_DCOMPLEX, TRUE, - BLIS_GEMMSUP_CRR_UKR, BLIS_DCOMPLEX, TRUE, - BLIS_GEMMSUP_RCC_UKR, BLIS_DCOMPLEX, TRUE, - BLIS_GEMMSUP_CCR_UKR, BLIS_DCOMPLEX, TRUE, - BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_SCOMPLEX, TRUE, + + BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CRR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_RCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, + BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, #endif -1 From 9f9700b9f1fe23aff138d6e81b46bf46df710032 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Sat, 12 Feb 2022 10:34:21 -0600 Subject: [PATCH 08/32] Merge cosmetic changes and fix packm_1er kernel. This change also includes a new level-0 macro: set0s_edge, which helps to simplify the packm kernels. 
--- common.mk | 2 +- config/a64fx/bli_cntx_init_a64fx.c | 14 +- config/armsve/bli_cntx_init_armsve.c | 24 +- config/bgq/bli_cntx_init_bgq.c | 12 +- config/bulldozer/bli_cntx_init_bulldozer.c | 12 +- config/cortexa15/bli_cntx_init_cortexa15.c | 12 +- config/cortexa53/bli_cntx_init_cortexa53.c | 12 +- config/cortexa57/bli_cntx_init_cortexa57.c | 12 +- config/cortexa9/bli_cntx_init_cortexa9.c | 12 +- config/excavator/bli_cntx_init_excavator.c | 12 +- config/firestorm/bli_cntx_init_firestorm.c | 22 +- config/haswell/bli_cntx_init_haswell.c | 16 +- config/knc/bli_cntx_init_knc.c | 12 +- config/knl/bli_cntx_init_knl.c | 12 +- config/penryn/bli_cntx_init_penryn.c | 12 +- config/piledriver/bli_cntx_init_piledriver.c | 12 +- config/power10/bli_cntx_init_power10.c | 10 +- config/power7/bli_cntx_init_power7.c | 12 +- config/power9/bli_cntx_init_power9.c | 10 +- .../sandybridge/bli_cntx_init_sandybridge.c | 12 +- config/skx/bli_cntx_init_skx.c | 6 +- .../steamroller/bli_cntx_init_steamroller.c | 12 +- config/template/bli_cntx_init_template.c | 16 +- config/thunderx2/bli_cntx_init_thunderx2.c | 12 +- config/zen/bli_cntx_init_zen.c | 24 +- config/zen2/bli_cntx_init_zen2.c | 31 +- config/zen3/bli_cntx_init_zen3.c | 31 +- frame/1/bli_l1v_tapi.c | 112 +++--- frame/1d/bli_l1d_tapi.c | 157 +++++---- frame/1f/bli_l1f_tapi.c | 104 +++--- frame/1m/packm/bli_packm_cxk.c | 28 +- frame/base/bli_cntx.c | 186 ++++------ frame/base/bli_cntx.h | 163 ++++----- frame/base/bli_gks.c | 6 +- frame/base/bli_gks.h | 2 +- frame/include/bli_kernel_macro_defs.h | 6 +- frame/include/bli_misc_macro_defs.h | 6 + frame/include/bli_param_macro_defs.h | 24 +- frame/include/bli_scalar_macro_defs.h | 7 +- frame/include/bli_type_defs.h | 41 +-- frame/include/level0/bli_set0s_edge.h | 73 ++++ ref_kernels/1m/bli_packm_cxk_1er_ref.c | 324 ++++++------------ ref_kernels/1m/bli_packm_cxk_ref.c | 127 ++----- ref_kernels/1m/bli_unpackm_cxk_ref.c | 78 ++--- ref_kernels/bli_cntx_ref.c | 30 +- 45 files changed, 863 
insertions(+), 997 deletions(-) create mode 100644 frame/include/level0/bli_set0s_edge.h diff --git a/common.mk b/common.mk index 6ce7abc79c..8d77e4c5f6 100644 --- a/common.mk +++ b/common.mk @@ -129,7 +129,7 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ - -DBLIS_IN_KERNEL=1 \ + -DBLIS_IN_KERNEL=1 \ -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index d6b95030b3..dd920bcec0 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -49,17 +49,17 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, - // packm + // packm BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -67,13 +67,13 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -90,14 +90,14 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); // Set A64FX cache sector sizes for each PE/CMG diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index 02ab7a35f5..ab88de13ed 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -58,14 +58,14 @@ void bli_cntx_init_armsve( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 // These are vector-length agnostic kernels. Yet knowing mr is required at runtime. BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armsve_asm_2vx10_unindexed, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armsve_asm_2vx10_unindexed, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_armsve_asm_2vx10_unindexed, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_armsve_asm_2vx10_unindexed, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -73,31 +73,35 @@ void bli_cntx_init_armsve( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Set VL-specific packing routines if applicable. - if (m_r_d==16) + if ( m_r_d == 16 ) + { bli_cntx_set_ukrs ( cntx, BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_16xk, BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve512_asm_10xk, - -1 + BLIS_VA_END ); - else if (m_r_d==8) + } + else if ( m_r_d == 8 ) + { bli_cntx_set_ukrs ( cntx, BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armsve256_int_8xk, - -1 + BLIS_VA_END ); + } // Initialize level-3 blocksize objects with architecture-specific values. 
// s d c z @@ -113,14 +117,14 @@ void bli_cntx_init_armsve( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c index 03f9fd989e..d3871d8f77 100644 --- a/config/bgq/bli_cntx_init_bgq.c +++ b/config/bgq/bli_cntx_init_bgq.c @@ -48,11 +48,11 @@ void bli_cntx_init_bgq( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bgq_int_8x8, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bgq_int_4x4, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -60,11 +60,11 @@ void bli_cntx_init_bgq( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -81,14 +81,14 @@ void bli_cntx_init_bgq( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/bulldozer/bli_cntx_init_bulldozer.c b/config/bulldozer/bli_cntx_init_bulldozer.c index 6d9a230ccc..5b056f591f 100644 --- a/config/bulldozer/bli_cntx_init_bulldozer.c +++ b/config/bulldozer/bli_cntx_init_bulldozer.c @@ -48,13 +48,13 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_bulldozer_asm_8x8_fma4, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_bulldozer_asm_4x6_fma4, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_bulldozer_asm_8x4_fma4, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_bulldozer_asm_4x4_fma4, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -62,13 +62,13 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -85,14 +85,14 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/cortexa15/bli_cntx_init_cortexa15.c b/config/cortexa15/bli_cntx_init_cortexa15.c index 928d8fee46..28ebdef71b 100644 --- a/config/cortexa15/bli_cntx_init_cortexa15.c +++ b/config/cortexa15/bli_cntx_init_cortexa15.c @@ -48,11 +48,11 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -60,11 +60,11 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -89,14 +89,14 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/cortexa53/bli_cntx_init_cortexa53.c b/config/cortexa53/bli_cntx_init_cortexa53.c index e0e72c4f36..4957de04e5 100644 --- a/config/cortexa53/bli_cntx_init_cortexa53.c +++ b/config/cortexa53/bli_cntx_init_cortexa53.c @@ -48,11 +48,11 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, - -1 + BLIS_VA_END ); // Update the context with storage preferences. 
@@ -60,11 +60,11 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -81,14 +81,14 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/cortexa57/bli_cntx_init_cortexa57.c b/config/cortexa57/bli_cntx_init_cortexa57.c index 676fc5ef5b..28558bc522 100644 --- a/config/cortexa57/bli_cntx_init_cortexa57.c +++ b/config/cortexa57/bli_cntx_init_cortexa57.c @@ -48,11 +48,11 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -60,11 +60,11 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -81,14 +81,14 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c index 4751242e16..6af3ff91ce 100644 --- a/config/cortexa9/bli_cntx_init_cortexa9.c +++ b/config/cortexa9/bli_cntx_init_cortexa9.c @@ -48,11 +48,11 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv7a_int_4x4, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv7a_int_4x4, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -60,11 +60,11 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -81,14 +81,14 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/excavator/bli_cntx_init_excavator.c b/config/excavator/bli_cntx_init_excavator.c index 351b4bc63a..d36865b216 100644 --- a/config/excavator/bli_cntx_init_excavator.c +++ b/config/excavator/bli_cntx_init_excavator.c @@ -48,13 +48,13 @@ void bli_cntx_init_excavator( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -62,13 +62,13 @@ void bli_cntx_init_excavator( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -85,14 +85,14 @@ void bli_cntx_init_excavator( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index 913540955c..8e4d0088d5 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -48,17 +48,17 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, - // packm + // packm BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_8xk, BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_armv8a_int_12xk, BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_6xk, BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_armv8a_int_8xk, - // gemmsup + // gemmsup BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_armv8a_asm_6x8m, BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8m, @@ -68,7 +68,7 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) BLIS_GEMMSUP_CCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, BLIS_GEMMSUP_CCC_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_armv8a_asm_6x8n, - -1 + BLIS_VA_END ); // Update the context with storage preferences. 
@@ -76,11 +76,11 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - // gemmsup + // gemmsup BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, @@ -90,7 +90,7 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -122,26 +122,26 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - // sup thresholds + // sup thresholds BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // level-3 sup + // level-3 sup BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - -1 + BLIS_VA_END ); } diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index 4e6ca280c4..fe3b451475 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -70,7 +70,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, #if 1 - // packm + // packm BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, @@ -117,7 +117,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) 
BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - // gemmsup + // gemmsup BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, @@ -136,7 +136,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_GEMMSUP_CCR_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -164,7 +164,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, - // gemmsup + // gemmsup BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, @@ -183,7 +183,7 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -242,19 +242,19 @@ void bli_cntx_init_haswell( cntx_t* cntx ) BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - // gemmsup thresholds + // gemmsup thresholds BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // level-3 sup + // level-3 sup BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - -1 + BLIS_VA_END ); } diff --git a/config/knc/bli_cntx_init_knc.c b/config/knc/bli_cntx_init_knc.c index 5fe47f8af7..8f615588c6 100644 --- a/config/knc/bli_cntx_init_knc.c +++ b/config/knc/bli_cntx_init_knc.c @@ -48,10 +48,10 @@ void bli_cntx_init_knc( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knc_asm_30x8, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -59,10 +59,10 @@ void bli_cntx_init_knc( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -81,14 +81,14 @@ void bli_cntx_init_knc( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index 3fa11b4691..87fa3176ab 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -48,11 +48,11 @@ void bli_cntx_init_knl( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_knl_asm_24x16, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_knl_asm_24x8, - // packm + // packm BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_24xk, BLIS_PACKM_NRXK_KER, BLIS_DOUBLE, bli_dpackm_knl_asm_8xk, @@ -96,7 +96,7 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -104,11 +104,11 @@ void bli_cntx_init_knl( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -140,7 +140,7 @@ void bli_cntx_init_knl( cntx_t* cntx ) BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - -1 + BLIS_VA_END ); } diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c index 12a36eabb5..964438e834 100644 --- a/config/penryn/bli_cntx_init_penryn.c +++ b/config/penryn/bli_cntx_init_penryn.c @@ -48,7 +48,7 @@ void bli_cntx_init_penryn( cntx_t* cntx ) ( cntx, - //level-3 + //level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_penryn_asm_8x4, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_penryn_asm_4x4, //BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_penryn_asm_8x4, @@ -56,7 +56,7 @@ void bli_cntx_init_penryn( cntx_t* cntx ) BLIS_GEMMTRSM_L_UKR, BLIS_DOUBLE, bli_dgemmtrsm_l_penryn_asm_4x4, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_penryn_asm_4x4, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -64,7 +64,7 @@ void bli_cntx_init_penryn( cntx_t* cntx ) ( cntx, - //level-3 + //level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, //BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, @@ -72,7 +72,7 @@ void bli_cntx_init_penryn( cntx_t* cntx ) BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -89,14 +89,14 @@ void bli_cntx_init_penryn( cntx_t* cntx ) ( cntx, - // level-1 + // level-1 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/piledriver/bli_cntx_init_piledriver.c b/config/piledriver/bli_cntx_init_piledriver.c index fe78d51423..1c9a96fd9e 100644 --- a/config/piledriver/bli_cntx_init_piledriver.c +++ b/config/piledriver/bli_cntx_init_piledriver.c @@ -48,13 +48,13 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -62,13 +62,13 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -85,14 +85,14 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 73b9dabc73..12d9f51c6c 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -48,11 +48,11 @@ void bli_cntx_init_power10( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_power10_mma_8x16, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power10_mma_8x8, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -60,7 +60,7 @@ void bli_cntx_init_power10( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, @@ -74,7 +74,7 @@ void bli_cntx_init_power10( cntx_t* cntx ) BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // s d c z @@ -98,7 +98,7 @@ void bli_cntx_init_power10( cntx_t* cntx ) BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c index 0c83ba8821..d5ffe7dcfa 100644 --- a/config/power7/bli_cntx_init_power7.c +++ b/config/power7/bli_cntx_init_power7.c @@ -48,10 +48,10 @@ void bli_cntx_init_power7( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power7_int_8x4, - -1 + BLIS_VA_END ); // Update the context with storage preferences. 
@@ -59,10 +59,10 @@ void bli_cntx_init_power7( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -79,14 +79,14 @@ void bli_cntx_init_power7( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index ef1f947737..9f2d67632e 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -48,10 +48,10 @@ void bli_cntx_init_power9( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_power9_asm_12x6, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -59,7 +59,7 @@ void bli_cntx_init_power9( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, @@ -73,7 +73,7 @@ void bli_cntx_init_power9( cntx_t* cntx ) BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); @@ -98,6 +98,6 @@ void bli_cntx_init_power9( cntx_t* cntx ) BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/sandybridge/bli_cntx_init_sandybridge.c b/config/sandybridge/bli_cntx_init_sandybridge.c index fa92a6d760..0697a3351c 100644 --- a/config/sandybridge/bli_cntx_init_sandybridge.c +++ b/config/sandybridge/bli_cntx_init_sandybridge.c @@ -48,13 +48,13 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_sandybridge_asm_8x8, BLIS_GEMM_UKR, BLIS_DOUBLE, 
bli_dgemm_sandybridge_asm_8x4, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_sandybridge_asm_8x4, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_sandybridge_asm_4x4, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -62,13 +62,13 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -85,14 +85,14 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index 14ffc0680a..3af58b38d2 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -92,7 +92,7 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, #endif - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -104,7 +104,7 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT , FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -135,7 +135,7 @@ void bli_cntx_init_skx( cntx_t* cntx ) BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - -1 + BLIS_VA_END ); } diff --git a/config/steamroller/bli_cntx_init_steamroller.c b/config/steamroller/bli_cntx_init_steamroller.c index ce8870d854..4b4ecdf4e6 100644 --- a/config/steamroller/bli_cntx_init_steamroller.c +++ b/config/steamroller/bli_cntx_init_steamroller.c @@ -48,13 +48,13 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_piledriver_asm_16x3, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_piledriver_asm_8x3, BLIS_GEMM_UKR, BLIS_SCOMPLEX, bli_cgemm_piledriver_asm_4x2, BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_piledriver_asm_2x2, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -62,13 +62,13 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_SCOMPLEX, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -85,14 +85,14 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c index fcc0223f56..4bacc5d63c 100644 --- a/config/template/bli_cntx_init_template.c +++ b/config/template/bli_cntx_init_template.c @@ -49,25 +49,25 @@ void bli_cntx_init_template( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_DCOMPLEX, bli_zgemm_template_noopt, BLIS_GEMMTRSM_L_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_l_template_noopt, BLIS_GEMMTRSM_U_UKR, BLIS_DCOMPLEX, bli_zgemmtrsm_u_template_noopt, BLIS_TRSM_L_UKR, BLIS_DCOMPLEX, bli_ztrsm_l_template_noopt, BLIS_TRSM_U_UKR, BLIS_DCOMPLEX, bli_ztrsm_u_template_noopt, - // level-1f + // level-1f BLIS_AXPY2V_KER, BLIS_DCOMPLEX, bli_zaxpy2v_template_noopt, BLIS_DOTAXPYV_KER, BLIS_DCOMPLEX, bli_zdotaxpyv_template_noopt, BLIS_AXPYF_KER, BLIS_DCOMPLEX, bli_zaxpyf_template_noopt, BLIS_DOTXF_KER, BLIS_DCOMPLEX, bli_zdotxf_template_noopt, BLIS_DOTXAXPYF_KER, BLIS_DCOMPLEX, bli_zdotxaxpyf_template_noopt, - // level-1v + // level-1v BLIS_AXPYV_KER, BLIS_DCOMPLEX, bli_zaxpyv_template_noopt, BLIS_DOTV_KER, BLIS_DCOMPLEX, bli_zdotv_template_noopt, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -75,14 +75,14 @@ void bli_cntx_init_template( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, BLIS_GEMMTRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, BLIS_TRSM_L_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, BLIS_TRSM_U_UKR_ROW_PREF, BLIS_DCOMPLEX, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -99,14 +99,14 @@ void bli_cntx_init_template( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/thunderx2/bli_cntx_init_thunderx2.c b/config/thunderx2/bli_cntx_init_thunderx2.c index 3c58bb1fa7..9d1af2c99c 100644 --- a/config/thunderx2/bli_cntx_init_thunderx2.c +++ b/config/thunderx2/bli_cntx_init_thunderx2.c @@ -48,11 +48,11 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR, BLIS_FLOAT, bli_sgemm_armv8a_asm_8x12, BLIS_GEMM_UKR, BLIS_DOUBLE, bli_dgemm_armv8a_asm_6x8, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -60,11 +60,11 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_GEMM_UKR_ROW_PREF, BLIS_FLOAT, FALSE, BLIS_GEMM_UKR_ROW_PREF, BLIS_DOUBLE, FALSE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -81,14 +81,14 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) ( cntx, - // level-3 + // level-3 BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - -1 + BLIS_VA_END ); } diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 5fcb03e84a..2b80c37838 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -65,7 +65,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, - // gemmsup + // gemmsup BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, @@ -111,7 +111,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, #endif - // packm + // packm BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, @@ -175,7 +175,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, #endif - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -197,7 +197,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, - // gemmsup + // gemmsup BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, @@ -233,7 +233,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, #endif - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -315,29 +315,33 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - // sup thresholds + // sup thresholds BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // gemmsup + // gemmsup BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - -1 + BLIS_VA_END ); // ------------------------------------------------------------------------- +#if 0 // Initialize the context with the sup handlers. bli_cntx_set_l3_sup_handlers ( - 1, + cntx, + BLIS_GEMM, bli_gemmsup_ref, //BLIS_GEMMT, bli_gemmtsup_ref, - cntx + + BLIS_VA_END ); +#endif } diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index 0e64005ec7..ef16fef721 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -63,7 +63,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, - // level-3 sup + // level-3 sup BLIS_GEMMSUP_RRR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, BLIS_GEMMSUP_RRC_UKR, BLIS_DOUBLE, bli_dgemmsup_rd_haswell_asm_6x8m, BLIS_GEMMSUP_RCR_UKR, BLIS_DOUBLE, bli_dgemmsup_rv_haswell_asm_6x8m, @@ -109,7 +109,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_GEMMSUP_CCC_UKR, BLIS_DCOMPLEX, bli_zgemmsup_rv_zen_asm_3x4n, #endif - // packm + // packm BLIS_PACKM_MRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_6xk, BLIS_PACKM_NRXK_KER, BLIS_FLOAT, bli_spackm_haswell_asm_16xk, BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_haswell_asm_6xk, @@ -159,7 +159,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - -1 + 
BLIS_VA_END ); // Update the context with storage preferences. @@ -181,7 +181,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, - // level-3 sup + // level-3 sup BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, @@ -200,7 +200,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_GEMMSUP_CCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. @@ -259,19 +259,34 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - // sup thresholds + // sup thresholds BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // level-3 sup + // level-3 sup BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP, BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP, BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP, BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - -1 + BLIS_VA_END ); + + // ------------------------------------------------------------------------- + +#if 0 + // Initialize the context with the sup handlers. 
+ bli_cntx_set_l3_sup_handlers + ( + cntx, + + BLIS_GEMM, bli_gemmsup_ref, + //BLIS_GEMMT, bli_gemmtsup_ref, + + BLIS_VA_END + ); +#endif } diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index 6771c4e069..db38037ddb 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -62,7 +62,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR, BLIS_FLOAT, bli_sgemmtrsm_u_haswell_asm_6x16, BLIS_GEMMTRSM_U_UKR, BLIS_DOUBLE, bli_dgemmtrsm_u_haswell_asm_6x8, - // gemmsup + // gemmsup #if 0 // AMD: This should be enabled in the PR which has added these kernels // Update the context with optimized small/unpacked gemm kernels. @@ -114,7 +114,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_GEMMSUP_CCC_UKR, BLIS_FLOAT, bli_sgemmsup_rv_haswell_asm_6x16n, #endif - // packm + // packm #if 0 // AMD: This will be enabled in other PRs. BLIS_PACKM_MRXK_KER, BLIS_DOUBLE, bli_dpackm_6xk_gen_zen, @@ -170,7 +170,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, - -1 + BLIS_VA_END ); // Update the context with storage preferences. @@ -192,7 +192,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMMTRSM_U_UKR_ROW_PREF, BLIS_DOUBLE, TRUE, - // gemmsup + // gemmsup BLIS_GEMMSUP_RRR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMMSUP_RRC_UKR_ROW_PREF, BLIS_FLOAT, TRUE, BLIS_GEMMSUP_RCR_UKR_ROW_PREF, BLIS_FLOAT, TRUE, @@ -226,7 +226,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_GEMMSUP_CCC_UKR_ROW_PREF, BLIS_DCOMPLEX, TRUE, #endif - -1 + BLIS_VA_END ); // Initialize level-3 blocksize objects with architecture-specific values. 
@@ -276,19 +276,34 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - // sup thresholds + // sup thresholds BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - // gemmsup + // gemmsup BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, - -1 + BLIS_VA_END ); + + // ------------------------------------------------------------------------- + +#if 0 + // Initialize the context with the sup handlers. + bli_cntx_set_l3_sup_handlers + ( + cntx, + + BLIS_GEMM, bli_gemmsup_ref, + //BLIS_GEMMT, bli_gemmtsup_ref, + + BLIS_VA_END + ); +#endif } diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index b1b7dea87d..1d12b42ebd 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -65,11 +65,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjx, \ - n, \ - x, incx, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -102,10 +102,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - n, \ - x, incx, \ - index, \ - cntx \ + n, \ + x, incx, \ + index, \ + cntx \ ); \ } @@ -139,13 +139,13 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjx, \ - n, \ - alpha, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + alpha, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } @@ -179,12 +179,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjx, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -219,13 +219,13 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - x, incx, \ - y, incy, \ - rho, \ - cntx \ + conjx, \ + conjy, \ + n, \ + x, incx, \ + y, incy, \ + rho, \ + cntx \ ); \ 
} @@ -261,15 +261,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - beta, \ - rho, \ - cntx \ + conjx, \ + conjy, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + beta, \ + rho, \ + cntx \ ); \ } @@ -299,9 +299,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - n, \ - x, incx, \ - cntx \ + n, \ + x, incx, \ + cntx \ ); \ } @@ -333,11 +333,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjalpha, \ - n, \ - alpha, \ - x, incx, \ - cntx \ + conjalpha, \ + n, \ + alpha, \ + x, incx, \ + cntx \ ); \ } @@ -369,10 +369,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - n, \ - x, incx, \ - y, incy, \ - cntx \ + n, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -404,12 +404,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjx, \ - n, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjx, \ + n, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index 921534f375..cfaf5150fe 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -85,16 +85,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. */ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ @@ -104,13 +104,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. 
*/ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( addd, addv, BLIS_ADDV_KER ) @@ -164,16 +165,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. */ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ @@ -183,14 +184,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjx, \ - n_elem, \ - alpha, \ - x1, incx, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + alpha, \ + x1, incx, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( axpyd, axpyv, BLIS_AXPYV_KER ) @@ -233,7 +235,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ @@ -242,11 +244,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - n_elem, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + n_elem, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER ) @@ -290,7 +293,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. 
*/ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ @@ -299,13 +302,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - conjalpha, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + conjalpha, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( scald, scalv, BLIS_SCALV_KER ) @@ -361,11 +365,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTEMAC(ch,setis)( *alpha, *chi11 ); \ } */ \ \ - /* Acquire the addres of the imaginary component of the first element, + /* Acquire the address of the imaginary component of the first element, and scale the increment for use in the real domain. Note that the indexing into the imaginary field only needs to work for complex datatypes since we return early for real domain types. */ \ - x1 = ( ctype_r* )( x + offx ) + 1; \ + x1 = ( ctype_r* )( x + offx ) + 1; \ incx = 2*incx; \ \ /* Obtain a valid context from the gks if necessary. */ \ @@ -375,13 +379,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTECH2(chr,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt_r, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. */ \ - f( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - alpha, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + alpha, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER ) @@ -424,7 +429,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ &offx, &n_elem, &incx \ ); \ \ - x1 = x + offx; \ + x1 = x + offx; \ \ /* Obtain a valid context from the gks if necessary. */ \ if ( cntx == NULL ) cntx = bli_gks_query_cntx(); \ @@ -433,13 +438,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. 
*/ \ - f( \ - BLIS_NO_CONJUGATE, \ - n_elem, \ - alpha, 0, \ - x1, incx, \ - cntx \ - ); \ + f \ + ( \ + BLIS_NO_CONJUGATE, \ + n_elem, \ + alpha, 0, \ + x1, incx, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER ) @@ -491,16 +497,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ if ( bli_is_nonunit_diag( diagx ) ) \ { \ - x1 = x + offx; \ - y1 = y + offy; \ + x1 = x + offx; \ + y1 = y + offy; \ } \ else /* if ( bli_is_unit_diag( diagx ) ) */ \ { \ - /* Simulate a unit diagonal for x with a zero increment over a unit - scalar. */ \ - x1 = PASTEMAC(ch,1); \ - incx = 0; \ - y1 = y + offy; \ + /* Simulate a unit diagonal for x with a zero increment over a unit + scalar. */ \ + x1 = PASTEMAC(ch,1); \ + incx = 0; \ + y1 = y + offy; \ } \ \ /* Obtain a valid context from the gks if necessary. */ \ @@ -510,14 +516,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTECH2(ch,kername,_ker_ft) f = bli_cntx_get_ukr_dt( dt, kerid, cntx ); \ \ /* Invoke the kernel with the appropriate parameters. 
*/ \ - f( \ - conjx, \ - n_elem, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ - ); \ + f \ + ( \ + conjx, \ + n_elem, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ } INSERT_GENTFUNC_BASIC2( xpbyd, xpbyv, BLIS_XPBYV_KER ) diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index c53d490179..a543792998 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -69,15 +69,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjx, \ - conjy, \ - n, \ - alphax, \ - alphay, \ - x, incx, \ - y, incy, \ - z, incz, \ - cntx \ + conjx, \ + conjy, \ + n, \ + alphax, \ + alphay, \ + x, incx, \ + y, incy, \ + z, incz, \ + cntx \ ); \ } @@ -113,15 +113,15 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conja, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - y, incy, \ - cntx \ + conja, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + x, incx, \ + y, incy, \ + cntx \ ); \ } @@ -158,16 +158,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjxt, \ - conjx, \ - conjy, \ - n, \ - alpha, \ - x, incx, \ - y, incy, \ - rho, \ - z, incz, \ - cntx \ + conjxt, \ + conjx, \ + conjy, \ + n, \ + alpha, \ + x, incx, \ + y, incy, \ + rho, \ + z, incz, \ + cntx \ ); \ } @@ -208,20 +208,20 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjat, \ - conja, \ - conjw, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - w, incw, \ - x, incx, \ - beta, \ - y, incy, \ - z, incz, \ - cntx \ + conjat, \ + conja, \ + conjw, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + w, incw, \ + x, incx, \ + beta, \ + y, incy, \ + z, incz, \ + cntx \ ); \ } @@ -258,16 +258,16 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ f \ ( \ - conjat, \ - conjx, \ - m, \ - b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + conjat, \ + conjx, \ + m, \ + b_n, \ + alpha, \ + a, inca, lda, \ + x, incx, \ + beta, \ + y, incy, \ + cntx \ ); \ } diff --git a/frame/1m/packm/bli_packm_cxk.c 
b/frame/1m/packm/bli_packm_cxk.c index 8396552c0f..53ae58e215 100644 --- a/frame/1m/packm/bli_packm_cxk.c +++ b/frame/1m/packm/bli_packm_cxk.c @@ -91,30 +91,30 @@ void PASTEMAC(ch,opname) \ that happens, the packm kernel must have set the 0's added in step (3) below. - packm kernel packm kernel packm kernel packm_tri_cxk + packm kernel packm kernel packm kernel packm_tri_cxk step 1: step 2: step 3: step 4: - x x x x . . x x x x . . x x x x 0 0 x x x x 0 0 - ? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0 - ? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0 - ? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0 - . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 + x x x x . . x x x x . . x x x x 0 0 x x x x 0 0 + ? x x x . . ? x x x . . ? x x x 0 0 ? x x x 0 0 + ? ? x x . . -> ? ? x x . . -> ? ? x x 0 0 -> ? ? x x 0 0 + ? ? ? x . . ? ? ? x . . ? ? ? x 0 0 ? ? ? x 0 0 + . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + . . . . . . 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 x Copied from A; valid element. - ? Copied from A, but value is unknown and unused. + ? Copied from A, but value is unknown and unused. . Uninitialized. - 0 Initialized to zero. - 1 Initialized to one. + 0 Initialized to zero. + 1 Initialized to one. NOTE: In step 5 (not shown), bli_packm_tri_cxk() sets the ?'s to zero. This is not needed to support trsm, but rather to support trmm. (Both use the same packing format and code.) 
- In this case, panel_dim will be 4 because four rows of data are - copied from A, panel_len will be 4 because those four rows span - four columns of A, and panel_len_max will be 6 because there are a - total of 6 columns that can be written to in the packed micropanel, + In this case, panel_dim will be 4 because four rows of data are + copied from A, panel_len will be 4 because those four rows span + four columns of A, and panel_len_max will be 6 because there are a + total of 6 columns that can be written to in the packed micropanel, 2 of which lie beyond the values copied from A. */ \ f \ ( \ diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index dfb1dcaa85..5ce04b5025 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -48,8 +48,8 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... ) // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default blocksizes. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // blocksizes across all datatypes. + // bli_cntx_init__ref() so that the context begins with + // default blocksizes across all datatypes. /* Example prototypes: @@ -60,7 +60,7 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... ) bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, ..., - -1 + BLIS_VA_END ); */ @@ -73,20 +73,17 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... ) blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx ); bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx ); - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - // Initialize variable argument environment. va_list args; va_start( args, cntx ); - // Process block sizes until we get a -1. + // Process blocksizes until we get a BLIS_VA_END. 
while ( true ) { - int bs_id0 = va_arg( args, int ); - if ( bs_id0 == -1 ) - break; + int bs_id0 = va_arg( args, int ); + + // If we find a bszid_t id of BLIS_VA_END, then we are done. + if ( bs_id0 == BLIS_VA_END ) break; // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process (already done), @@ -128,7 +125,7 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ) bszid_t bs1_id, dim_t def_scalr1, dim_t max_scalr1, bszid_t bs2_id, dim_t def_scalr2, dim_t max_scalr2, ..., - -1 + BLIS_VA_END ); NOTE: This function modifies an existing context that is presumed @@ -144,20 +141,17 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ) // Save the execution type into the context. bli_cntx_set_method( method, cntx ); - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - // Initialize variable argument environment. va_list args; va_start( args, cntx ); - // Process block sizes until we get a -1. + // Process blocksizes until we get a BLIS_VA_END. while ( true ) { - int bs_id0 = va_arg( args, int ); - if ( bs_id0 == -1 ) - break; + int bs_id0 = va_arg( args, int ); + + // If we find a bszid_t id of BLIS_VA_END, then we are done. + if ( bs_id0 == BLIS_VA_END ) break; // Here, we query the variable argument list for: // - the bszid_t of the blocksize we're about to process (already done), @@ -207,8 +201,8 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default microkernels. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // microkernels across all datatypes. + // bli_cntx_init__ref() so that the context begins with + // default microkernels across all datatypes. 
/* Example prototypes: @@ -219,7 +213,7 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) ukr_t ukr1_id, num_t dt1, void_fp ukr1_fp, ukr_t ukr2_id, num_t dt2, void_fp ukr2_fp, ..., - -1 + BLIS_VA_END ); */ @@ -230,20 +224,18 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) va_list args; va_start( args, cntx ); - // Process ukernels until -1 is reached. + // Process ukernels until BLIS_VA_END is reached. while ( true ) { - const int ukr_id0 = va_arg( args, int ); + const int ukr_id0 = va_arg( args, int ); - // If we find a ukr ID of -1, then we are done. - if ( ukr_id0 == -1 ) - break; + // If we find a ukernel id of BLIS_VA_END, then we are done. + if ( ukr_id0 == BLIS_VA_END ) break; // Here, we query the variable argument list for: // - the ukr_t of the kernel we're about to process (already done), // - the datatype of the kernel, and // - the kernel function pointer - // that we need to store to the context. const ukr_t ukr_id = ( ukr_t )ukr_id0; const num_t ukr_dt = ( num_t )va_arg( args, num_t ); void_fp ukr_fp = ( void_fp )va_arg( args, void_fp ); @@ -253,7 +245,7 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) func_t* ukrs = &cntx_ukrs[ ukr_id ]; // Store the ukernel function pointer into the context. - // Notice that we redundantly store the native + // Notice that we redundantly store the native // ukernel address in both the native and virtual ukernel slots // in the context. This is standard practice when creating a // native context. (Induced method contexts will overwrite the @@ -261,18 +253,20 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) // virtual ukernel.) 
bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); - switch ( ukr_id ) - { - case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; - case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; - case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; - case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; - case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; - default: ukrs = NULL; - }; - - if ( ukrs ) - bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); + // Locate the virtual ukernel func_t pointer that corresponds to the + // ukernel id provided by the caller. + switch ( ukr_id ) + { + case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; + case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; + case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; + case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; + case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; + default: ukrs = NULL; + }; + + if ( ukrs ) + bli_func_set_dt( ukr_fp, ukr_dt, ukrs ); } // Shutdown variable argument environment and clean up stack. @@ -286,8 +280,8 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default microkernel preferences. It should be called after - // bli_cntx_init_defaults() so that the context begins with default - // preferences across all datatypes. + // bli_cntx_init__ref() so that the context begins with + // default preferences across all datatypes. /* Example prototypes: @@ -298,7 +292,7 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) ukr_pref_t ukr_pref1_id, num_t dt1, bool ukr_pref1, ukr_pref_t ukr_pref2_id, num_t dt2, bool ukr_pref2, ..., - -1 + BLIS_VA_END ); */ @@ -309,20 +303,18 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... 
) va_list args; va_start( args, cntx ); - // Process ukernel preferences until -1 is reached. + // Process ukernel preferences until BLIS_VA_END is reached. while ( true ) { - const int ukr_pref_id0 = va_arg( args, int ); + const int ukr_pref_id0 = va_arg( args, int ); - // If we find a ukr pref ID of -1, then we are done. - if ( ukr_pref_id0 == -1 ) - break; + // If we find a ukernel pref id of BLIS_VA_END, then we are done. + if ( ukr_pref_id0 == BLIS_VA_END ) break; // Here, we query the variable argument list for: // - the ukr_t of the kernel we're about to process (already done), // - the datatype of the kernel, and // - the kernel function pointer - // that we need to store to the context. const ukr_pref_t ukr_pref_id = ( ukr_pref_t )ukr_pref_id0; const bool ukr_pref_dt = ( num_t )va_arg( args, num_t ); const bool ukr_pref = ( bool )va_arg( args, int ); @@ -341,101 +333,55 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) // ----------------------------------------------------------------------------- -void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ) +void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ) { // This function can be called from the bli_cntx_init_*() function for // a particular architecture if the kernel developer wishes to use // non-default level-3 operation handler for small/unpacked matrices. It - // should be called after bli_cntx_init_defaults() so that the context - // begins with default sup handlers across all datatypes. + // should be called after bli_cntx_init__ref() so that the + // context begins with default sup handlers across all datatypes. /* Example prototypes: void bli_cntx_set_l3_sup_handlers ( - dim_t n_ops, - opid_t op0_id, void* handler0_fp, - opid_t op1_id, void* handler1_fp, - opid_t op2_id, void* handler2_fp, - ... 
cntx_t* cntx + opid_t op0_id, void_fp handler0_fp, + opid_t op1_id, void_fp handler1_fp, + opid_t op2_id, void_fp handler2_fp, + ..., + BLIS_VA_END ); */ - va_list args; - dim_t i; - err_t r_val; - - // Allocate some temporary local arrays. - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - opid_t* op_ids = bli_malloc_intl( n_ops * sizeof( opid_t ), &r_val ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - void** op_fps = bli_malloc_intl( n_ops * sizeof( void* ), &r_val ); - - // -- Begin variable argument section -- + // Query the context for the address of the l3 sup handlers array. + void_fp* cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); // Initialize variable argument environment. - va_start( args, n_ops ); + va_list args; + va_start( args, cntx ); - // Process n_ukrs tuples. - for ( i = 0; i < n_ops; ++i ) + // Process sup handlers until BLIS_VA_END is reached. + while ( true ) { + const int op_id0 = va_arg( args, int ); + + // If we find an operation id of BLIS_VA_END, then we are done. + if ( op_id0 == BLIS_VA_END ) break; + // Here, we query the variable argument list for: // - the opid_t of the operation we're about to process, // - the sup handler function pointer - // that we need to store to the context. - const opid_t op_id = ( opid_t )va_arg( args, opid_t ); - void* op_fp = ( void* )va_arg( args, void* ); - - // Store the values in our temporary arrays. - op_ids[ i ] = op_id; - op_fps[ i ] = op_fp; - } - - // The last argument should be the context pointer. - cntx_t* cntx = ( cntx_t* )va_arg( args, cntx_t* ); - - // Shutdown variable argument environment and clean up stack. 
- va_end( args ); - - // -- End variable argument section -- - - // Query the context for the addresses of: - // - the l3 small/unpacked handlers array - void** cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); - - // Now that we have the context address, we want to copy the values - // from the temporary buffers into the corresponding buffers in the - // context. - - // Process each operation id tuple provided. - for ( i = 0; i < n_ops; ++i ) - { - // Read the current operation id and handler function pointer. - const opid_t op_id = op_ids[ i ]; - void* op_fp = op_fps[ i ]; + const opid_t op_id = ( opid_t )op_id0; + void_fp op_fp = ( void_fp )va_arg( args, void_fp ); // Store the sup handler function pointer into the slot for the // specified operation id. cntx_l3_sup_handlers[ op_id ] = op_fp; } - // Free the temporary local arrays. - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - bli_free_intl( op_ids ); - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntx_set_l3_sup_handlers(): " ); - #endif - bli_free_intl( op_fps ); + // Shutdown variable argument environment and clean up stack. 
+ va_end( args ); } // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index fb75cf9f89..412430e9b2 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -49,7 +49,7 @@ typedef struct cntx_s func_t ukrs[ BLIS_NUM_UKRS ]; mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; - void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; + void_fp l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; ind_t method; @@ -78,7 +78,7 @@ BLIS_INLINE mbool_t* bli_cntx_ukr_prefs_buf( cntx_t* cntx ) { return cntx->ukr_prefs; } -BLIS_INLINE void** bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) +BLIS_INLINE void_fp* bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) { return cntx->l3_sup_handlers; } @@ -174,17 +174,17 @@ BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - switch ( ukr_id ) - { - case BLIS_GEMM_UKR: ukr_id = BLIS_GEMM_VIR_UKR; break; - case BLIS_TRSM_L_UKR: ukr_id = BLIS_TRSM_L_VIR_UKR; break; - case BLIS_TRSM_U_UKR: ukr_id = BLIS_TRSM_U_VIR_UKR; break; - case BLIS_GEMMTRSM_L_UKR: ukr_id = BLIS_GEMMTRSM_L_VIR_UKR; break; - case BLIS_GEMMTRSM_U_UKR: ukr_id = BLIS_GEMMTRSM_U_VIR_UKR; break; - default: break; - }; + switch ( ukr_id ) + { + case BLIS_GEMM_UKR: ukr_id = BLIS_GEMM_VIR_UKR; break; + case BLIS_TRSM_L_UKR: ukr_id = BLIS_TRSM_L_VIR_UKR; break; + case BLIS_TRSM_U_UKR: ukr_id = BLIS_TRSM_U_VIR_UKR; break; + case BLIS_GEMMTRSM_L_UKR: ukr_id = BLIS_GEMMTRSM_L_VIR_UKR; break; + case BLIS_GEMMTRSM_U_UKR: ukr_id = BLIS_GEMMTRSM_U_VIR_UKR; break; + default: break; + }; - return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); + return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } // ----------------------------------------------------------------------------- @@ -217,10 +217,10 @@ BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_ // 
----------------------------------------------------------------------------- -BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) { - void** funcs = bli_cntx_l3_sup_handlers_buf( cntx ); - void* func = funcs[ op ]; + void_fp* funcs = bli_cntx_l3_sup_handlers_buf( cntx ); + void_fp func = funcs[ op ]; return func; } @@ -229,45 +229,48 @@ BLIS_INLINE void* bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) { - // Get the correct preference from the kernel ID. - ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; - switch ( ukr_id ) - { - case BLIS_GEMM_VIR_UKR: // fallthrough - case BLIS_GEMM_UKR: ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; break; - case BLIS_TRSM_L_VIR_UKR: // fallthrough - case BLIS_TRSM_L_UKR: ukr_pref_id = BLIS_TRSM_L_UKR_ROW_PREF; break; - case BLIS_TRSM_U_VIR_UKR: // fallthrough - case BLIS_TRSM_U_UKR: ukr_pref_id = BLIS_TRSM_U_UKR_ROW_PREF; break; - case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough - case BLIS_GEMMTRSM_L_UKR: ukr_pref_id = BLIS_GEMMTRSM_L_UKR_ROW_PREF; break; - case BLIS_GEMMTRSM_U_VIR_UKR: // fallthrough - case BLIS_GEMMTRSM_U_UKR: ukr_pref_id = BLIS_GEMMTRSM_U_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_RRR_UKR: ukr_pref_id = BLIS_GEMMSUP_RRR_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_RRC_UKR: ukr_pref_id = BLIS_GEMMSUP_RRC_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_RCR_UKR: ukr_pref_id = BLIS_GEMMSUP_RCR_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_RCC_UKR: ukr_pref_id = BLIS_GEMMSUP_RCC_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_CRR_UKR: ukr_pref_id = BLIS_GEMMSUP_CRR_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_CRC_UKR: ukr_pref_id = BLIS_GEMMSUP_CRC_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_CCR_UKR: ukr_pref_id = BLIS_GEMMSUP_CCR_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_CCC_UKR: ukr_pref_id = BLIS_GEMMSUP_CCC_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_XXX_UKR: 
ukr_pref_id = BLIS_GEMMSUP_XXX_UKR_ROW_PREF; break; - default: break; // TODO: should be an error condition - } - - // For virtual ukrs and non-native execution, use the real projection of the datatype. - if ( bli_cntx_method( cntx ) != BLIS_NAT ) - { - switch ( ukr_id ) - { - case BLIS_GEMM_VIR_UKR: // fallthrough - case BLIS_TRSM_L_VIR_UKR: // fallthrough - case BLIS_TRSM_U_VIR_UKR: // fallthrough - case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough - case BLIS_GEMMTRSM_U_VIR_UKR: dt = bli_dt_proj_to_real( dt ); break; - default: break; - } - } + // This initial value will get overwritten during the switch statement below. + ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; + + // Get the correct preference from the kernel ID. + switch ( ukr_id ) + { + case BLIS_GEMM_VIR_UKR: // fallthrough + case BLIS_GEMM_UKR: ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; break; + case BLIS_TRSM_L_VIR_UKR: // fallthrough + case BLIS_TRSM_L_UKR: ukr_pref_id = BLIS_TRSM_L_UKR_ROW_PREF; break; + case BLIS_TRSM_U_VIR_UKR: // fallthrough + case BLIS_TRSM_U_UKR: ukr_pref_id = BLIS_TRSM_U_UKR_ROW_PREF; break; + case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_L_UKR: ukr_pref_id = BLIS_GEMMTRSM_L_UKR_ROW_PREF; break; + case BLIS_GEMMTRSM_U_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_U_UKR: ukr_pref_id = BLIS_GEMMTRSM_U_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RRR_UKR: ukr_pref_id = BLIS_GEMMSUP_RRR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RRC_UKR: ukr_pref_id = BLIS_GEMMSUP_RRC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RCR_UKR: ukr_pref_id = BLIS_GEMMSUP_RCR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_RCC_UKR: ukr_pref_id = BLIS_GEMMSUP_RCC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CRR_UKR: ukr_pref_id = BLIS_GEMMSUP_CRR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CRC_UKR: ukr_pref_id = BLIS_GEMMSUP_CRC_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CCR_UKR: ukr_pref_id = BLIS_GEMMSUP_CCR_UKR_ROW_PREF; break; + case BLIS_GEMMSUP_CCC_UKR: ukr_pref_id = BLIS_GEMMSUP_CCC_UKR_ROW_PREF; break; + case 
BLIS_GEMMSUP_XXX_UKR: ukr_pref_id = BLIS_GEMMSUP_XXX_UKR_ROW_PREF; break; + default: break; // TODO: should be an error condition + } + + // For virtual ukernels during non-native execution, use the real projection of + // the datatype. + if ( bli_cntx_method( cntx ) != BLIS_NAT ) + { + switch ( ukr_id ) + { + case BLIS_GEMM_VIR_UKR: // fallthrough + case BLIS_TRSM_L_VIR_UKR: // fallthrough + case BLIS_TRSM_U_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough + case BLIS_GEMMTRSM_U_VIR_UKR: dt = bli_dt_proj_to_real( dt ); break; + default: break; + } + } return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx ); } @@ -280,7 +283,7 @@ BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, cntx_t* c BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) { const bool ukr_prefers_rows - = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); + = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) return TRUE; else if ( bli_obj_is_col_stored( obj ) && !ukr_prefers_rows ) return TRUE; @@ -350,41 +353,41 @@ BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_ BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) { - ukr_t ukr_id = bli_stor3_ukr( stor_id ); + ukr_t ukr_id = bli_stor3_ukr( stor_id ); - return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); + return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { - switch ( bs_id ) - { - case BLIS_MR: bs_id = BLIS_MR_SUP; break; - case BLIS_NR: bs_id = BLIS_NR_SUP; break; - case BLIS_KR: bs_id = BLIS_KR_SUP; break; - case BLIS_MC: bs_id = BLIS_MC_SUP; break; - case BLIS_NC: bs_id = BLIS_NC_SUP; break; - case BLIS_KC: bs_id = BLIS_KC_SUP; break; - default: break; - }; + switch ( bs_id ) + { + case BLIS_MR: bs_id = BLIS_MR_SUP; break; + 
case BLIS_NR: bs_id = BLIS_NR_SUP; break; + case BLIS_KR: bs_id = BLIS_KR_SUP; break; + case BLIS_MC: bs_id = BLIS_MC_SUP; break; + case BLIS_NC: bs_id = BLIS_NC_SUP; break; + case BLIS_KC: bs_id = BLIS_KC_SUP; break; + default: break; + }; - return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ); + return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ); } BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) { - switch ( bs_id ) - { - case BLIS_MR: bs_id = BLIS_MR_SUP; break; - case BLIS_NR: bs_id = BLIS_NR_SUP; break; - case BLIS_KR: bs_id = BLIS_KR_SUP; break; - case BLIS_MC: bs_id = BLIS_MC_SUP; break; - case BLIS_NC: bs_id = BLIS_NC_SUP; break; - case BLIS_KC: bs_id = BLIS_KC_SUP; break; - default: break; - }; + switch ( bs_id ) + { + case BLIS_MR: bs_id = BLIS_MR_SUP; break; + case BLIS_NR: bs_id = BLIS_NR_SUP; break; + case BLIS_KR: bs_id = BLIS_KR_SUP; break; + case BLIS_MC: bs_id = BLIS_MC_SUP; break; + case BLIS_NC: bs_id = BLIS_NC_SUP; break; + case BLIS_KC: bs_id = BLIS_KC_SUP; break; + default: break; + }; - return bli_cntx_get_blksz_max_dt( dt, bs_id, cntx ); + return bli_cntx_get_blksz_max_dt( dt, bs_id, cntx ); } // ----------------------------------------------------------------------------- @@ -402,7 +405,7 @@ BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... ); BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); -BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( dim_t n_ops, ... ); +BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... 
); #endif diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 1c3f49bc4e..1372a055ab 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -635,7 +635,7 @@ void bli_gks_init_ref_cntx bool bli_gks_cntx_l3_nat_ukr_is_ref ( num_t dt, - ukr_t ukr_id, + ukr_t ukr_id, cntx_t* cntx ) { @@ -745,9 +745,9 @@ kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ) cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) ) - return BLIS_REFERENCE_UKERNEL; + return BLIS_REFERENCE_UKERNEL; else - return BLIS_OPTIMIZED_UKERNEL; + return BLIS_OPTIMIZED_UKERNEL; } } diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index 93c9c1412a..b8e4c4fe08 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -56,7 +56,7 @@ BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, cntx_t* cntx ); -BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ); diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 20bd73afc9..7f57282667 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -245,8 +245,12 @@ #define BLIS_POOL_ADDR_OFFSET_SIZE_GEN 0 #endif -// -- MR and NR block sizes (only for kernels) -------------------------------- +// -- MR and NR blocksizes (only for reference kernels) ------------------------ +// The build system defines BLIS_IN_KERNEL, but only when compiling reference +// kernels. By using compile-time constants for MR and NR, the compiler can +// perform certain optimizations, such as unrolling and vectorization, that +// would not be otherwise be possible. 
#ifdef BLIS_IN_KERNEL #ifndef BLIS_MR_s diff --git a/frame/include/bli_misc_macro_defs.h b/frame/include/bli_misc_macro_defs.h index 120338beba..903b4ece6c 100644 --- a/frame/include/bli_misc_macro_defs.h +++ b/frame/include/bli_misc_macro_defs.h @@ -164,5 +164,11 @@ BLIS_INLINE void bli_toggle_bool( bool* b ) #define bli_iformatspec() "%6d" +// Sentinel constant used to indicate the end of a variable argument function +// (See bli_cntx.c) + +#define BLIS_VA_END (-1) + + #endif diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 5aaf2771d1..688a4fcd03 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -860,18 +860,18 @@ BLIS_INLINE stor3_t bli_stor3_trans( stor3_t id ) BLIS_INLINE ukr_t bli_stor3_ukr( stor3_t id ) { - switch ( id ) - { - case BLIS_RRR: return BLIS_GEMMSUP_RRR_UKR; - case BLIS_RRC: return BLIS_GEMMSUP_RRC_UKR; - case BLIS_RCR: return BLIS_GEMMSUP_RCR_UKR; - case BLIS_RCC: return BLIS_GEMMSUP_RCC_UKR; - case BLIS_CRR: return BLIS_GEMMSUP_CRR_UKR; - case BLIS_CRC: return BLIS_GEMMSUP_CRC_UKR; - case BLIS_CCR: return BLIS_GEMMSUP_CCR_UKR; - case BLIS_CCC: return BLIS_GEMMSUP_CCC_UKR; - default: return BLIS_GEMMSUP_XXX_UKR; - } + switch ( id ) + { + case BLIS_RRR: return BLIS_GEMMSUP_RRR_UKR; + case BLIS_RRC: return BLIS_GEMMSUP_RRC_UKR; + case BLIS_RCR: return BLIS_GEMMSUP_RCR_UKR; + case BLIS_RCC: return BLIS_GEMMSUP_RCC_UKR; + case BLIS_CRR: return BLIS_GEMMSUP_CRR_UKR; + case BLIS_CRC: return BLIS_GEMMSUP_CRC_UKR; + case BLIS_CCR: return BLIS_GEMMSUP_CCR_UKR; + case BLIS_CCC: return BLIS_GEMMSUP_CCC_UKR; + default: return BLIS_GEMMSUP_XXX_UKR; + } } BLIS_INLINE stor3_t bli_stor3_transa( stor3_t id ) diff --git a/frame/include/bli_scalar_macro_defs.h b/frame/include/bli_scalar_macro_defs.h index 293c80f910..f567e7ef32 100644 --- a/frame/include/bli_scalar_macro_defs.h +++ b/frame/include/bli_scalar_macro_defs.h @@ -49,8 +49,8 @@ // NOTE: These macros are not used 
by other scalar macros, but they are // related to those defined in bli_sets.h, and so we #include them here. -#include "bli_setrs.h" // sets real component only -#include "bli_setis.h" // sets imaginary component only +#include "bli_setrs.h" // sets real component only +#include "bli_setis.h" // sets imaginary component only // NOTE: This macro also needs to be defined early on since it determines // how real and imaginary components are accessed (ie: whether the fields @@ -194,6 +194,7 @@ #include "bli_adds_mxn.h" #include "bli_adds_mxn_uplo.h" #include "bli_set0s_mxn.h" +#include "bli_set0s_edge.h" #include "bli_copys_mxn.h" #include "bli_scal2s_mxn.h" #include "bli_xpbys_mxn.h" @@ -230,7 +231,7 @@ #include "bli_scal21rs.h" #include "bli_scal2j1rs.h" -// 1m (1e or 1r) +// 1m (1e or 1r) #include "bli_invert1ms_mxn_diag.h" #include "bli_scal1ms_mxn.h" diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 95707fc26a..7ecb0a233b 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -626,7 +626,7 @@ typedef enum typedef enum { - // l1v kernels + // l1v kernels BLIS_ADDV_KER, BLIS_AMAXV_KER, BLIS_AXPBYV_KER, @@ -644,36 +644,36 @@ typedef enum BLIS_AXPY2V_KER, BLIS_DOTAXPYV_KER, - // l1f kernels + // l1f kernels BLIS_AXPYF_KER, BLIS_DOTXF_KER, BLIS_DOTXAXPYF_KER, - // pack kernels + // pack kernels BLIS_PACKM_MRXK_KER, BLIS_PACKM_NRXK_KER, BLIS_PACKM_MRXK_1ER_KER, BLIS_PACKM_NRXK_1ER_KER, - // unpack kernels + // unpack kernels BLIS_UNPACKM_MRXK_KER, BLIS_UNPACKM_NRXK_KER, - // l3 nat kernels + // l3 native kernels BLIS_GEMM_UKR, BLIS_GEMMTRSM_L_UKR, BLIS_GEMMTRSM_U_UKR, BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR, - // l3 virt kernels + // l3 virtual kernels BLIS_GEMM_VIR_UKR, BLIS_GEMMTRSM_L_VIR_UKR, BLIS_GEMMTRSM_U_VIR_UKR, BLIS_TRSM_L_VIR_UKR, BLIS_TRSM_U_VIR_UKR, - // gemmsup kernels + // gemmsup kernels BLIS_GEMMSUP_RRR_UKR, BLIS_GEMMSUP_RRC_UKR, BLIS_GEMMSUP_RCR_UKR, @@ -684,8 +684,8 @@ typedef enum 
BLIS_GEMMSUP_CCC_UKR, BLIS_GEMMSUP_XXX_UKR, - // BLIS_NUM_UKRS must be last! - BLIS_NUM_UKRS + // BLIS_NUM_UKRS must be last! + BLIS_NUM_UKRS } ukr_t; @@ -862,25 +862,25 @@ typedef enum BLIS_KC, BLIS_NC, - // broadcast factors for packing - BLIS_BBM, - BLIS_BBN, + // broadcast factors for packing + BLIS_BBM, + BLIS_BBN, - // level-2 blocksizes + // level-2 blocksizes BLIS_M2, // level-2 blocksize in m dimension BLIS_N2, // level-2 blocksize in n dimension - // level-1f blocksizes + // level-1f blocksizes BLIS_AF, // level-1f axpyf fusing factor BLIS_DF, // level-1f dotxf fusing factor BLIS_XF, // level-1f dotxaxpyf fusing factor - // gemmsup thresholds + // gemmsup thresholds BLIS_MT, // level-3 small/unpacked matrix threshold in m dimension BLIS_NT, // level-3 small/unpacked matrix threshold in n dimension BLIS_KT, // level-3 small/unpacked matrix threshold in k dimension - // gemmsup block sizes + // gemmsup block sizes BLIS_KR_SUP, BLIS_MR_SUP, BLIS_NR_SUP, @@ -888,9 +888,10 @@ typedef enum BLIS_KC_SUP, BLIS_NC_SUP, - // BLIS_NO_PART (= BLIS_NUM_BLKSZS) must be last! - BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable. - BLIS_NUM_BLKSZS = BLIS_NO_PART + // BLIS_NO_PART (= BLIS_NUM_BLKSZS) must be last! + BLIS_NO_PART, // used as a placeholder when blocksizes are not applicable, + // such as when characterizing a packm operation. + BLIS_NUM_BLKSZS = BLIS_NO_PART } bszid_t; @@ -1409,7 +1410,7 @@ typedef struct cntx_s func_t ukrs[ BLIS_NUM_UKRS ]; mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; - void* l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; + void_fp l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; ind_t method; diff --git a/frame/include/level0/bli_set0s_edge.h b/frame/include/level0/bli_set0s_edge.h new file mode 100644 index 0000000000..5ce23c36dd --- /dev/null +++ b/frame/include/level0/bli_set0s_edge.h @@ -0,0 +1,73 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#ifndef BLIS_SET0S_EDGE_H +#define BLIS_SET0S_EDGE_H + +// set0s_mxn + +// Notes: +// - The first char encodes the type of x. +// - The second char encodes the type of y. 
+ +#define GENTFUNC(ctype,ch,op) \ +\ +BLIS_INLINE void PASTEMAC(ch,op)( const dim_t i, const dim_t m, \ + const dim_t j, const dim_t n, \ + ctype* restrict p, const inc_t ldp ) \ +{ \ + if ( i < m ) \ + { \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m - i, \ + j, \ + p + i*1, 1, ldp \ + ); \ + } \ +\ + if ( j < n ) \ + { \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m, \ + n - j, \ + p + j*ldp, 1, ldp \ + ); \ + } \ +} + +INSERT_GENTFUNC_BASIC0(set0s_edge) + +#endif diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index bb23e7e47b..94263ade10 100644 --- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -34,6 +34,46 @@ #include "blis.h" +#define PACKM_1E_BODY( ctype, ch, pragma, cdim, inca2, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + pragma \ + for ( dim_t mn = 0; mn < cdim; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ + PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ + } \ +\ + alpha1 += lda2; \ + pi1_ri += ldp2; \ + pi1_ir += ldp2; \ + } \ +} while(0) + +#define PACKM_1R_BODY( ctype, ch, pragma, cdim, inca2, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + pragma \ + for ( dim_t mn = 0; mn < cdim; ++mn ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ + *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ +\ + alpha1 += lda2; \ + pi1_r += ldp2; \ + pi1_i += ldp2; \ + } \ +} while(0) + #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \ \ @@ -50,234 +90,92 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - const dim_t mnr = 
PASTECH2(mnr0, _, ch); \ - const dim_t dfac = PASTECH2(bb0, _, ch); \ + const dim_t dfac = PASTECH2(bb0, _, chr); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ - if ( cdim == mnr && mnr != -1 ) \ + if ( bli_is_1e_packed( schema ) ) \ { \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const inc_t inca2 = 2 * inca; \ - const inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ - ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ - ctype_r* restrict alpha1 = ( ctype_r* )a; \ - ctype_r* restrict pi1_ri = ( ctype_r* )p; \ - ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ -\ + /* cdim and mnr are in units of complex values */ \ + const dim_t mnr = PASTECH2(mnr0, _, chr) == -1 ? -1 : PASTECH2(mnr0, _, chr) / 2; \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ +\ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* restrict pi1_ri = ( ctype_r* )p; \ + ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ +\ + if ( cdim == mnr && mnr != -1 ) \ + { \ if ( inca == 1 ) \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; ++mn ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - { \ - PASTEMAC(ch,scal2jris)( kappa_r, kappa_i, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ - *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ - PASTEMAC(ch,scal2jris)( -kappa_i, kappa_r, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ - *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ - } \ -\ - alpha1 += lda2; \ - pi1_ri += ldp2; \ - pi1_ir += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; ++mn ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ 
- { \ - PASTEMAC(ch,scal2ris)( kappa_r, kappa_i, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ - *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ - PASTEMAC(ch,scal2ris)( -kappa_i, kappa_r, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ - *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ - } \ -\ - alpha1 += lda2; \ - pi1_ri += ldp2; \ - pi1_ir += ldp2; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ + else PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \ } \ else \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; ++mn ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - { \ - PASTEMAC(ch,scal2jris)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ - *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ - PASTEMAC(ch,scal2jris)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ - *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ - } \ -\ - alpha1 += lda2; \ - pi1_ri += ldp2; \ - pi1_ir += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; ++mn ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - { \ - PASTEMAC(ch,scal2ris)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ - *(pi1_ri + (mn*2 + 0)*dfac + d), *(pi1_ri + (mn*2 + 1)*dfac + d) ); \ - PASTEMAC(ch,scal2ris)( -kappa_i, kappa_r, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ - *(pi1_ir + (mn*2 + 0)*dfac + d), *(pi1_ir + (mn*2 + 1)*dfac + d) ); \ - } \ -\ - alpha1 += lda2; \ - pi1_ri += ldp2; \ - pi1_ir += ldp2; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \ + else PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \ } \ } \ else \ { \ - const inc_t inca2 = 2 * inca; \ - const 
inc_t lda2 = 2 * lda; \ - const inc_t ldp2 = 2 * ldp; \ -\ - ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ - ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ - ctype_r* restrict alpha1 = ( ctype_r* )a; \ - ctype_r* restrict pi1_r = ( ctype_r* )p; \ - ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ -\ - if ( inca == 1 ) \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; ++mn ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,scal2jris)( kappa_r, kappa_i, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ - *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ -\ - alpha1 += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; ++mn ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,scal2ris)( kappa_r, kappa_i, *(alpha1 + mn*2 + 0), *(alpha1 + mn*2 + 1), \ - *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ -\ - alpha1 += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; ++mn ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,scal2jris)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ - *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ -\ - alpha1 += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; ++mn ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,scal2ris)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0), *(alpha1 + mn*inca2 + 1), \ - *(pi1_r + mn*dfac + d), *(pi1_i + mn*dfac + d) ); \ -\ - alpha1 += lda2; \ - pi1_r += ldp2; \ - pi1_i += ldp2; \ - } \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2jris ); \ + else 
PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2ris ); \ } \ - } \ - else /* if ( cdim < mnr ) */ \ - { \ - PASTEMAC(ch,scal21ms_mxn) \ - ( \ - schema, \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, 1, ldp, ldp \ - ); \ \ - const dim_t i = cdim; \ - const dim_t erfac = bli_is_1e_packed( schema ) ? 2 : 1; \ - /* use ldp instead of mnr, in case the latter is -1 \ - this may write extra zeros, but not too many \ - this also automatically accounts for dfac when \ - using set0s_mxn instead of set0bbs_mxn */ \ - const dim_t m_edge = ldp - cdim*dfac*erfac; \ - const dim_t n_edge = 2*n_max; \ - ctype_r* restrict p_cast = ( ctype_r* )p; \ - ctype_r* restrict p_edge = p_cast + (i )*dfac*erfac; \ + PASTEMAC(chr,set0s_edge) \ + ( \ + 2*cdim, 2*cdim_max, \ + 2*n, 2*n_max, \ + ( ctype_r* )p, ldp \ + ); \ + } \ + else /* ( bli_is_1r_packed( schema ) ) */ \ + { \ + const dim_t mnr = PASTECH2(mnr0, _, chr); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ +\ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + ctype_r* restrict alpha1 = ( ctype_r* )a; \ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + if ( cdim == mnr && mnr != -1 ) \ + { \ + if ( inca == 1 ) \ + { \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \ + } \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2ris ); \ + } \ \ - PASTEMAC(chr,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ 
- p_edge, 1, ldp \ - ); \ + PASTEMAC(chr,set0s_edge) \ + ( \ + cdim, cdim_max, \ + 2*n, 2*n_max, \ + ( ctype_r* )p, ldp \ + ); \ } \ -\ - const dim_t i = n; \ - /* use ldp instead of mnr, in case the latter is -1 \ - this may write extra zeros, but not too many \ - this also automatically accounts for dfac when \ - using set0s_mxn instead of set0bbs_mxn */ \ - const dim_t m_edge = ldp; \ - const dim_t n_edge = 2*(n_max-i); \ - ctype_r* restrict p_cast = ( ctype_r* )p; \ - ctype_r* restrict p_edge = p_cast + (i )*ldp*2; \ -\ - PASTEMAC(chr,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ } INSERT_GENTFUNCCO_BASIC4( packm_mrxk_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c index acbd00e893..56169a364f 100644 --- a/ref_kernels/1m/bli_packm_cxk_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_ref.c @@ -34,6 +34,22 @@ #include "blis.h" +#define PACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + pragma \ + for ( dim_t mn = 0; mn < cdim; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ +} while(0) + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ \ @@ -51,6 +67,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const dim_t mnr = PASTECH2(mnr0, _, ch); \ + const num_t dt = PASTEMAC(ch,type); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \ const dim_t dfac = PASTECH2(bb0, _, ch); \ \ ctype kappa_cast = *( ctype* )kappa; \ @@ -61,112 +79,27 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ if ( inca == 1 ) \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; mn++ ) \ - for ( dim_t d = 0; d < dfac; d++ ) \ - PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mn*1), 
*(pi1 + mn*dfac + d) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; mn++ ) \ - for ( dim_t d = 0; d < dfac; d++ ) \ - PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mn*1), *(pi1 + mn*dfac + d) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \ + else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \ } \ else \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; mn++ ) \ - for ( dim_t d = 0; d < dfac; d++ ) \ - PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; mn++ ) \ - for ( dim_t d = 0; d < dfac; d++ ) \ - PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mn*inca), *(pi1 + mn*dfac + d) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \ + else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ - PASTEMAC(ch,scal2bbs_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - a, inca, lda, \ - p, dfac, ldp \ - ); \ -\ - const dim_t i = cdim; \ - /* use ldp instead of mnr, in case the latter is -1 \ - this may write extra zeros, but not too many \ - this also automatically accounts for dfac when \ - using set0s_mxn instead of set0bbs_mxn */ \ - const dim_t m_edge = ldp - cdim*dfac; \ - const dim_t n_edge = n_max; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (i )*dfac; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ + if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \ + else 
PACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \ } \ \ - if ( n < n_max ) \ - { \ - const dim_t j = n; \ - /* use ldp instead of mnr, in case the latter is -1 \ - this may write extra zeros, but not too many \ - this also automatically accounts for dfac when \ - using set0s_mxn instead of set0bbs_mxn */ \ - const dim_t m_edge = ldp; \ - const dim_t n_edge = n_max - n; \ - ctype* restrict p_cast = p; \ - ctype* restrict p_edge = p_cast + (j )*ldp; \ -\ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m_edge, \ - n_edge, \ - p_edge, 1, ldp \ - ); \ - } \ + PASTEMAC(ch,set0s_edge) \ + ( \ + cdim, cdim_max, \ + n, n_max, \ + p, ldp \ + ); \ } INSERT_GENTFUNC_BASIC4( packm_mrxk, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c index a914e07660..4f19ddfef3 100644 --- a/ref_kernels/1m/bli_unpackm_cxk_ref.c +++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c @@ -34,6 +34,21 @@ #include "blis.h" +#define UNPACKM_BODY( ctype, ch, pragma, cdim, inca, op ) \ +\ +do \ +{ \ + for ( dim_t k = n; k != 0; --k ) \ + { \ + pragma \ + for ( dim_t mn = 0; mn < cdim; mn++ ) \ + PASTEMAC(ch,op)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \ +\ + alpha1 += lda; \ + pi1 += ldp; \ + } \ +} while(0) + #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ \ @@ -62,70 +77,19 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ if ( inca == 1 ) \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; mn++ ) \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; mn++ ) \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*1) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, 
ch, PRAGMA_SIMD, mnr, 1, scal2js ); \ + else UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \ } \ else \ { \ - if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; mn++ ) \ - PASTEMAC(ch,scal2js)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ - else \ - { \ - for ( dim_t k = n; k != 0; --k ) \ - { \ - PRAGMA_SIMD \ - for ( dim_t mn = 0; mn < mnr; mn++ ) \ - PASTEMAC(ch,scal2s)( *kappa_cast, *(pi1 + mn*dfac), *(alpha1 + mn*inca) ); \ -\ - alpha1 += lda; \ - pi1 += ldp; \ - } \ - } \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \ + else UNPACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ - PASTEMAC(ch,scal2s_mxn) \ - ( \ - conja, \ - cdim, \ - n, \ - kappa, \ - p, dfac, ldp, \ - a, inca, lda \ - ); \ + if ( bli_is_conj( conja ) ) UNPACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \ + else UNPACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \ } \ } diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 53b9aa551f..3110077101 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -222,7 +222,7 @@ void GENBARNAME(cntx_init) func_t* funcs; mbool_t* mbools; dim_t i; - void** vfuncs; + void_fp* vfuncs; // -- Clear the context ---------------------------------------------------- @@ -283,7 +283,7 @@ void GENBARNAME(cntx_init) BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, - -1 + BLIS_VA_END ); @@ -472,13 +472,13 @@ void GENBAINAME(cntx_init) if ( method == BLIS_1M ) { - gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_1er_ker_name ); + gen_func_init_co( &funcs[ 
BLIS_PACKM_NRXK_KER ], packm_nrxk_1er_ker_name ); } else // if ( method == BLIS_NAT ) { - gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); } gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); @@ -516,22 +516,14 @@ void GENBAINAME(cntx_init_blkszs) cntx_t* cntx ) { - // We MUST set the induced method in the context prior to calling - // bli_cntx_l3_vir_ukr_prefers_cols_dt() because that function queries - // the induced method. That function needs the induced method value in - // order to determine whether to evaluate the "prefers column storage" - // predicate using the storage preference of the kernel for dt, or - // the storage preference of the kernel for the real projection of - // dt. Failing to set the induced method here can lead to strange - // undefined behavior at runtime if the native complex kernel's - // storage preference happens to not equal that of the native real - // kernel. + // Set the induced method in the context. bli_cntx_set_method( method, cntx ); - num_t dt_r = bli_dt_proj_to_real( dt ); + num_t dt_r = bli_dt_proj_to_real( dt ); // Initialize the blocksizes according to the micro-kernel preference as // well as the algorithm. + //if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) if ( ! bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) { // This branch is used for algorithm 1m_c_bp. 
@@ -545,7 +537,7 @@ void GENBAINAME(cntx_init_blkszs) BLIS_NR, 1.0, 1.0, BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) BLIS_KR, 1.0, 1.0, - -1 + BLIS_VA_END ); } else // if ( bli_cntx_get_ukr_prefs_dt( dt, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) @@ -561,7 +553,7 @@ void GENBAINAME(cntx_init_blkszs) BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) BLIS_MR, 1.0, 1.0, BLIS_KR, 1.0, 1.0, - -1 + BLIS_VA_END ); } } From 0b86e10b7b4439bfef04c5455f2df7d24459240a Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 16 Feb 2022 12:22:57 -0600 Subject: [PATCH 09/32] New packing kernels completely working. - bli_packm_struc_cxk has been completely rewritten to combine nat/1m execution and use a special packing kernel for diagonal blocks. - *all* reference kernels now respect broadcast packing for A and/or B. This works for all l3 operations (even trsm!) and with 1m. --- common.mk | 2 + frame/1m/bli_l1m_ft_ker.h | 17 +- frame/1m/bli_l1m_ker.h | 20 +- frame/1m/bli_l1m_ker_prot.h | 19 +- frame/1m/{packm => other}/bli_packm_cxk.c | 0 frame/1m/{packm => other}/bli_packm_cxk.h | 0 frame/1m/{packm => other}/bli_packm_cxk_1er.c | 2 +- frame/1m/{packm => other}/bli_packm_cxk_1er.h | 0 .../bli_packm_struc_cxk_1er.c | 0 .../bli_packm_struc_cxk_1er.h | 0 frame/1m/{unpackm => other}/bli_unpackm_cxk.c | 0 frame/1m/{unpackm => other}/bli_unpackm_cxk.h | 0 frame/1m/packm/bli_packm.h | 4 - frame/1m/packm/bli_packm_blk_var1.c | 8 +- frame/1m/packm/bli_packm_struc_cxk.c | 655 ++++++------------ frame/1m/unpackm/bli_unpackm.h | 2 - frame/1m/unpackm/bli_unpackm_blk_var1.c | 9 +- frame/3/bli_l3_sup_packm_var.c | 10 +- frame/include/bli_param_macro_defs.h | 8 + frame/include/bli_type_defs.h | 4 + ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c | 336 +++++++++ ref_kernels/1m/bli_packm_cxc_diag_ref.c | 173 +++++ ref_kernels/1m/bli_packm_cxk_1er_ref.c | 4 +- ref_kernels/1m/bli_packm_cxk_ref.c | 2 +- ref_kernels/1m/bli_unpackm_cxk_ref.c | 3 +- ref_kernels/3/bli_gemmtrsm_ref.c | 18 +- 
ref_kernels/3/bli_trsm_ref.c | 19 +- ref_kernels/bli_cntx_ref.c | 51 +- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 50 +- ref_kernels/ind/bli_trsm1m_ref.c | 116 ++-- testsuite/src/test_trsm_ukr.c | 4 +- 31 files changed, 955 insertions(+), 581 deletions(-) rename frame/1m/{packm => other}/bli_packm_cxk.c (100%) rename frame/1m/{packm => other}/bli_packm_cxk.h (100%) rename frame/1m/{packm => other}/bli_packm_cxk_1er.c (97%) rename frame/1m/{packm => other}/bli_packm_cxk_1er.h (100%) rename frame/1m/{packm => other}/bli_packm_struc_cxk_1er.c (100%) rename frame/1m/{packm => other}/bli_packm_struc_cxk_1er.h (100%) rename frame/1m/{unpackm => other}/bli_unpackm_cxk.c (100%) rename frame/1m/{unpackm => other}/bli_unpackm_cxk.h (100%) create mode 100644 ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c create mode 100644 ref_kernels/1m/bli_packm_cxc_diag_ref.c diff --git a/common.mk b/common.mk index 8d77e4c5f6..5d681132f4 100644 --- a/common.mk +++ b/common.mk @@ -120,6 +120,8 @@ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ + -DBLIS_IN_KERNEL=1 \ + -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index 2e813cf4a6..41d80e217d 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -102,35 +102,40 @@ INSERT_GENTDEF( packm_cxk ) \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ - conj_t conjp, \ + conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); INSERT_GENTDEF( unpackm_cxk ) -// packm_1er_ker +// packm_diag_ker #undef GENTDEF #define GENTDEF( ctype, ch, opname, tsuf ) \ \ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ( \ + struc_t struca, \ + diag_t diaga, \ + 
uplo_t uploa, \ conj_t conja, \ pack_t schema, \ + bool invdiag, \ dim_t cdim, \ - dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); -INSERT_GENTDEF( packm_cxk_1er ) +INSERT_GENTDEF( packm_cxc_diag ) #endif diff --git a/frame/1m/bli_l1m_ker.h b/frame/1m/bli_l1m_ker.h index 535217b55f..970c5f040d 100644 --- a/frame/1m/bli_l1m_ker.h +++ b/frame/1m/bli_l1m_ker.h @@ -63,8 +63,26 @@ INSERT_GENTPROT_BASIC0( unpackm_nrxk_ker_name ) // 1e/1r packm kernels #undef GENTPROT -#define GENTPROT PACKM_1ER_KER_PROT +#define GENTPROT PACKM_KER_PROT INSERT_GENTPROT_BASIC0( packm_mrxk_1er_ker_name ) INSERT_GENTPROT_BASIC0( packm_nrxk_1er_ker_name ) + +// packm kernels for diagonal blocks + +#undef GENTPROT +#define GENTPROT PACKM_DIAG_KER_PROT + +INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_ker_name ) + + +// 1e/1r packm kernels for diagonal blocks + +#undef GENTPROT +#define GENTPROT PACKM_DIAG_KER_PROT + +INSERT_GENTPROT_BASIC0( packm_mrxmr_diag_1er_ker_name ) +INSERT_GENTPROT_BASIC0( packm_nrxnr_diag_1er_ker_name ) + diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h index 02d3296220..80284ea223 100644 --- a/frame/1m/bli_l1m_ker_prot.h +++ b/frame/1m/bli_l1m_ker_prot.h @@ -37,7 +37,7 @@ // Define template prototypes for level-1m kernels. 
// -// native packm kernels +// packm kernels #define PACKM_KER_PROT( ctype, ch, varname ) \ \ @@ -55,35 +55,40 @@ void PASTEMAC(ch,varname) \ ); -// native unpackm kernels +// unpackm kernels #define UNPACKM_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ conj_t conja, \ + pack_t schema, \ + dim_t cdim, \ dim_t n, \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); -// 1e/1r packm kernels +// packm kernels for diagonal blocks -#define PACKM_1ER_KER_PROT( ctype, ch, varname ) \ +#define PACKM_DIAG_KER_PROT( ctype, ch, varname ) \ \ void PASTEMAC(ch,varname) \ ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ conj_t conja, \ pack_t schema, \ + bool invdiag, \ dim_t cdim, \ - dim_t n, \ dim_t n_max, \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* restrict cntx \ ); diff --git a/frame/1m/packm/bli_packm_cxk.c b/frame/1m/other/bli_packm_cxk.c similarity index 100% rename from frame/1m/packm/bli_packm_cxk.c rename to frame/1m/other/bli_packm_cxk.c diff --git a/frame/1m/packm/bli_packm_cxk.h b/frame/1m/other/bli_packm_cxk.h similarity index 100% rename from frame/1m/packm/bli_packm_cxk.h rename to frame/1m/other/bli_packm_cxk.h diff --git a/frame/1m/packm/bli_packm_cxk_1er.c b/frame/1m/other/bli_packm_cxk_1er.c similarity index 97% rename from frame/1m/packm/bli_packm_cxk_1er.c rename to frame/1m/other/bli_packm_cxk_1er.c index 82dfb7b2e1..0c63609e52 100644 --- a/frame/1m/packm/bli_packm_cxk_1er.c +++ b/frame/1m/other/bli_packm_cxk_1er.c @@ -55,7 +55,7 @@ void PASTEMAC(ch,opname) \ kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ + ukr_t ker_id = bli_is_col_packed( schema ) ? 
BLIS_PACKM_NRXK_1ER_KER : BLIS_PACKM_MRXK_1ER_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ diff --git a/frame/1m/packm/bli_packm_cxk_1er.h b/frame/1m/other/bli_packm_cxk_1er.h similarity index 100% rename from frame/1m/packm/bli_packm_cxk_1er.h rename to frame/1m/other/bli_packm_cxk_1er.h diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.c b/frame/1m/other/bli_packm_struc_cxk_1er.c similarity index 100% rename from frame/1m/packm/bli_packm_struc_cxk_1er.c rename to frame/1m/other/bli_packm_struc_cxk_1er.c diff --git a/frame/1m/packm/bli_packm_struc_cxk_1er.h b/frame/1m/other/bli_packm_struc_cxk_1er.h similarity index 100% rename from frame/1m/packm/bli_packm_struc_cxk_1er.h rename to frame/1m/other/bli_packm_struc_cxk_1er.h diff --git a/frame/1m/unpackm/bli_unpackm_cxk.c b/frame/1m/other/bli_unpackm_cxk.c similarity index 100% rename from frame/1m/unpackm/bli_unpackm_cxk.c rename to frame/1m/other/bli_unpackm_cxk.c diff --git a/frame/1m/unpackm/bli_unpackm_cxk.h b/frame/1m/other/bli_unpackm_cxk.h similarity index 100% rename from frame/1m/unpackm/bli_unpackm_cxk.h rename to frame/1m/other/bli_unpackm_cxk.h diff --git a/frame/1m/packm/bli_packm.h b/frame/1m/packm/bli_packm.h index 88657a7128..7d73bf903e 100644 --- a/frame/1m/packm/bli_packm.h +++ b/frame/1m/packm/bli_packm.h @@ -43,10 +43,6 @@ #include "bli_packm_part.h" #include "bli_packm_struc_cxk.h" -#include "bli_packm_struc_cxk_1er.h" - -#include "bli_packm_cxk.h" -#include "bli_packm_cxk_1er.h" // Mixed datatype support. 
#ifdef BLIS_ENABLE_GEMM_MD diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index edeeae2b98..e133911510 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -43,11 +43,11 @@ static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] = { { bli_spackm_struc_cxk, bli_cpackm_struc_cxk, bli_dpackm_struc_cxk, bli_zpackm_struc_cxk, } }, // 0001 row/col panels: 1m-expanded (1e) - { { NULL, bli_cpackm_struc_cxk_1er, - NULL, bli_zpackm_struc_cxk_1er, } }, + { { NULL, bli_cpackm_struc_cxk, + NULL, bli_zpackm_struc_cxk, } }, // 0010 row/col panels: 1m-reordered (1r) - { { NULL, bli_cpackm_struc_cxk_1er, - NULL, bli_zpackm_struc_cxk_1er, } }, + { { NULL, bli_cpackm_struc_cxk, + NULL, bli_zpackm_struc_cxk, } }, }; static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index 2a52c42def..7b8e41fb30 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -34,8 +34,8 @@ #include "blis.h" -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ +#undef GENTFUNCR +#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, cxk_kername, cxc_kername ) \ \ void PASTEMAC(ch,varname) \ ( \ @@ -58,460 +58,237 @@ void PASTEMAC(ch,varname) \ cntx_t* cntx \ ) \ { \ - /* Handle micro-panel packing based on the structure of the matrix - being packed. */ \ - if ( bli_is_general( strucc ) ) \ - { \ - /* For micro-panels of general matrices, we can call the pack - kernel front-end directly. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ - } \ - else if ( bli_is_herm_or_symm( strucc ) ) \ - { \ - /* Call a helper function for micro-panels of Hermitian/symmetric - matrices. 
*/ \ - PASTEMAC(ch,packm_herm_cxk) \ - ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ - cntx \ - ); \ - } \ - else /* ( bli_is_triangular( strucc ) ) */ \ - { \ - /* Call a helper function for micro-panels of triangular - matrices. */ \ - PASTEMAC(ch,packm_tri_cxk) \ - ( \ - strucc, \ - diagc, \ - uploc, \ - conjc, \ - schema, \ - invdiag, \ - panel_dim, \ - panel_len, \ - panel_dim_max, \ - panel_len_max, \ - panel_dim_off, \ - panel_len_off, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - is_p, \ - cntx \ - ); \ - } \ -} - -INSERT_GENTFUNC_BASIC( packm_struc_cxk, packm_cxk ) - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ - ) \ -{ \ - doff_t diagoffc = panel_dim_off - panel_len_off; \ - doff_t diagoffc_abs; \ - dim_t i, j; \ -\ - /* Handle the case where the micro-panel does NOT intersect the - diagonal separately from the case where it does intersect. */ \ - if ( !bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) \ - { \ - /* If the current panel is unstored, we need to make a few - adjustments so we refer to the data where it is actually - stored, also taking conjugation into account. (Note this - implicitly assumes we are operating on a dense panel - within a larger symmetric or Hermitian matrix, since a - general matrix would not contain any unstored region.) 
*/ \ - if ( bli_is_unstored_subpart_n( diagoffc, uploc, panel_dim, panel_len ) ) \ - { \ - c = c + diagoffc * ( doff_t )ldc + \ - -diagoffc * ( doff_t )incc; \ - bli_swap_incs( &incc, &ldc ); \ -\ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc ); \ - } \ -\ - /* Pack the full panel. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ - } \ - else /* if ( bli_intersects_diag_n( diagoffc, panel_dim, panel_len ) ) */ \ - { \ - ctype* restrict c10; \ - ctype* restrict p10; \ - dim_t p10_dim, p10_len; \ - inc_t incc10, ldc10; \ - doff_t diagoffc10; \ - conj_t conjc10; \ -\ - ctype* restrict c12; \ - ctype* restrict p12; \ - dim_t p12_dim, p12_len; \ - inc_t incc12, ldc12; \ - doff_t diagoffc12; \ - conj_t conjc12; \ -\ - /* Sanity check. Diagonals should not intersect the short end of - a micro-panel. If they do, then somehow the constraints on - cache blocksizes being a whole multiple of the register - blocksizes was somehow violated. */ \ - if ( diagoffc < 0 ) \ - bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ -\ - diagoffc_abs = bli_abs( diagoffc ); \ -\ - if ( bli_is_lower( uploc ) ) \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs; \ - p10 = p; \ - c10 = c; \ - incc10 = incc; \ - ldc10 = ldc; \ - conjc10 = conjc; \ + num_t dt = PASTEMAC(ch,type); \ + num_t dt_r = PASTEMAC(chr,type); \ + dim_t panel_len_pad = panel_len_max - panel_len; \ \ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - diagoffc12 = diagoffc_abs - j; \ - p12 = p + (j )*ldp; \ - c12 = c + (j )*ldc; \ - c12 = c12 + diagoffc12 * ( doff_t )ldc + \ - -diagoffc12 * ( doff_t )incc; \ - incc12 = ldc; \ - ldc12 = incc; \ - conjc12 = conjc; \ + bszid_t bsz_id = bli_is_col_packed( schema ) ? 
BLIS_NR : BLIS_MR; \ + dim_t packmrnr = bli_cntx_get_blksz_max_dt( dt, bsz_id, cntx ); \ + dim_t packmrnr_r = bli_cntx_get_blksz_max_dt( dt_r, bsz_id, cntx ); \ \ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc12 ); \ - } \ - else /* if ( bli_is_upper( uploc ) ) */ \ - { \ - p10_dim = panel_dim; \ - p10_len = diagoffc_abs + panel_dim; \ - diagoffc10 = diagoffc; \ - p10 = p; \ - c10 = c; \ - c10 = c10 + diagoffc10 * ( doff_t )ldc + \ - -diagoffc10 * ( doff_t )incc; \ - incc10 = ldc; \ - ldc10 = incc; \ - conjc10 = conjc; \ + ukr_t cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ + ukr_t cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER : BLIS_PACKM_MRXMR_DIAG_KER; \ \ - p12_dim = panel_dim; \ - p12_len = panel_len - p10_len; \ - j = p10_len; \ - p12 = p + (j )*ldp; \ - c12 = c + (j )*ldc; \ - incc12 = incc; \ - ldc12 = ldc; \ - conjc12 = conjc; \ + if ( bli_is_1m_packed( schema ) ) \ + { \ + cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER : BLIS_PACKM_MRXK_1ER_KER; \ + cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_1ER_KER : BLIS_PACKM_MRXMR_DIAG_1ER_KER; \ + } \ \ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc10 ); \ - } \ + PASTECH2(ch,cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \ + PASTECH2(ch,cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \ \ - /* Pack to p10. For upper storage, this includes the unstored - triangle of c11. */ \ - /* NOTE: Since we're only packing partial panels here, we pass in - p1x_len as panel_len_max; otherwise, the packm kernel will zero- - fill the columns up to panel_len_max, which is not what we need - or want to happen. 
*/ \ - PASTEMAC(ch,kername) \ - ( \ - conjc10, \ - schema, \ - p10_dim, \ - panel_dim_max, \ - p10_len, \ - p10_len, \ - kappa, \ - c10, incc10, ldc10, \ - p10, ldp, \ - cntx \ - ); \ + /* For general matrices, pack and return early */ \ + if ( bli_is_general( strucc ) ) \ + { \ + f_cxk \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + return; \ + } \ \ - /* Pack to p12. For lower storage, this includes the unstored - triangle of c11. */ \ - /* NOTE: Since we're only packing partial panels here, we pass in - p1x_len as panel_len_max; otherwise, the packm kernel will zero- - fill the columns up to panel_len_max, which is not what we need - or want to happen. */ \ - PASTEMAC(ch,kername) \ - ( \ - conjc12, \ - schema, \ - p12_dim, \ - panel_dim_max, \ - p12_len, \ - p12_len, \ - kappa, \ - c12, incc12, ldc12, \ - p12, ldp, \ - cntx \ - ); \ -\ - /* Pack the stored triangle of c11 to p11. */ \ - { \ - dim_t p11_m = panel_dim; \ - dim_t p11_n = panel_dim; \ - dim_t j2 = diagoffc_abs; \ - ctype* restrict c11 = c + (j2 )*ldc; \ - ctype* restrict p11 = p + (j2 )*ldp; \ - trans_t transc = ( trans_t )conjc; \ -\ - PASTEMAC2(ch,copym,BLIS_TAPI_EX_SUF) \ - ( \ - 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - transc, \ - p11_m, \ - p11_n, \ - c11, incc, ldc, \ - p11, 1, ldp, \ - cntx, \ - NULL \ - ); \ -\ - /* If source matrix c is Hermitian, we have to zero out the - imaginary components of the diagonal of p11 in case the - corresponding elements in c11 were not already zero. */ \ - if ( bli_is_hermitian( strucc ) ) \ - { \ - ctype* restrict pi11 = p11; \ -\ - for ( i = 0; i < p11_m; ++i ) \ - { \ - PASTEMAC(ch,seti0s)( *pi11 ); \ -\ - pi11 += 1 + ldp; \ - } \ - } \ + /* Sanity check. Diagonals should not intersect the short end of + a micro-panel. If they do, then somehow the constraints on + cache blocksizes being a whole multiple of the register + blocksizes was somehow violated. 
*/ \ + doff_t diagoffc = panel_dim_off - panel_len_off; \ + if ( ( -panel_dim < diagoffc && diagoffc < 0 ) || \ + ( panel_len-panel_dim < diagoffc && diagoffc < panel_len ) ) \ + bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ - /* Now that the diagonal has been made explicitly Hermitian - (if applicable), we can now safely scale the stored - triangle specified by uploc. */ \ - PASTEMAC2(ch,scalm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - uploc, \ - p11_m, \ - p11_n, \ - kappa, \ - p11, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC( packm_herm_cxk, packm_cxk ) - - - - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname, kername ) \ + /* For triangular, symmetric, and hermitian matrices we need to consider three parts. */ \ \ -void PASTEMAC(ch,varname) \ - ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ - ) \ -{ \ - doff_t diagoffc = panel_dim_off - panel_len_off; \ + /* Pack to p10. */ \ + if ( 0 < diagoffc ) \ + { \ + dim_t p10_dim = panel_dim; \ + dim_t p10_len = bli_min( diagoffc, panel_len ); \ + dim_t p10_len_max = p10_len == panel_len ? panel_len_max : p10_len; \ + ctype* p10 = p; \ + conj_t conjc10 = conjc; \ + ctype* c10 = c; \ + inc_t incc10 = incc; \ + inc_t ldc10 = ldc; \ \ - /* Pack the panel. 
*/ \ - PASTEMAC(ch,kername) \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_dim_max, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ + if ( bli_is_upper( uploc ) ) \ + { \ + bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \ \ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( &conjc10 ); \ + } \ \ - /* If the diagonal of c is implicitly unit, explicitly set the - the diagonal of the packed panel to kappa. */ \ - if ( bli_is_unit_diag( diagc ) ) \ - { \ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - diagoffc, \ - panel_dim, \ - panel_len, \ - kappa, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ + /* If we are referencing the unstored part of a triangular matrix, explicitly store zeros */ \ + if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \ + { \ + if ( bli_is_1m_packed( schema ) ) \ + { \ + ctype_r* restrict zero = PASTEMAC(chr,0); \ + PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr_r, \ + p10_len_max * 2, \ + zero, \ + ( ctype_r* )p10, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr, \ + p10_len_max, \ + zero, \ + p10, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else \ + { \ + f_cxk \ + ( \ + conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + p10_len_max, \ + kappa, \ + c10, incc10, ldc10, \ + p10, ldp, \ + cntx \ + ); \ + } \ + } \ \ - /* If requested, invert the diagonal of the packed panel. */ \ - if ( invdiag == TRUE ) \ - { \ - PASTEMAC2(ch,invertd,BLIS_TAPI_EX_SUF) \ - ( \ - diagoffc, \ - panel_dim, \ - panel_len, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ + /* Pack to p11. 
*/ \ + if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \ + { \ + dim_t i = diagoffc; \ + dim_t p11_dim = panel_dim; \ + dim_t p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len ? panel_len_pad : 0 ); \ + ctype* p11 = p + i * ldp; \ + conj_t conjc11 = conjc; \ + ctype* c11 = c + i * ldc; \ + inc_t incc11 = incc; \ + inc_t ldc11 = ldc; \ \ - /* Set the region opposite the diagonal of p to zero. To do this, - we need to reference the "unstored" region on the other side of - the diagonal. This amounts to toggling uploc and then shifting - the diagonal offset to shrink the newly referenced region (by - one diagonal). Note that this zero-filling is not needed for - trsm, since the unstored region is not referenced by the trsm - micro-kernel; however, zero-filling is needed for trmm, which - uses the gemm micro-kernel.*/ \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - uplo_t uplop = uploc; \ + f_cxc \ + ( \ + strucc, \ + diagc, \ + uploc, \ + conjc11, \ + schema, \ + invdiag, \ + p11_dim, \ + p11_len_max, \ + kappa, \ + c11, incc11, ldc11, \ + p11, ldp, \ + cntx \ + ); \ + } \ \ - bli_toggle_uplo( &uplop ); \ - bli_shift_diag_offset_to_shrink_uplo( uplop, &diagoffc ); \ + /* Pack to p12. */ \ + if ( diagoffc + panel_dim < panel_len ) \ + { \ + dim_t i = bli_max( 0, diagoffc + panel_dim ); \ + dim_t p12_dim = panel_dim; \ + dim_t p12_len = panel_len - i; \ + /* If we are packing p12, then it is always the last partial block \ + and so we should make sure to pad with zeros if necessary. 
*/ \ + dim_t p12_len_max = p12_len + panel_len_pad; \ + ctype* p12 = p + i * ldp; \ + conj_t conjc12 = conjc; \ + ctype* c12 = c + i * ldc; \ + inc_t incc12 = incc; \ + inc_t ldc12 = ldc; \ \ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - diagoffc, \ - BLIS_NONUNIT_DIAG, \ - uplop, \ - panel_dim, \ - panel_len, \ - zero, \ - p, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ + if ( bli_is_lower( uploc ) ) \ + { \ + bli_reflect_to_stored_part( -panel_dim, c12, incc12, ldc12 ); \ \ - /* If this panel is an edge case in both panel dimension and length, - then it must be a bottom-right corner case. Set the part of the - diagonal that extends into the zero-padded region to identity. - NOTE: This is actually only necessary when packing for trsm, as - it helps prevent NaNs and Infs from creeping into the computation. - However, we set the region to identity for trmm as well. Those - 1.0's end up getting muliplied by the 0.0's in the zero-padded - region of the other matrix, so there is no harm in this. 
*/ \ - if ( panel_dim != panel_dim_max && \ - panel_len != panel_len_max ) \ - { \ - ctype* restrict one = PASTEMAC(ch,1); \ - dim_t i = panel_dim; \ - dim_t j = panel_len; \ - dim_t m_br = panel_dim_max - i; \ - dim_t n_br = panel_len_max - j; \ - ctype* p_br = p + (i ) + (j )*ldp; \ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( &conjc12 ); \ + } \ \ - PASTEMAC2(ch,setd,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - m_br, \ - n_br, \ - one, \ - p_br, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ + /* If we are referencing the unstored part of a triangular matrix, explicitly store zeros */ \ + if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \ + { \ + if ( bli_is_1m_packed( schema ) ) \ + { \ + ctype_r* restrict zero = PASTEMAC(chr,0); \ + PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr_r, \ + p12_len_max * 2, \ + zero, \ + ( ctype_r* )p12, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ + PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr, \ + p12_len_max, \ + zero, \ + p12, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else \ + { \ + f_cxk \ + ( \ + conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + p12_len_max, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp, \ + cntx \ + ); \ + } \ + } \ } -INSERT_GENTFUNC_BASIC( packm_tri_cxk, packm_cxk ) +INSERT_GENTFUNCR_BASIC2( packm_struc_cxk, packm_cxk, packm_cxc_diag ) diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index 5e45428410..80fa3804a1 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -37,5 +37,3 @@ #include "bli_unpackm_int.h" #include "bli_unpackm_blk_var1.h" - -#include "bli_unpackm_cxk.h" diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index e44fd15e4e..cbd9045d9d 
100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -210,6 +210,13 @@ void PASTEMAC(ch,varname) \ m_panel_full = &panel_dim_i; \ n_panel_full = &n; \ } \ +\ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER : BLIS_UNPACKM_MRXK_KER; \ +\ + /* Query the context for the unpackm kernel corresponding to the current + panel dimension, or kernel id. */ \ + PASTECH2(ch,unpackm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* Compute the total number of iterations we'll need. */ \ num_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ @@ -256,7 +263,7 @@ void PASTEMAC(ch,varname) \ else \ { \ /* Pack the current panel. */ \ - PASTEMAC(ch,unpackm_cxk) \ + f \ ( \ BLIS_NO_CONJUGATE, \ schema, \ diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 85fb246f01..f54e5f1256 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -122,6 +122,13 @@ void PASTEMAC(ch,varname) \ ldc = cs_c; \ ldp = cs_p; \ } \ +\ + num_t dt = PASTEMAC(ch,type); \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ +\ + /* Query the context for the unpackm kernel corresponding to the current + panel dimension, or kernel id. */ \ + PASTECH2(ch,packm_cxk,_ker_ft) f = bli_cntx_get_ukr_dt( dt, ker_id, cntx ); \ \ /* Compute the total number of iterations we'll need. */ \ n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 ); \ @@ -171,12 +178,11 @@ void PASTEMAC(ch,varname) \ or round-robin partitioning was requested at configure-time. 
*/ \ if ( bli_packm_my_iter( it, it_start, it_end, tid, nt ) ) \ { \ - PASTEMAC(ch,packm_cxk) \ + f \ ( \ conjc, \ schema, \ panel_dim_i, \ - panel_dim_max, \ panel_len_i, \ panel_len_max_i, \ kappa_cast, \ diff --git a/frame/include/bli_param_macro_defs.h b/frame/include/bli_param_macro_defs.h index 688a4fcd03..1822065dab 100644 --- a/frame/include/bli_param_macro_defs.h +++ b/frame/include/bli_param_macro_defs.h @@ -777,6 +777,14 @@ BLIS_INLINE void bli_reflect_about_diag( doff_t* diagoff, uplo_t* uplo, dim_t* m bli_toggle_uplo( uplo ); } +// we don't know the type of a, so this must be a macro +// rs_a and cs_a must be variables and not expressions +#define bli_reflect_to_stored_part( diagoff, a, rs_a, cs_a ) \ +do { \ + a += ( diagoff ) * ( cs_a - rs_a ); \ + bli_swap_incs( &rs_a, &cs_a ); \ +} while (0) \ + BLIS_INLINE void bli_reverse_index_direction( dim_t n, dim_t* start, dim_t* end ) { dim_t start2 = n - *start; diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 7ecb0a233b..4e64f37116 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -654,6 +654,10 @@ typedef enum BLIS_PACKM_NRXK_KER, BLIS_PACKM_MRXK_1ER_KER, BLIS_PACKM_NRXK_1ER_KER, + BLIS_PACKM_MRXMR_DIAG_KER, + BLIS_PACKM_NRXNR_DIAG_KER, + BLIS_PACKM_MRXMR_DIAG_1ER_KER, + BLIS_PACKM_NRXNR_DIAG_1ER_KER, // unpack kernels BLIS_UNPACKM_MRXK_KER, diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c new file mode 100644 index 0000000000..17ed9bef65 --- /dev/null +++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c @@ -0,0 +1,336 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. 
+ + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define PACKM_SET1_1E( chr, mnk ) \ +do { \ + PASTEMAC(chr,set1s)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set1s)( *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ +} while (0) + +#define PACKM_SET1_1R( chr, mnk ) \ +do { \ + PASTEMAC(chr,set1s)( *(pi1_r + mnk*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_i + mnk*dfac + d + mnk*ldp2) ); \ +} while (0) + +#define PACKM_SCAL_1E( ch, mn, k, op ) \ +do { \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn *inca2 + 0 + k*lda2), \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + *(pi1_ri + (mn*2 + 0)*dfac + d + k*ldp2), \ + *(pi1_ri + (mn*2 + 1)*dfac + d + k*ldp2) ); \ + PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn *inca2 + 0 + k*lda2), \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ + *(pi1_ir + (mn*2 + 0)*dfac + d + k*ldp2), \ + *(pi1_ir + (mn*2 + 1)*dfac + d + k*ldp2) ); \ +} while (0) + +#define PACKM_SCAL_1R( ch, mn, k, op ) \ +do { \ + PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0 + k*lda2), \ + *(alpha1 + mn*inca2 + 1 + k*lda2), \ + *(pi1_r + mn*dfac + d + k*ldp2), \ + *(pi1_i + mn*dfac + d + k*ldp2) ); \ +} while (0) + +#define PACKM_DIAG_1E_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \ +\ +do \ +{ \ + /* PACKM_SCAL_1E assumes inca2 and lda2 are the strides to use. 
*/ \ + dim_t inca2 = inca2_lu; \ + dim_t lda2 = lda2_lu; \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PACKM_SCAL_1E( ch, mn, k, op ); \ +} while(0) + +#define PACKM_DIAG_BODY_1E_L( ch, op ) \ + PACKM_DIAG_1E_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) + +#define PACKM_DIAG_BODY_1E_U( ch, op ) \ + PACKM_DIAG_1E_BODY( ch, 0, k, inca_u2, lda_u2, op ) + +#define PACKM_DIAG_1R_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \ +\ +do \ +{ \ + /* PACKM_SCAL_1R assumes inca2 and lda2 are the strides to use. */ \ + dim_t inca2 = inca2_lu; \ + dim_t lda2 = lda2_lu; \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PACKM_SCAL_1R( ch, mn, k, op ); \ +} while(0) + +#define PACKM_DIAG_BODY_1R_L( ch, op ) \ + PACKM_DIAG_1R_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) + +#define PACKM_DIAG_BODY_1R_U( ch, op ) \ + PACKM_DIAG_1R_BODY( ch, 0, k, inca_u2, lda_u2, op ) + +#undef GENTFUNCCO +#define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ + conj_t conja, \ + pack_t schema, \ + bool invdiag, \ + dim_t cdim, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + ctype* restrict p, inc_t ldp, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt_r = PASTEMAC(chr,type); \ + const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt_r, mnr0, cntx ); \ + const dim_t dfac = bli_cntx_get_blksz_def_dt( dt_r, bb0, cntx ); \ +\ + /* start by zeroing out the whole block */ \ + PASTEMAC(chr,set0s_mxn) \ + ( \ + cdim_pack, \ + 2*n_max, \ + ( ctype_r* )p, 1, ldp \ + ); \ +\ + const inc_t inca2 = 2 * inca; \ + const inc_t lda2 = 2 * lda; \ + const inc_t ldp2 = 2 * ldp; \ +\ + ctype_r kappa_r = ( ( ctype_r* )kappa )[0]; \ + ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ + 
ctype_r* restrict alpha1 = ( ctype_r* )a; \ +\ + if ( bli_is_1e_packed( schema ) ) \ + { \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ +\ + ctype_r* restrict pi1_ri = ( ctype_r* )p; \ + ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ +\ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l2, &lda_l2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1E_L( ch, scal2ris ); \ + } \ +\ + /* write the strictly upper part if it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u2, &lda_u2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1E_U( ch, scal2ris ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1E( chr, mnk ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( -kappa_i, mu_r, *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( 
kappa_r, mu_r, *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + } \ + } \ + else if ( bli_is_conj( conja )) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1E( ch, mnk, mnk, scal2jris ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1E( ch, mnk, mnk, scal2ris ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,invertris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ + *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(ch,copyjris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ + *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ + *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ + *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + } \ + } \ +\ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1E( chr, mnk ); \ + } \ + else /* bli_is_1r_packed( schema ) */ \ + { \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ +\ + ctype_r* restrict pi1_r = ( ctype_r* )p; \ + ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ +\ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l2, &lda_l2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1R_L( ch, scal2ris ); \ + } \ +\ + /* write the strictly upper part if it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( 
bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u2, &lda_u2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1R_U( ch, scal2ris ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1R( chr, mnk ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_r + mnk*(dfac + ldp2) + d) ); \ + PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_i + mnk*(dfac + ldp2) + d) ); \ + } \ + } \ + else if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1R( ch, mnk, mnk, scal2jris ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1R( ch, mnk, mnk, scal2ris ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,invertris)( *(pi1_r + mnk*(dfac + ldp2) + d), \ + *(pi1_i + mnk*(dfac + ldp2) + d) ); \ + } \ +\ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1R( chr, mnk ); \ + } \ +} + +INSERT_GENTFUNCCO_BASIC4( packm_mrxmr_diag_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNCCO_BASIC4( packm_nrxnr_diag_1er, BLIS_NR, BLIS_BBN, 
BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) + diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c new file mode 100644 index 0000000000..bbfa1e3cc3 --- /dev/null +++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c @@ -0,0 +1,173 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +*/ + +#include "blis.h" + +#define PACKM_DIAG_BODY( ctype, ch, mn_min, mn_max, inca, lda, op ) \ +\ +do \ +{ \ + for ( dim_t k = 0; k < cdim; k++ ) \ + for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ + for ( dim_t d = 0; d < dfac; d++ ) \ + PASTEMAC(ch,op)( kappa_cast, *(alpha1 + mn*inca + k*lda), *(pi1 + mn*dfac + d + k*ldp) ); \ +} while(0) + +#define PACKM_DIAG_BODY_L( ctype, ch, op ) \ + PACKM_DIAG_BODY( ctype, ch, k+1, cdim, inca_l, lda_l, op ) + +#define PACKM_DIAG_BODY_U( ctype, ch, op ) \ + PACKM_DIAG_BODY( ctype, ch, 0, k, inca_u, lda_u, op ) + +#undef GENTFUNC +#define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ +\ +void PASTEMAC3(ch,opname,arch,suf) \ + ( \ + struc_t struca, \ + diag_t diaga, \ + uplo_t uploa, \ + conj_t conja, \ + pack_t schema, \ + bool invdiag, \ + dim_t cdim, \ + dim_t n_max, \ + ctype* restrict kappa, \ + ctype* restrict a, inc_t inca, inc_t lda, \ + ctype* restrict p, inc_t ldp, \ + cntx_t* restrict cntx \ + ) \ +{ \ + const num_t dt = PASTEMAC(ch,type); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \ + const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt, mnr0, cntx ); \ + const dim_t dfac = bli_cntx_get_blksz_def_dt( dt, bb0, cntx ); \ +\ + /* start by zeroing out the whole block */ \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + cdim_pack, \ + n_max, \ + p, 1, ldp \ + ); \ +\ + ctype kappa_cast = *( ctype* )kappa; \ + ctype* restrict alpha1 = a; \ + ctype* restrict pi1 = p; \ +\ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l = inca; \ + dim_t lda_l = lda; \ + conj_t conja_l = conja; \ +\ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l, &lda_l ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ +\ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctype, ch, scal2js ); \ + else PACKM_DIAG_BODY_L( ctype, ch, scal2s ); \ + } \ +\ + /* write the strictly upper part if 
it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u = inca; \ + dim_t lda_u = lda; \ + conj_t conja_u = conja; \ +\ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u, &lda_u ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ +\ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctype, ch, scal2js ); \ + else PACKM_DIAG_BODY_U( ctype, ch, scal2s ); \ + } \ +\ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype mu; \ + PASTEMAC(ch,copys)( *(alpha1 + mnk*(inca + lda)), mu ); \ + PASTEMAC(ch,seti0s)( mu ); \ + PASTEMAC(ch,scal2s)( kappa_cast, mu, *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + } \ + else if ( bli_is_conj( conja )) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ +\ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,inverts)( *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ +\ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ +} + 
+INSERT_GENTFUNC_BASIC4( packm_mrxmr_diag, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) +INSERT_GENTFUNC_BASIC4( packm_nrxnr_diag, BLIS_NR, BLIS_BBN, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) + diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index 94263ade10..06b83debaf 100644 --- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -130,7 +130,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ PASTEMAC(chr,set0s_edge) \ ( \ - 2*cdim, 2*cdim_max, \ + 2*cdim*dfac, 2*cdim_max*dfac, \ 2*n, 2*n_max, \ ( ctype_r* )p, ldp \ ); \ @@ -171,7 +171,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ PASTEMAC(chr,set0s_edge) \ ( \ - cdim, cdim_max, \ + cdim*dfac, cdim_max*dfac, \ 2*n, 2*n_max, \ ( ctype_r* )p, ldp \ ); \ diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c index 56169a364f..c385fca1ac 100644 --- a/ref_kernels/1m/bli_packm_cxk_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_ref.c @@ -96,7 +96,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ PASTEMAC(ch,set0s_edge) \ ( \ - cdim, cdim_max, \ + cdim*dfac, cdim_max*dfac, \ n, n_max, \ p, ldp \ ); \ diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c index 4f19ddfef3..73d98e2681 100644 --- a/ref_kernels/1m/bli_unpackm_cxk_ref.c +++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c @@ -58,10 +58,9 @@ void PASTEMAC3(ch,opname,arch,suf) \ pack_t schema, \ dim_t cdim, \ dim_t n, \ - dim_t n_max, \ ctype* restrict kappa, \ - ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ + ctype* restrict a, inc_t inca, inc_t lda, \ cntx_t* restrict cntx \ ) \ { \ diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 481a350cb6..465ee01d18 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -100,6 +100,15 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after 
gemm", mr, 2*nr, \ (double*)b11, rs_b, 1, "%5.2f", "" ); \ */ \ +\ + /* Broadcast the elements of the updated b11 submatrix to their + duplicated neighbors. */ \ + PASTEMAC(ch,bcastbbs_mxn) \ + ( \ + mr, \ + nr, \ + b11, rs_b, cs_b \ + ); \ \ /* b11 = inv(a11) * b11; c11 = b11; */ \ @@ -115,15 +124,6 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after gemm", mr, 2*nr, \ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11 after trsm", mr, 2*nr, \ (double*)b11, rs_b, 1, "%5.2f", "" ); \ */ \ -\ - /* Broadcast the elements of the updated b11 submatrix to their - duplicated neighbors. */ \ - PASTEMAC(ch,bcastbbs_mxn) \ - ( \ - mr, \ - nr, \ - b11, rs_b, cs_b \ - ); \ \ /* PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b0111p_r after", k+3, 8, \ diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c index cf80b2e19b..504849e4ef 100644 --- a/ref_kernels/3/bli_trsm_ref.c +++ b/ref_kernels/3/bli_trsm_ref.c @@ -111,7 +111,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ } \ } \ } @@ -152,13 +153,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ const inc_t rs_b = packnr; \ const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt, BLIS_BBN, cntx ); \ \ - dim_t iter, i, j, l; \ - dim_t n_behind; \ -\ - for ( iter = 0; iter < m; ++iter ) \ + for ( dim_t iter = 0; iter < m; ++iter ) \ { \ - i = m - iter - 1; \ - n_behind = iter; \ + dim_t i = m - iter - 1; \ + dim_t n_behind = iter; \ \ ctype* restrict alpha11 = a + (i )*rs_a + (i )*cs_a; \ ctype* restrict a12t = a + (i )*rs_a + (i+1)*cs_a; \ @@ -167,7 +165,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ - for ( j = 0; j < n; ++j ) \ + for ( dim_t j = 0; j < n; ++j ) \ { \ ctype* restrict beta11 = b1 + (0 )*rs_b + (j )*cs_b; \ ctype* restrict b21 = B2 + (0 )*rs_b + (j )*cs_b; \ @@ -177,7 +175,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ /* beta11 = beta11 - a12t * b21; */ \ PASTEMAC(ch,set0s)( rho11 ); \ - for ( l = 0; l < n_behind; ++l ) \ + for ( dim_t l = 0; l < n_behind; ++l ) \ { \ ctype* restrict alpha12 = a12t + (l )*cs_a; \ ctype* restrict beta21 = b21 + (l )*rs_b; \ @@ -197,7 +195,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. 
*/ \ - PASTEMAC(ch,copys)( beta11c, *beta11 ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ } \ } \ } diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 3110077101..69c546cd4b 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -117,6 +117,16 @@ #undef packm_nrxk_1er_ker_name #define packm_nrxk_1er_ker_name GENARNAME(packm_nrxk_1er) +#undef packm_mrxmr_diag_ker_name +#define packm_mrxmr_diag_ker_name GENARNAME(packm_mrxmr_diag) +#undef packm_nrxnr_diag_ker_name +#define packm_nrxnr_diag_ker_name GENARNAME(packm_nrxnr_diag) + +#undef packm_mrxmr_diag_1er_ker_name +#define packm_mrxmr_diag_1er_ker_name GENARNAME(packm_mrxmr_diag_1er) +#undef packm_nrxnr_diag_1er_ker_name +#define packm_nrxnr_diag_1er_ker_name GENARNAME(packm_nrxnr_diag_1er) + #undef unpackm_mrxk_ker_name #define unpackm_mrxk_ker_name GENARNAME(unpackm_mrxk) #undef unpackm_nrxk_ker_name @@ -232,20 +242,27 @@ void GENBARNAME(cntx_init) // -- Set blocksizes ------------------------------------------------------- - // s d c z - bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 ); - bli_blksz_init_easy( &blkszs[ BLIS_MR ], 4, 4, 4, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_NR ], 16, 8, 8, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); - bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); - bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); - bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 ); - bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 ); - bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); - bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); - bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); - bli_blksz_init_easy( &blkszs[ BLIS_BBM ], 1, 1, 1, 1 ); - bli_blksz_init_easy( &blkszs[ BLIS_BBN ], 1, 1, 1, 1 ); + // NOTE: The macro values for register blocksizes and packm broadcast factors are + // used here as 
defined in the bli_kernel_defs_.h or generic values from + // bli_kernel_macro_defs.h otherwise. Configurations should also initialize the + // blocksizes in the context explicitly, but using the correct values here helps + // to prevent accidents. + // s d c z + bli_blksz_init_easy( &blkszs[ BLIS_KR ], 1, 1, 1, 1 ); + bli_blksz_init ( &blkszs[ BLIS_MR ], BLIS_MR_s, BLIS_MR_d, BLIS_MR_c, BLIS_MR_z, + BLIS_PACKMR_s, BLIS_PACKMR_d, BLIS_PACKMR_c, BLIS_PACKMR_z ); + bli_blksz_init ( &blkszs[ BLIS_NR ], BLIS_NR_s, BLIS_NR_d, BLIS_NR_c, BLIS_NR_z, + BLIS_PACKNR_s, BLIS_PACKNR_d, BLIS_PACKNR_c, BLIS_PACKNR_z ); + bli_blksz_init_easy( &blkszs[ BLIS_MC ], 256, 128, 128, 64 ); + bli_blksz_init_easy( &blkszs[ BLIS_KC ], 256, 256, 256, 256 ); + bli_blksz_init_easy( &blkszs[ BLIS_NC ], 4096, 4096, 4096, 4096 ); + bli_blksz_init_easy( &blkszs[ BLIS_M2 ], 1000, 1000, 1000, 1000 ); + bli_blksz_init_easy( &blkszs[ BLIS_N2 ], 1000, 1000, 1000, 1000 ); + bli_blksz_init_easy( &blkszs[ BLIS_AF ], 8, 8, 8, 8 ); + bli_blksz_init_easy( &blkszs[ BLIS_DF ], 6, 6, 6, 6 ); + bli_blksz_init_easy( &blkszs[ BLIS_XF ], 4, 4, 4, 4 ); + bli_blksz_init_easy( &blkszs[ BLIS_BBM ], BLIS_BBM_s, BLIS_BBM_d, BLIS_BBM_c, BLIS_BBM_z ); + bli_blksz_init_easy( &blkszs[ BLIS_BBN ], BLIS_BBN_s, BLIS_BBN_d, BLIS_BBN_c, BLIS_BBN_z ); // -- Set level-3 small/unpacked thresholds -------------------------------- @@ -387,6 +404,12 @@ void GENBARNAME(cntx_init) gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ], packm_nrxk_1er_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_MRXMR_DIAG_KER ], packm_mrxmr_diag_ker_name ); + gen_func_init( &funcs[ BLIS_PACKM_NRXNR_DIAG_KER ], packm_nrxnr_diag_ker_name ); + + gen_func_init_co( &funcs[ BLIS_PACKM_MRXMR_DIAG_1ER_KER ], packm_mrxmr_diag_1er_ker_name ); + gen_func_init_co( &funcs[ BLIS_PACKM_NRXNR_DIAG_1ER_KER ], packm_nrxnr_diag_1er_ker_name ); + gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ], 
unpackm_mrxk_ker_name ); gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ], unpackm_nrxk_ker_name ); diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 28acf80fb3..23699b1b8b 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -87,7 +87,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict bx1_r = ( ctype_r* )bx1; \ \ const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ ctype_r* restrict zero_r = PASTEMAC(chr,0); \ ctype_r* restrict minus_one_r = PASTEMAC(chr,m1); \ @@ -168,24 +168,25 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ if ( bli_is_1e_packed( schema_b ) ) \ { \ - const inc_t ld_b = rs_b; \ + const inc_t ld_b = rs_b; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ - ctype* restrict b11_ri = ( ctype* )b11; \ - ctype* restrict b11_ir = ( ctype* )b11 + ld_b/2; \ -\ - dim_t i, j; \ + ctype_r* restrict b11_ri = ( ctype_r* )b11; \ + ctype_r* restrict b11_ir = ( ctype_r* )b11 + ld_b; \ \ /* b11 = alpha * b11 + bt; */ \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < mr; ++i ) \ + for ( dim_t d = 0; d < cs_b; ++d ) \ { \ - ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ - ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ - ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ - ctype* restrict beta11_ri = b11_ri + i*rs_b + j*cs_b; \ - ctype_r* restrict beta11_r = &PASTEMAC(ch,real)( *beta11_ri ); \ - ctype_r* restrict beta11_i = &PASTEMAC(ch,imag)( *beta11_ri ); \ - ctype* restrict beta11_ir = b11_ir + i*rs_b + j*cs_b; \ + ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ + ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ + ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ + ctype_r* restrict beta11_ri_r = b11_ri + i*rs_b2 + j*cs_b2 + 0*cs_b + d; 
\ + ctype_r* restrict beta11_ri_i = b11_ri + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \ + ctype_r* restrict beta11_ir_r = b11_ir + i*rs_b2 + j*cs_b2 + 0*cs_b + d; \ + ctype_r* restrict beta11_ir_i = b11_ir + i*rs_b2 + j*cs_b2 + 1*cs_b + d; \ \ PASTEMAC3(ch,chr,ch,xpbyris) \ ( \ @@ -193,12 +194,12 @@ void PASTEMAC3(ch,opname,arch,suf) \ *beta11t_i, \ alpha_r, \ alpha_i, /* alpha_i not referenced */ \ - *beta11_r, \ - *beta11_i \ + *beta11_ri_r, \ + *beta11_ri_i \ ); \ \ - PASTEMAC(ch,sets)( -*beta11_i, \ - *beta11_r, *beta11_ir ); \ + PASTEMAC(ch,copyris)( -*beta11_ri_i, *beta11_ri_r, \ + *beta11_ir_r, *beta11_ir_i ); \ } \ } \ else /* if ( bli_is_1r_packed( schema_b ) ) */ \ @@ -209,18 +210,17 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ ctype_r* restrict b11_r = ( ctype_r* )b11; \ ctype_r* restrict b11_i = ( ctype_r* )b11 + ld_b; \ -\ - dim_t i, j; \ \ /* b11 = alpha * b11 + bt; */ \ - for ( j = 0; j < nr; ++j ) \ - for ( i = 0; i < mr; ++i ) \ + for ( dim_t j = 0; j < nr; ++j ) \ + for ( dim_t i = 0; i < mr; ++i ) \ + for ( dim_t d = 0; d < cs_b; ++d ) \ { \ ctype* restrict beta11t = bt + i*rs_bt + j*cs_bt; \ ctype_r* restrict beta11t_r = &PASTEMAC(ch,real)( *beta11t ); \ ctype_r* restrict beta11t_i = &PASTEMAC(ch,imag)( *beta11t ); \ - ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2; \ - ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2; \ + ctype_r* restrict beta11_r = b11_r + i*rs_b2 + j*cs_b2 + d; \ + ctype_r* restrict beta11_i = b11_i + i*rs_b2 + j*cs_b2 + d; \ \ PASTEMAC3(ch,chr,ch,xpbyris) \ ( \ diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c index 68717f7a6c..175bc9e14a 100644 --- a/ref_kernels/ind/bli_trsm1m_ref.c +++ b/ref_kernels/ind/bli_trsm1m_ref.c @@ -48,6 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx 
); \ @@ -58,11 +59,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ \ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ @@ -77,12 +78,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ const inc_t rs_a2 = 1 * rs_a; \ const inc_t cs_a2 = 2 * cs_a; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ \ - ctype* restrict b_ri = ( ctype* )b; \ - ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ + ctype_r* restrict b_ri = ( ctype_r* )b; \ + ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -93,20 +96,22 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ ctype_r* restrict a10t_r = a_r + (i )*rs_a2 + (0 )*cs_a2; \ ctype_r* restrict a10t_i = a_i + (i )*rs_a2 + (0 )*cs_a2; \ - ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B0_ri = b_ri + (0 )*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict b1_ir = b_ir + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B0_ri = b_ri + (0 )*rs_b2 + (0 )*cs_b2; \ \ /* b1 = b1 - a10t * B0; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ - ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b01_ri = B0_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ - 
ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r* restrict beta11_ri_r = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ri_i = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict beta11_ir_r = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ir_i = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict b01_ri = B0_ri + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_ri_r; \ + ctype_r beta11c_i = *beta11_ri_i; \ ctype_r rho11_r; \ ctype_r rho11_i; \ \ @@ -117,9 +122,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ ctype_r* restrict alpha10_r = a10t_r + (l )*cs_a2; \ ctype_r* restrict alpha10_i = a10t_i + (l )*cs_a2; \ - ctype* restrict beta01_ri = b01_ri + (l )*rs_b; \ - ctype_r* restrict beta01_r = &PASTEMAC(ch,real)( *beta01_ri ); \ - ctype_r* restrict beta01_i = &PASTEMAC(ch,imag)( *beta01_ri ); \ + ctype_r* restrict beta01_r = b01_ri + (l )*rs_b2 + 0*cs_b; \ + ctype_r* restrict beta01_i = b01_ri + (l )*rs_b2 + 1*cs_b; \ \ PASTEMAC(ch,axpyris)( *alpha10_r, \ *alpha10_i, \ @@ -147,8 +151,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. */ \ - PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ - PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + { \ + PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ + PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ + } \ } \ } \ } \ @@ -229,10 +236,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. 
*/ \ - PASTEMAC(ch,copyris)( beta11c_r, \ - beta11c_i, \ - *beta11_r, \ - *beta11_i ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *(beta11_r + d), \ + *(beta11_i + d) ); \ } \ } \ } \ @@ -258,6 +266,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ const dim_t nr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ @@ -268,11 +277,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ const dim_t m = mr; \ const dim_t n = nr; \ \ - const inc_t rs_a = 1; \ - const inc_t cs_a = packmr; \ + const inc_t rs_a = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBM, cntx ); \ + const inc_t cs_a = packmr; \ \ - const inc_t rs_b = packnr; \ - const inc_t cs_b = 1; \ + const inc_t rs_b = packnr; \ + const inc_t cs_b = bli_cntx_get_blksz_def_dt( dt_r, BLIS_BBN, cntx ); \ \ const inc_t ld_a = cs_a; \ const inc_t ld_b = rs_b; \ @@ -287,12 +296,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ { \ const inc_t rs_a2 = 1 * rs_a; \ const inc_t cs_a2 = 2 * cs_a; \ + const inc_t rs_b2 = 2 * rs_b; \ + const inc_t cs_b2 = 2 * cs_b; \ \ ctype_r* restrict a_r = ( ctype_r* )a; \ ctype_r* restrict a_i = ( ctype_r* )a + ld_a; \ \ - ctype* restrict b_ri = ( ctype* )b; \ - ctype* restrict b_ir = ( ctype* )b + ld_b/2; \ + ctype_r* restrict b_ri = ( ctype_r* )b; \ + ctype_r* restrict b_ir = ( ctype_r* )b + ld_b; \ \ for ( iter = 0; iter < m; ++iter ) \ { \ @@ -303,20 +314,22 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict alpha11_i = a_i + (i )*rs_a2 + (i )*cs_a2; \ ctype_r* restrict a12t_r = a_r + (i )*rs_a2 + (i+1)*cs_a2; \ ctype_r* restrict a12t_i = a_i + (i )*rs_a2 + (i+1)*cs_a2; \ - ctype* restrict b1_ri = b_ri + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict b1_ir = b_ir + (i )*rs_b + (0 )*cs_b; \ - ctype* restrict B2_ri = b_ri + (i+1)*rs_b + (0 )*cs_b; \ + ctype_r* restrict b1_ri = b_ri + (i )*rs_b2 + (0 )*cs_b2; 
\ + ctype_r* restrict b1_ir = b_ir + (i )*rs_b2 + (0 )*cs_b2; \ + ctype_r* restrict B2_ri = b_ri + (i+1)*rs_b2 + (0 )*cs_b2; \ \ /* b1 = b1 - a12t * B2; */ \ /* b1 = b1 / alpha11; */ \ for ( j = 0; j < n; ++j ) \ { \ - ctype* restrict beta11_ri = b1_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict beta11_ir = b1_ir + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict b21_ri = B2_ri + (0 )*rs_b + (j )*cs_b; \ - ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ - ctype_r beta11c_r = PASTEMAC(ch,real)( *beta11_ri ); \ - ctype_r beta11c_i = PASTEMAC(ch,imag)( *beta11_ri ); \ + ctype_r* restrict beta11_ri_r = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ri_i = b1_ri + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict beta11_ir_r = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 0*cs_b; \ + ctype_r* restrict beta11_ir_i = b1_ir + (0 )*rs_b2 + (j )*cs_b2 + 1*cs_b; \ + ctype_r* restrict b21_ri = B2_ri + (0 )*rs_b2 + (j )*cs_b2; \ + ctype* restrict gamma11 = c + (i )*rs_c + (j )*cs_c; \ + ctype_r beta11c_r = *beta11_ri_r; \ + ctype_r beta11c_i = *beta11_ri_i; \ ctype_r rho11_r; \ ctype_r rho11_i; \ \ @@ -325,11 +338,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ rho11_i ); \ for ( l = 0; l < n_behind; ++l ) \ { \ - ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ - ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ - ctype* restrict beta21_ri = b21_ri + (l )*rs_b; \ - ctype_r* restrict beta21_r = &PASTEMAC(ch,real)( *beta21_ri ); \ - ctype_r* restrict beta21_i = &PASTEMAC(ch,imag)( *beta21_ri ); \ + ctype_r* restrict alpha12_r = a12t_r + (l )*cs_a2; \ + ctype_r* restrict alpha12_i = a12t_i + (l )*cs_a2; \ + ctype_r* restrict beta21_r = b21_ri + (l )*rs_b2 + 0*cs_b; \ + ctype_r* restrict beta21_i = b21_ri + (l )*rs_b2 + 1*cs_b; \ \ PASTEMAC(ch,axpyris)( *alpha12_r, \ *alpha12_i, \ @@ -357,8 +369,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. 
*/ \ - PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *beta11_ri ); \ - PASTEMAC(ch,sets)( -beta11c_i, beta11c_r, *beta11_ir ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + { \ + PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ + PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ + } \ } \ } \ } \ @@ -439,10 +454,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. */ \ - PASTEMAC(ch,copyris)( beta11c_r, \ - beta11c_i, \ - *beta11_r, \ - *beta11_i ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *(beta11_r + d), \ + *(beta11_i + d) ); \ } \ } \ } \ diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index b07da91cc8..9568dfee73 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -305,7 +305,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); if ( bli_obj_is_complex( &b ) ) *perf *= 4.0; // Perform checks. - libblis_test_trsm_ukr_check( params, side, &ap, &c, &b, resid ); + libblis_test_trsm_ukr_check( params, side, &a, &c, &b, resid ); // Zero out performance and residual if output matrix is empty. //libblis_test_check_empty_problem( &c, perf, resid ); @@ -418,9 +418,11 @@ void libblis_test_trsm_ukr_check bli_printm( "a11", a, "%5.2f", "" ); #endif +#if 0 // Restore the diagonal of a11 to its original, un-inverted state // (needed for trsv). bli_invertd( a ); +#endif if ( bli_is_left( side ) ) { From 034d88c6353f4914151c5581319ce5ef0483b251 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 16 Feb 2022 14:32:01 -0600 Subject: [PATCH 10/32] Fix one last bug. 
--- frame/1m/packm/bli_packm_struc_cxk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index 7b8e41fb30..0cf4ac9304 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -228,7 +228,7 @@ void PASTEMAC(ch,varname) \ \ if ( bli_is_lower( uploc ) ) \ { \ - bli_reflect_to_stored_part( -panel_dim, c12, incc12, ldc12 ); \ + bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \ \ if ( bli_is_hermitian( strucc ) ) \ bli_toggle_conj( &conjc12 ); \ From 2dd8b3800262068896e6c65d1369193745e974c1 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 7 Mar 2022 08:54:16 -0600 Subject: [PATCH 11/32] Partial addition of `const` to all interfaces above the level of the (micro)kernels. --- addon/gemmd/bao_gemmd.c | 28 +- addon/gemmd/bao_gemmd.h | 68 ++--- addon/gemmd/bao_gemmd_bp_var1.c | 172 +++++------ addon/gemmd/bao_gemmd_check.c | 14 +- addon/gemmd/bao_gemmd_var.h | 70 ++--- build/detect/config/config_detect.c | 4 +- frame/0/bli_l0_check.c | 50 ++-- frame/0/bli_l0_check.h | 50 ++-- frame/0/bli_l0_ft.h | 42 +-- frame/0/bli_l0_oapi.c | 44 +-- frame/0/bli_l0_oapi.h | 30 +- frame/0/bli_l0_tapi.c | 48 ++-- frame/0/bli_l0_tapi.h | 38 +-- frame/0/copysc/bli_copysc.c | 30 +- frame/0/copysc/bli_copysc.h | 10 +- frame/1/bli_l1v_check.c | 94 +++--- frame/1/bli_l1v_check.h | 94 +++--- frame/1/bli_l1v_ft.h | 92 +++--- frame/1/bli_l1v_oapi.c | 54 ++-- frame/1/bli_l1v_oapi.h | 68 ++--- frame/1/bli_l1v_tapi.c | 82 +++--- frame/1/bli_l1v_tapi.h | 82 +++--- frame/1d/bli_l1d_check.c | 38 +-- frame/1d/bli_l1d_check.h | 38 +-- frame/1d/bli_l1d_ft.h | 78 ++--- frame/1d/bli_l1d_oapi.c | 30 +- frame/1d/bli_l1d_oapi.h | 30 +- frame/1d/bli_l1d_tapi.c | 84 +++--- frame/1d/bli_l1d_tapi.h | 78 ++--- frame/1f/bli_l1f_check.c | 56 ++-- frame/1f/bli_l1f_check.h | 56 ++-- frame/1f/bli_l1f_ft.h | 94 +++--- frame/1f/bli_l1f_oapi.c | 56 ++-- 
frame/1f/bli_l1f_oapi.h | 56 ++-- frame/1f/bli_l1f_tapi.c | 94 +++--- frame/1f/bli_l1f_tapi.h | 94 +++--- frame/1m/bli_l1m_check.c | 34 +-- frame/1m/bli_l1m_check.h | 34 +-- frame/1m/bli_l1m_ft.h | 86 +++--- frame/1m/bli_l1m_oapi.c | 30 +- frame/1m/bli_l1m_oapi.h | 20 +- frame/1m/bli_l1m_oft_var.h | 16 +- frame/1m/bli_l1m_tapi.c | 120 ++++---- frame/1m/bli_l1m_tapi.h | 86 +++--- frame/1m/packm/bli_packm_alloc.c | 4 +- frame/1m/packm/bli_packm_alloc.h | 4 +- frame/1m/packm/bli_packm_blk_var1.c | 8 +- frame/1m/packm/bli_packm_blk_var1.h | 8 +- frame/1m/packm/bli_packm_check.c | 12 +- frame/1m/packm/bli_packm_check.h | 12 +- frame/1m/packm/bli_packm_cntl.h | 14 +- frame/1m/packm/bli_packm_init.c | 8 +- frame/1m/packm/bli_packm_init.h | 8 +- frame/1m/packm/bli_packm_int.c | 8 +- frame/1m/packm/bli_packm_int.h | 8 +- frame/1m/packm/bli_packm_part.c | 32 +-- frame/1m/packm/bli_packm_part.h | 38 +-- frame/1m/unpackm/bli_unpackm_blk_var1.c | 14 +- frame/1m/unpackm/bli_unpackm_blk_var1.h | 12 +- frame/1m/unpackm/bli_unpackm_check.c | 8 +- frame/1m/unpackm/bli_unpackm_check.h | 6 +- frame/1m/unpackm/bli_unpackm_int.c | 10 +- frame/1m/unpackm/bli_unpackm_int.h | 10 +- frame/2/bli_l2_check.c | 96 +++---- frame/2/bli_l2_check.h | 48 ++-- frame/2/bli_l2_ft.h | 106 +++---- frame/2/bli_l2_oapi.c | 48 ++-- frame/2/bli_l2_oapi.h | 30 +- frame/2/bli_l2_tapi.c | 106 +++---- frame/2/bli_l2_tapi.h | 106 +++---- frame/3/bli_l3_blocksize.c | 46 +-- frame/3/bli_l3_blocksize.h | 20 +- frame/3/bli_l3_check.c | 198 ++++++------- frame/3/bli_l3_check.h | 118 ++++---- frame/3/bli_l3_cntl.c | 6 +- frame/3/bli_l3_cntl.h | 6 +- frame/3/bli_l3_direct.c | 32 +-- frame/3/bli_l3_direct.h | 14 +- frame/3/bli_l3_int.c | 12 +- frame/3/bli_l3_int.h | 12 +- frame/3/bli_l3_oapi.c | 34 +-- frame/3/bli_l3_oapi.h | 34 +-- frame/3/bli_l3_oapi_ex.c | 98 +++---- frame/3/bli_l3_oapi_ex.h | 34 +-- frame/3/bli_l3_oft.h | 42 +-- frame/3/bli_l3_oft_var.h | 8 +- frame/3/bli_l3_packab.c | 16 +- frame/3/bli_l3_packab.h | 
16 +- frame/3/bli_l3_prune.c | 4 +- frame/3/bli_l3_prune.h | 2 +- frame/3/bli_l3_schema.c | 2 +- frame/3/bli_l3_schema.h | 2 +- frame/3/bli_l3_sup_int.c | 24 +- frame/3/bli_l3_sup_int.h | 24 +- frame/3/bli_l3_sup_oft.h | 12 +- frame/3/bli_l3_sup_packm_a.c | 14 +- frame/3/bli_l3_sup_packm_a.h | 14 +- frame/base/bli_apool.h | 8 +- frame/base/bli_arch.c | 12 +- frame/base/bli_arch.h | 4 +- frame/base/bli_array.c | 43 +-- frame/base/bli_array.h | 28 +- frame/base/bli_auxinfo.h | 20 +- frame/base/bli_blksz.c | 36 +-- frame/base/bli_blksz.h | 60 ++-- frame/base/bli_check.c | 84 +++--- frame/base/bli_check.h | 86 +++--- frame/base/bli_cntl.c | 4 +- frame/base/bli_cntl.h | 24 +- frame/base/bli_cntx.c | 18 +- frame/base/bli_cntx.h | 138 +++------ frame/base/bli_const.c | 10 +- frame/base/bli_env.c | 2 +- frame/base/bli_error.c | 6 +- frame/base/bli_error.h | 4 +- frame/base/bli_func.c | 6 +- frame/base/bli_func.h | 12 +- frame/base/bli_gks.c | 58 ++-- frame/base/bli_gks.h | 24 +- frame/base/bli_ind.c | 8 +- frame/base/bli_ind.h | 24 +- frame/base/bli_info.c | 40 +-- frame/base/bli_info.h | 36 +-- frame/base/bli_mbool.h | 2 +- frame/base/bli_memsys.c | 2 +- frame/base/bli_obj.c | 14 +- frame/base/bli_obj.h | 14 +- frame/base/bli_obj_scalar.c | 26 +- frame/base/bli_obj_scalar.h | 20 +- frame/base/bli_part.c | 144 +++++----- frame/base/bli_part.h | 58 ++-- frame/base/bli_pba.c | 42 +-- frame/base/bli_pba.h | 34 +-- frame/base/bli_pool.c | 84 +++--- frame/base/bli_pool.h | 92 +++--- frame/base/bli_query.c | 6 +- frame/base/bli_query.h | 6 +- frame/base/bli_rntm.c | 6 +- frame/base/bli_rntm.h | 36 +-- frame/base/bli_setgetijm.c | 58 ++-- frame/base/bli_setgetijm.h | 40 +-- frame/base/bli_setgetijv.c | 52 ++-- frame/base/bli_setgetijv.h | 24 +- frame/base/bli_setri.c | 16 +- frame/base/bli_setri.h | 16 +- frame/base/cast/bli_castm.c | 40 +-- frame/base/cast/bli_castm.h | 18 +- frame/base/cast/bli_castnzm.c | 40 +-- frame/base/cast/bli_castnzm.h | 18 +- frame/base/cast/bli_castv.c 
| 28 +- frame/base/cast/bli_castv.h | 16 +- frame/base/check/bli_obj_check.c | 18 +- frame/base/check/bli_obj_check.h | 20 +- frame/base/check/bli_part_check.c | 30 +- frame/base/check/bli_part_check.h | 34 +-- frame/base/proj/bli_projm.c | 8 +- frame/base/proj/bli_projm.h | 8 +- frame/base/proj/bli_projv.c | 8 +- frame/base/proj/bli_projv.h | 8 +- frame/include/bli_extern_defs.h | 16 +- frame/include/bli_obj_macro_defs.h | 264 ++++++++--------- frame/include/bli_type_defs.h | 20 +- frame/thread/bli_thread.c | 112 ++++---- frame/thread/bli_thread.h | 32 +-- frame/thread/bli_thrinfo.c | 8 +- frame/thread/bli_thrinfo.h | 28 +- frame/util/bli_util_check.c | 94 +++--- frame/util/bli_util_check.h | 80 +++--- frame/util/bli_util_ft.h | 96 +++---- frame/util/bli_util_oapi.c | 70 ++--- frame/util/bli_util_oapi.h | 60 ++-- frame/util/bli_util_tapi.c | 92 +++--- frame/util/bli_util_tapi.h | 92 +++--- frame/util/bli_util_unb_var1.c | 368 ++++++++++++------------ frame/util/bli_util_unb_var1.h | 132 ++++----- 175 files changed, 3789 insertions(+), 3836 deletions(-) diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c index 01185a9d75..fe38505a79 100644 --- a/addon/gemmd/bao_gemmd.c +++ b/addon/gemmd/bao_gemmd.c @@ -40,12 +40,12 @@ void bao_gemmd ( - obj_t* alpha, - obj_t* a, - obj_t* d, - obj_t* b, - obj_t* beta, - obj_t* c + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c ) { bao_gemmd_ex @@ -63,14 +63,14 @@ void bao_gemmd void bao_gemmd_ex ( - obj_t* alpha, - obj_t* a, - obj_t* d, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); diff --git a/addon/gemmd/bao_gemmd.h b/addon/gemmd/bao_gemmd.h index 7c7466494d..e3ea11e4e1 100644 --- a/addon/gemmd/bao_gemmd.h +++ b/addon/gemmd/bao_gemmd.h @@ -38,24 +38,24 @@ 
BLIS_EXPORT_ADDON void bao_gemmd ( - obj_t* alpha, - obj_t* a, - obj_t* d, - obj_t* b, - obj_t* beta, - obj_t* c + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c ); BLIS_EXPORT_ADDON void bao_gemmd_ex ( - obj_t* alpha, - obj_t* a, - obj_t* d, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); // @@ -64,15 +64,15 @@ BLIS_EXPORT_ADDON void bao_gemmd_ex void bao_gemmd_int ( - obj_t* alpha, - obj_t* a, - obj_t* d, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm, + const thrinfo_t* thread ); // @@ -84,17 +84,17 @@ void bao_gemmd_int \ BLIS_EXPORT_ADDON void PASTECH2(bao_,ch,opname) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* d, inc_t incd, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* d, inc_t incd, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); //INSERT_GENTPROT_BASIC0( gemmd ) diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index 689471367f..e4393dcb23 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -43,15 +43,15 @@ typedef void (*FUNCPTR_T) dim_t m, dim_t n, dim_t k, - void* restrict alpha, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict d, inc_t incd, - void* restrict b, inc_t rs_b, 
inc_t cs_b, - void* restrict beta, - void* restrict c, inc_t rs_c, inc_t cs_c, - cntx_t* restrict cntx, - rntm_t* restrict rntm, - thrinfo_t* restrict thread + const void* alpha, + const void* a, inc_t rs_a, inc_t cs_a, + const void* d, inc_t incd, + const void* b, inc_t rs_b, inc_t cs_b, + const void* beta, + void* c, inc_t rs_c, inc_t cs_c, + const cntx_t* cntx, + const rntm_t* rntm, + const thrinfo_t* thread ); // @@ -64,43 +64,43 @@ static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var1); void bao_gemmd_bp_var1 ( - obj_t* alpha, - obj_t* a, - obj_t* d, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + const rntm_t* rntm, + const thrinfo_t* thread ) { - const num_t dt = bli_obj_dt( c ); + const num_t dt = bli_obj_dt( c ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* restrict buf_a = bli_obj_buffer_at_off( a ); - const inc_t rs_a = bli_obj_row_stride( a ); - const inc_t cs_a = bli_obj_col_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); - void* restrict buf_d = bli_obj_buffer_at_off( d ); - const inc_t incd = bli_obj_vector_inc( d ); + const void* buf_d = bli_obj_buffer_at_off( d ); + const inc_t incd = bli_obj_vector_inc( d ); - void* restrict buf_b = bli_obj_buffer_at_off( b ); - const inc_t rs_b = bli_obj_row_stride( b ); - const inc_t cs_b = bli_obj_col_stride( b ); + const void* buf_b = 
bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); // Index into the function pointer array to extract the correct // typed function pointer based on the chosen datatype. @@ -140,15 +140,15 @@ void PASTECH2(bao_,ch,varname) \ dim_t m, \ dim_t n, \ dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict d, inc_t incd, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + const void* alpha, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + const void* d, inc_t incd, \ + const void* b, inc_t rs_b, inc_t cs_b, \ + const void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + const rntm_t* rntm, \ + const thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -180,12 +180,12 @@ void PASTECH2(bao_,ch,varname) \ \ const inc_t irstep_c = rs_c * MR; \ \ - ctype* restrict a_00 = a; \ - ctype* restrict d_00 = d; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + const ctype* a_00 = a; \ + const ctype* d_00 = d; \ + const ctype* b_00 = b; \ + ctype* c_00 = c; \ + const ctype* alpha_cast = alpha; \ + const ctype* beta_cast = beta; \ \ /* Make local copies of the scalars to 
prevent any unnecessary sharing of cache lines between the cores' caches. */ \ @@ -212,21 +212,21 @@ void PASTECH2(bao_,ch,varname) \ BLIS_MR, /* 1st loop */ \ BLIS_KR }; /* microkernel loop */ \ \ - bszid_t* restrict bszids_jc = &bszids[0]; \ - bszid_t* restrict bszids_pc = &bszids[1]; \ - /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ - bszid_t* restrict bszids_ic = &bszids[3]; \ - /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ - bszid_t* restrict bszids_jr = &bszids[5]; \ - /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ -\ - thrinfo_t* restrict thread_jc = NULL; \ - thrinfo_t* restrict thread_pc = NULL; \ - thrinfo_t* restrict thread_pb = NULL; \ - thrinfo_t* restrict thread_ic = NULL; \ - thrinfo_t* restrict thread_pa = NULL; \ - thrinfo_t* restrict thread_jr = NULL; \ - thrinfo_t* restrict thread_ir = NULL; \ + const bszid_t* bszids_jc = &bszids[0]; \ + const bszid_t* bszids_pc = &bszids[1]; \ + /*const bszid_t* bszids_pb = &bszids[2];*/ \ + const bszid_t* bszids_ic = &bszids[3]; \ + /*const bszid_t* bszids_pa = &bszids[4];*/ \ + const bszid_t* bszids_jr = &bszids[5]; \ + /*const bszid_t* bszids_ir = &bszids[6];*/ \ +\ + thrinfo_t* thread_jc = NULL; \ + thrinfo_t* thread_pc = NULL; \ + thrinfo_t* thread_pb = NULL; \ + thrinfo_t* thread_ic = NULL; \ + thrinfo_t* thread_pa = NULL; \ + thrinfo_t* thread_jr = NULL; \ + thrinfo_t* thread_ir = NULL; \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_jc = thread; \ @@ -239,7 +239,7 @@ void PASTECH2(bao_,ch,varname) \ \ /* Compute number of primary and leftover components of the JC loop. */ \ /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = n_local % NC; \ + const dim_t jc_left = n_local % NC; \ \ /* Loop over the n dimension (NC rows/columns at a time). */ \ for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ @@ -247,8 +247,8 @@ void PASTECH2(bao_,ch,varname) \ /* Calculate the thread's current JC block dimension. 
*/ \ const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ \ - ctype* restrict b_jc = b_00 + jj * jcstep_b; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + const ctype* b_jc = b_00 + jj * jcstep_b; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_pc = bli_thrinfo_sub_node( thread_jc ); \ @@ -268,14 +268,14 @@ void PASTECH2(bao_,ch,varname) \ /* Calculate the thread's current PC block dimension. */ \ const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_00 + pp * pcstep_a; \ - ctype* restrict d_pc = d_00 + pp * pcstep_d; \ - ctype* restrict b_pc = b_jc + pp * pcstep_b; \ + const ctype* a_pc = a_00 + pp * pcstep_a; \ + const ctype* d_pc = d_00 + pp * pcstep_d; \ + const ctype* b_pc = b_jc + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ + const ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ \ - ctype* b_use; \ + const ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ \ /* Identify the current thrinfo_t node. Note that the thrinfo_t @@ -306,7 +306,7 @@ void PASTECH2(bao_,ch,varname) \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ - ctype* restrict b_pc_use = b_use; \ + const ctype* b_pc_use = b_use; \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_ic = bli_thrinfo_sub_node( thread_pb ); \ @@ -327,10 +327,10 @@ void PASTECH2(bao_,ch,varname) \ /* Calculate the thread's current IC block dimension. */ \ const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ \ - ctype* restrict a_ic = a_pc + ii * icstep_a; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + const ctype* a_ic = a_pc + ii * icstep_a; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ - ctype* a_use; \ + const ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ \ /* Identify the current thrinfo_t node. 
Note that the thrinfo_t @@ -361,7 +361,7 @@ void PASTECH2(bao_,ch,varname) \ \ /* Alias a_use so that it's clear this is our current block of matrix A. */ \ - ctype* restrict a_ic_use = a_use; \ + const ctype* a_ic_use = a_use; \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_jr = bli_thrinfo_sub_node( thread_pa ); \ @@ -387,12 +387,12 @@ void PASTECH2(bao_,ch,varname) \ const dim_t nr_cur \ = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ - ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + const ctype* b_jr = b_pc_use + j * ps_b_use; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* Assume for now that our next panel of B to be the current panel of B. */ \ - ctype* restrict b2 = b_jr; \ + const ctype* b2 = b_jr; \ \ /* Identify the current thrinfo_t node. */ \ thread_ir = bli_thrinfo_sub_node( thread_jr ); \ @@ -417,10 +417,10 @@ void PASTECH2(bao_,ch,varname) \ const dim_t mr_cur \ = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ \ - ctype* restrict a_ir = a_ic_use + i * ps_a_use; \ - ctype* restrict c_ir = c_jr + i * irstep_c; \ + const ctype* a_ir = a_ic_use + i * ps_a_use; \ + ctype* c_ir = c_jr + i * irstep_c; \ \ - ctype* restrict a2; \ + const ctype* a2; \ \ /* Compute the addresses of the next micropanels of A and B. 
*/ \ a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ diff --git a/addon/gemmd/bao_gemmd_check.c b/addon/gemmd/bao_gemmd_check.c index 864e9a1acb..c900ac188d 100644 --- a/addon/gemmd/bao_gemmd_check.c +++ b/addon/gemmd/bao_gemmd_check.c @@ -36,13 +36,13 @@ void bao_gemmd_check ( - obj_t* alpha, - obj_t* a, - obj_t* d, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* d, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/gemmd/bao_gemmd_var.h index 05ec45e07e..98d6c7d479 100644 --- a/addon/gemmd/bao_gemmd_var.h +++ b/addon/gemmd/bao_gemmd_var.h @@ -42,15 +42,15 @@ \ void PASTECH(bao_,opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* d, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* d, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ + const rntm_t* rntm, \ + const thrinfo_t* thread \ ); GENPROT( gemmd_bp_var1 ) @@ -65,20 +65,20 @@ GENPROT( gemmd_bp_var1 ) \ void PASTECH2(bao_,ch,varname) \ ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict d, inc_t incd, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const void* alpha, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + const void* d, inc_t incd, \ + const void* b, inc_t rs_b, inc_t cs_b, \ + const void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + const rntm_t* rntm, \ + const thrinfo_t* thread \ ); //INSERT_GENTPROT_BASIC0( 
gemmd_bp_var1 ) @@ -97,18 +97,18 @@ GENTPROT( dcomplex, z, gemmd_bp_var1 ) \ void PASTECH2(bao_,ch,varname) \ ( \ - const dim_t MR, \ - const dim_t NR, \ - dim_t mr_cur, \ - dim_t nr_cur, \ - dim_t k, \ - ctype* restrict alpha, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype* restrict beta, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict aux, \ - cntx_t* restrict cntx \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, \ + dim_t nr_cur, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const auxinfo_t* aux, \ + const cntx_t* cntx \ ); //INSERT_GENTPROT_BASIC0( gemm_kernel ) diff --git a/build/detect/config/config_detect.c b/build/detect/config/config_detect.c index 5e29defe15..5f1ea0f420 100644 --- a/build/detect/config/config_detect.c +++ b/build/detect/config/config_detect.c @@ -69,8 +69,8 @@ int main( int argc, char** argv ) { - arch_t id = bli_cpuid_query_id(); - char* s = bli_arch_string( id ); + arch_t id = bli_cpuid_query_id(); + const char* s = bli_arch_string( id ); printf( "%s\n", s ); diff --git a/frame/0/bli_l0_check.c b/frame/0/bli_l0_check.c index 966f0c6aaa..02867a22d2 100644 --- a/frame/0/bli_l0_check.c +++ b/frame/0/bli_l0_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ) \ { \ bli_l0_xxsc_check( chi, psi ); \ @@ -63,7 +63,7 @@ GENFRONT( subsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi \ + const obj_t* chi \ ) \ { \ bli_l0_xsc_check( chi ); \ @@ -77,8 +77,8 @@ GENFRONT( invertsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* norm \ + const obj_t* chi, \ + const obj_t* norm \ ) \ { \ bli_l0_xx2sc_check( chi, norm ); \ @@ -91,9 +91,9 @@ GENFRONT( normfsc ) void bli_getsc_check ( - obj_t* chi, - double* zeta_r, - 
double* zeta_i + const obj_t* chi, + const double* zeta_r, + const double* zeta_i ) { err_t e_val; @@ -117,9 +117,9 @@ void bli_getsc_check void bli_setsc_check ( - double zeta_r, - double zeta_i, - obj_t* chi + double zeta_r, + double zeta_i, + const obj_t* chi ) { err_t e_val; @@ -143,9 +143,9 @@ void bli_setsc_check void bli_unzipsc_check ( - obj_t* chi, - obj_t* zeta_r, - obj_t* zeta_i + const obj_t* chi, + const obj_t* zeta_r, + const obj_t* zeta_i ) { err_t e_val; @@ -199,9 +199,9 @@ void bli_unzipsc_check void bli_zipsc_check ( - obj_t* zeta_r, - obj_t* zeta_i, - obj_t* chi + const obj_t* zeta_r, + const obj_t* zeta_i, + const obj_t* chi ) { err_t e_val; @@ -254,7 +254,7 @@ void bli_zipsc_check void bli_l0_xsc_check ( - obj_t* chi + const obj_t* chi ) { err_t e_val; @@ -280,8 +280,8 @@ void bli_l0_xsc_check void bli_l0_xxsc_check ( - obj_t* chi, - obj_t* psi + const obj_t* chi, + const obj_t* psi ) { err_t e_val; @@ -316,8 +316,8 @@ void bli_l0_xxsc_check void bli_l0_xx2sc_check ( - obj_t* chi, - obj_t* absq + const obj_t* chi, + const obj_t* absq ) { err_t e_val; @@ -355,9 +355,9 @@ void bli_l0_xx2sc_check void bli_l0_xxbsc_check ( - obj_t* chi, - obj_t* psi, - bool* is_eq + const obj_t* chi, + const obj_t* psi, + const bool* is_eq ) { err_t e_val; diff --git a/frame/0/bli_l0_check.h b/frame/0/bli_l0_check.h index f495866c62..c5610d2bac 100644 --- a/frame/0/bli_l0_check.h +++ b/frame/0/bli_l0_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ); GENTPROT( addsc ) @@ -59,7 +59,7 @@ GENTPROT( subsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi \ + const obj_t* chi \ ); GENTPROT( invertsc ) @@ -70,8 +70,8 @@ GENTPROT( invertsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* absq \ + const obj_t* chi, \ + const obj_t* absq \ ); GENTPROT( absqsc ) @@ -83,9 +83,9 @@ GENTPROT( normfsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - double* zeta_r, \ 
- double* zeta_i \ + const obj_t* chi, \ + const double* zeta_r, \ + const double* zeta_i \ ); GENTPROT( getsc ) @@ -96,9 +96,9 @@ GENTPROT( getsc ) \ void PASTEMAC(opname,_check) \ ( \ - double zeta_r, \ - double zeta_i, \ - obj_t* chi \ + double zeta_r, \ + double zeta_i, \ + const obj_t* chi \ ); GENTPROT( setsc ) @@ -109,9 +109,9 @@ GENTPROT( setsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* zeta_r, \ - obj_t* zeta_i \ + const obj_t* chi, \ + const obj_t* zeta_r, \ + const obj_t* zeta_i \ ); GENTPROT( unzipsc ) @@ -122,9 +122,9 @@ GENTPROT( unzipsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* zeta_r, \ - obj_t* zeta_i, \ - obj_t* chi \ + const obj_t* zeta_r, \ + const obj_t* zeta_i, \ + const obj_t* chi \ ); GENTPROT( zipsc ) @@ -133,24 +133,24 @@ GENTPROT( zipsc ) void bli_l0_xsc_check ( - obj_t* chi + const obj_t* chi ); void bli_l0_xxsc_check ( - obj_t* chi, - obj_t* psi + const obj_t* chi, + const obj_t* psi ); void bli_l0_xx2sc_check ( - obj_t* chi, - obj_t* norm + const obj_t* chi, + const obj_t* norm ); void bli_l0_xxbsc_check ( - obj_t* chi, - obj_t* psi, - bool* is_eq + const obj_t* chi, + const obj_t* psi, + const bool* is_eq ); diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h index b90e35eb59..96e3573c65 100644 --- a/frame/0/bli_l0_ft.h +++ b/frame/0/bli_l0_ft.h @@ -44,9 +44,9 @@ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTDEF( addsc ) @@ -73,9 +73,9 @@ INSERT_GENTDEF( invertsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTDEF( mulsc ) @@ -87,8 +87,8 @@ INSERT_GENTDEF( mulsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) @@ -100,8 +100,8 @@ INSERT_GENTDEFR( absqsc ) \ 
typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - ctype_r* norm \ + const ctype* chi, \ + ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) @@ -113,8 +113,8 @@ INSERT_GENTDEFR( normfsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - ctype* psi \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTDEF( sqrtsc ) @@ -126,9 +126,9 @@ INSERT_GENTDEF( sqrtsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ); INSERT_GENTDEF( getsc ) @@ -154,9 +154,9 @@ INSERT_GENTDEF( setsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) @@ -168,9 +168,9 @@ INSERT_GENTDEFR( unzipsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - ctype_r* zeta_r, \ - ctype_r* zeta_i, \ - ctype* chi \ + const ctype_r* zeta_r, \ + const ctype_r* zeta_i, \ + ctype* chi \ ); INSERT_GENTDEFR( zipsc ) diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index ac62530dbc..3fb903e987 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -43,17 +43,17 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* absq \ + const obj_t* chi, \ + const obj_t* absq \ ) \ { \ bli_init_once(); \ \ - num_t dt_chi; \ - num_t dt_absq_c = bli_obj_dt_proj_to_complex( absq ); \ + num_t dt_chi; \ + num_t dt_absq_c = bli_obj_dt_proj_to_complex( absq ); \ \ - void* buf_chi; \ - void* buf_absq = bli_obj_buffer_at_off( absq ); \ + const void* buf_chi; \ + void* buf_absq = bli_obj_buffer_at_off( absq ); \ \ if ( bli_error_checking_is_enabled() ) \ PASTEMAC(opname,_check)( chi, absq ); \ @@ -61,7 +61,7 @@ void PASTEMAC0(opname) \ /* If chi is a scalar constant, use dt_absq_c to extract the address of the corresponding constant value; otherwise, use the datatype encoded within the chi object and extract the buffer at 
the chi offset. */ \ - bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, &buf_chi ); \ + bli_obj_scalar_set_dt_buffer( chi, dt_absq_c, &dt_chi, ( void** )&buf_chi ); \ \ /* Query a type-specific function pointer, except one that uses void* for function arguments instead of typed pointers. */ \ @@ -83,8 +83,8 @@ GENFRONT( normfsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ) \ { \ bli_init_once(); \ @@ -122,7 +122,7 @@ GENFRONT( subsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi \ + const obj_t* chi \ ) \ { \ bli_init_once(); \ @@ -155,8 +155,8 @@ GENFRONT( invertsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ) \ { \ bli_init_once(); \ @@ -188,9 +188,9 @@ GENFRONT( sqrtsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ + const obj_t* chi, \ + double* zeta_r, \ + double* zeta_i \ ) \ { \ bli_init_once(); \ @@ -234,7 +234,7 @@ void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ - obj_t* chi \ + const obj_t* chi \ ) \ { \ bli_init_once(); \ @@ -266,9 +266,9 @@ GENFRONT( setsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* zeta_r, \ - obj_t* zeta_i \ + const obj_t* chi, \ + const obj_t* zeta_r, \ + const obj_t* zeta_i \ ) \ { \ bli_init_once(); \ @@ -309,9 +309,9 @@ GENFRONT( unzipsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* zeta_r, \ - obj_t* zeta_i, \ - obj_t* chi \ + const obj_t* zeta_r, \ + const obj_t* zeta_i, \ + const obj_t* chi \ ) \ { \ bli_init_once(); \ diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h index 702bb40eaa..ef111ebe40 100644 --- a/frame/0/bli_l0_oapi.h +++ b/frame/0/bli_l0_oapi.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* absq \ + const obj_t* chi, \ + const obj_t* absq \ ); GENPROT( absqsc ) @@ -55,8 +55,8 @@ GENPROT( normfsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* 
psi \ + const obj_t* chi, \ + const obj_t* psi \ ); GENPROT( addsc ) @@ -71,7 +71,7 @@ GENPROT( subsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi \ + const obj_t* chi \ ); GENPROT( invertsc ) @@ -82,9 +82,9 @@ GENPROT( invertsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ + const obj_t* chi, \ + double* zeta_r, \ + double* zeta_i \ ); GENPROT( getsc ) @@ -97,7 +97,7 @@ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ double zeta_r, \ double zeta_i, \ - obj_t* chi \ + const obj_t* chi \ ); GENPROT( setsc ) @@ -108,9 +108,9 @@ GENPROT( setsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* zeta_r, \ - obj_t* zeta_i \ + const obj_t* chi, \ + const obj_t* zeta_r, \ + const obj_t* zeta_i \ ); GENPROT( unzipsc ) @@ -121,9 +121,9 @@ GENPROT( unzipsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* zeta_r, \ - obj_t* zeta_i, \ - obj_t* chi \ + const obj_t* zeta_r, \ + const obj_t* zeta_i, \ + const obj_t* chi \ ); GENPROT( zipsc ) diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c index 620cad2996..55dbcab722 100644 --- a/frame/0/bli_l0_tapi.c +++ b/frame/0/bli_l0_tapi.c @@ -43,9 +43,9 @@ \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ) \ { \ bli_init_once(); \ @@ -87,9 +87,9 @@ INSERT_GENTFUNC_BASIC( invertsc, inverts ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ) \ { \ bli_init_once(); \ @@ -116,8 +116,8 @@ INSERT_GENTFUNC_BASIC( mulsc, scals ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ) \ { \ bli_init_once(); \ @@ -145,8 +145,8 @@ INSERT_GENTFUNCR_BASIC0( absqsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* norm \ + const ctype* chi, \ + ctype_r* norm \ ) \ { \ bli_init_once(); \ @@ 
-163,8 +163,8 @@ INSERT_GENTFUNCR_BASIC0( normfsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype* psi \ + const ctype* chi, \ + ctype* psi \ ) \ { \ bli_init_once(); \ @@ -181,9 +181,9 @@ INSERT_GENTFUNC_BASIC0( sqrtsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ) \ { \ bli_init_once(); \ @@ -217,9 +217,9 @@ INSERT_GENTFUNC_BASIC0( setsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ) \ { \ bli_init_once(); \ @@ -235,9 +235,9 @@ INSERT_GENTFUNCR_BASIC0( unzipsc ) \ void PASTEMAC(ch,opname) \ ( \ - ctype_r* zeta_r, \ - ctype_r* zeta_i, \ - ctype* chi \ + const ctype_r* zeta_r, \ + const ctype_r* zeta_i, \ + ctype* chi \ ) \ { \ bli_init_once(); \ @@ -251,9 +251,9 @@ INSERT_GENTFUNCR_BASIC0( zipsc ) void bli_igetsc ( - dim_t* chi, - double* zeta_r, - double* zeta_i + const dim_t* chi, + double* zeta_r, + double* zeta_i ) { bli_init_once(); diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h index c2d600d669..3ff7667869 100644 --- a/frame/0/bli_l0_tapi.h +++ b/frame/0/bli_l0_tapi.h @@ -42,9 +42,9 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi \ + conj_t conjchi, \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTPROT_BASIC0( addsc ) @@ -70,8 +70,8 @@ INSERT_GENTPROT_BASIC0( invertsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) @@ -83,8 +83,8 @@ INSERT_GENTPROTR_BASIC0( normfsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype* psi \ + const ctype* chi, \ + ctype* psi \ ); INSERT_GENTPROT_BASIC0( sqrtsc ) @@ -95,9 +95,9 @@ INSERT_GENTPROT_BASIC0( sqrtsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - double* zeta_r, \ - 
double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) @@ -121,9 +121,9 @@ INSERT_GENTPROT_BASIC0( setsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) @@ -134,9 +134,9 @@ INSERT_GENTPROTR_BASIC0( unzipsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - ctype_r* zeta_r, \ - ctype_r* zeta_i, \ - ctype* chi \ + const ctype_r* zeta_r, \ + const ctype_r* zeta_i, \ + ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) @@ -145,9 +145,9 @@ INSERT_GENTPROTR_BASIC0( zipsc ) BLIS_EXPORT_BLIS void bli_igetsc ( - dim_t* chi, - double* zeta_r, - double* zeta_i + const dim_t* chi, + double* zeta_r, + double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc diff --git a/frame/0/copysc/bli_copysc.c b/frame/0/copysc/bli_copysc.c index 3001aa6c7c..04f4efbbe8 100644 --- a/frame/0/copysc/bli_copysc.c +++ b/frame/0/copysc/bli_copysc.c @@ -41,9 +41,9 @@ typedef void (*FUNCPTR_T) ( - conj_t conjchi, - void* chi, - void* psi + conj_t conjchi, + const void* chi, + void* psi ); static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc); @@ -57,19 +57,19 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc); \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ) \ { \ bli_init_once(); \ \ - conj_t conjchi = bli_obj_conj_status( chi ); \ + conj_t conjchi = bli_obj_conj_status( chi ); \ \ - num_t dt_psi = bli_obj_dt( psi ); \ - void* buf_psi = bli_obj_buffer_at_off( psi ); \ + num_t dt_psi = bli_obj_dt( psi ); \ + void* buf_psi = bli_obj_buffer_at_off( psi ); \ \ - num_t dt_chi; \ - void* buf_chi; \ + num_t dt_chi; \ + void* buf_chi; \ \ FUNCPTR_T f; \ \ @@ -105,15 +105,15 @@ GENFRONT( copysc ) \ void PASTEMAC2(chx,chy,varname) \ ( \ - conj_t conjchi, \ - void* chi, \ - void* psi \ + conj_t conjchi, \ + const void* chi, \ + void* psi \ ) \ { \ 
bli_init_once(); \ \ - ctype_x* chi_cast = chi; \ - ctype_y* psi_cast = psi; \ + const ctype_x* chi_cast = chi; \ + ctype_y* psi_cast = psi; \ \ if ( bli_is_conj( conjchi ) ) \ { \ diff --git a/frame/0/copysc/bli_copysc.h b/frame/0/copysc/bli_copysc.h index 1dfd9d7bcf..1c2ee4e862 100644 --- a/frame/0/copysc/bli_copysc.h +++ b/frame/0/copysc/bli_copysc.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi \ + const obj_t* chi, \ + const obj_t* psi \ ); GENFRONT( copysc ) @@ -57,9 +57,9 @@ GENFRONT( copysc ) \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ - conj_t conjchi, \ - void* chi, \ - void* psi \ + conj_t conjchi, \ + const void* chi, \ + void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) diff --git a/frame/1/bli_l1v_check.c b/frame/1/bli_l1v_check.c index 74b60febd6..8ab470bf43 100644 --- a/frame/1/bli_l1v_check.c +++ b/frame/1/bli_l1v_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1v_xy_check( x, y ); \ @@ -61,8 +61,8 @@ GENFRONT( swapv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* index \ + const obj_t* x, \ + const obj_t* index \ ) \ { \ bli_l1v_xi_check( x, index ); \ @@ -76,10 +76,10 @@ GENFRONT( amaxv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ) \ { \ bli_l1v_axby_check( alpha, x, beta, y ); \ @@ -93,9 +93,9 @@ GENFRONT( axpbyv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1v_axy_check( alpha, x, y ); \ @@ -110,9 +110,9 @@ GENFRONT( scal2v ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho \ ) \ { \ bli_l1v_dot_check( &BLIS_ONE, x, y, &BLIS_ONE, rho ); \ 
@@ -126,11 +126,11 @@ GENFRONT( dotv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* beta, \ + const obj_t* rho \ ) \ { \ bli_l1v_dot_check( alpha, x, y, beta, rho ); \ @@ -144,7 +144,7 @@ GENFRONT( dotxv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ) \ { \ bli_l1v_x_check( x ); \ @@ -158,8 +158,8 @@ GENFRONT( invertv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ) \ { \ bli_l1v_ax_check( alpha, x ); \ @@ -174,9 +174,9 @@ GENFRONT( setv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ) \ { \ bli_l1v_xby_check( x, beta, y ); \ @@ -189,8 +189,8 @@ GENFRONT( xpbyv ) void bli_l1v_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -230,9 +230,9 @@ void bli_l1v_xy_check void bli_l1v_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -281,9 +281,9 @@ void bli_l1v_axy_check void bli_l1v_xby_check ( - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -332,10 +332,10 @@ void bli_l1v_xby_check void bli_l1v_axby_check ( - obj_t* alpha, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -393,11 +393,11 @@ void bli_l1v_axby_check void bli_l1v_dot_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* beta, - obj_t* rho + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* beta, + const obj_t* rho ) { err_t e_val; @@ -467,7 +467,7 @@ void bli_l1v_dot_check void bli_l1v_x_check ( - obj_t* x + const obj_t* x ) { err_t e_val; @@ -490,8 +490,8 @@ void bli_l1v_x_check void 
bli_l1v_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ) { err_t e_val; @@ -523,8 +523,8 @@ void bli_l1v_ax_check void bli_l1v_xi_check ( - obj_t* x, - obj_t* index + const obj_t* x, + const obj_t* index ) { err_t e_val; diff --git a/frame/1/bli_l1v_check.h b/frame/1/bli_l1v_check.h index 98051d0cde..110b25d557 100644 --- a/frame/1/bli_l1v_check.h +++ b/frame/1/bli_l1v_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( addv ) @@ -57,8 +57,8 @@ GENTPROT( swapv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* index \ + const obj_t* x, \ + const obj_t* index \ ); GENTPROT( amaxv ) @@ -69,10 +69,10 @@ GENTPROT( amaxv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENTPROT( axpbyv ) @@ -83,9 +83,9 @@ GENTPROT( axpbyv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( axpyv ) @@ -97,9 +97,9 @@ GENTPROT( scal2v ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho \ ); GENTPROT( dotv ) @@ -110,11 +110,11 @@ GENTPROT( dotv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* beta, \ + const obj_t* rho \ ); GENTPROT( dotxv ) @@ -125,7 +125,7 @@ GENTPROT( dotxv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ); GENTPROT( invertv ) @@ -136,8 +136,8 @@ GENTPROT( invertv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ); GENTPROT( scalv ) @@ -149,9 +149,9 @@ GENTPROT( setv ) \ void 
PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENTPROT( xpbyv ) @@ -162,55 +162,55 @@ GENTPROT( xpbyv ) void bli_l1v_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); void bli_l1v_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ); void bli_l1v_xby_check ( - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* x, + const obj_t* beta, + const obj_t* y ); void bli_l1v_axby_check ( - obj_t* alpha, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* beta, + const obj_t* y ); void bli_l1v_dot_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* beta, - obj_t* rho + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* beta, + const obj_t* rho ); void bli_l1v_x_check ( - obj_t* x + const obj_t* x ); void bli_l1v_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ); void bli_l1v_xi_check ( - obj_t* x, - obj_t* index + const obj_t* x, + const obj_t* index ); diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h index 162f1bf600..7f1d93ad34 100644 --- a/frame/1/bli_l1v_ft.h +++ b/frame/1/bli_l1v_ft.h @@ -44,10 +44,10 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -62,9 +62,9 @@ INSERT_GENTDEF( subv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ + dim_t n, \ + const ctype* x, inc_t incx, \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); @@ -77,12 +77,12 @@ INSERT_GENTDEF( amaxv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + 
conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -95,11 +95,11 @@ INSERT_GENTDEF( axpbyv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -113,12 +113,12 @@ INSERT_GENTDEF( scal2v ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); @@ -131,14 +131,14 @@ INSERT_GENTDEF( dotv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + const ctype* beta, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); @@ -151,8 +151,8 @@ INSERT_GENTDEF( dotxv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx \ + dim_t n, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); @@ -165,10 +165,10 @@ INSERT_GENTDEF( invertv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ + conj_t conjalpha, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); @@ -182,9 +182,9 @@ INSERT_GENTDEF( setv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + dim_t n, \ + ctype* x, 
inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -197,11 +197,11 @@ INSERT_GENTDEF( swapv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1/bli_l1v_oapi.c b/frame/1/bli_l1v_oapi.c index 201af2e091..cc7f38e380 100644 --- a/frame/1/bli_l1v_oapi.c +++ b/frame/1/bli_l1v_oapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -92,8 +92,8 @@ GENFRONT( subv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* index \ + const obj_t* x, \ + const obj_t* index \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -135,10 +135,10 @@ GENFRONT( amaxv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -199,9 +199,9 @@ GENFRONT( axpbyv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -257,9 +257,9 @@ GENFRONT( scal2v ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -307,11 +307,11 @@ GENFRONT( dotv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* beta, \ + const obj_t* rho \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -376,7 +376,7 @@ GENFRONT( dotxv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ 
-415,8 +415,8 @@ GENFRONT( invertv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -469,8 +469,8 @@ GENFRONT( setv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -512,9 +512,9 @@ GENFRONT( swapv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/1/bli_l1v_oapi.h b/frame/1/bli_l1v_oapi.h index 41aecdc4d1..8e22dff7e8 100644 --- a/frame/1/bli_l1v_oapi.h +++ b/frame/1/bli_l1v_oapi.h @@ -42,10 +42,10 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( addv ) GENTPROT( copyv ) @@ -57,8 +57,8 @@ GENTPROT( subv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* index \ + const obj_t* x, \ + const obj_t* index \ BLIS_OAPI_EX_PARAMS \ ); @@ -70,10 +70,10 @@ GENTPROT( amaxv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -85,11 +85,11 @@ GENTPROT( axpbyv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( axpyv ) GENTPROT( scal2v ) @@ -100,11 +100,11 @@ GENTPROT( scal2v ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( dotv ) @@ -114,13 +114,13 @@ GENTPROT( dotv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* 
alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* beta, \ - obj_t* rho \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* beta, \ + const obj_t* rho \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( dotxv ) @@ -130,9 +130,9 @@ GENTPROT( dotxv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( invertv ) @@ -142,10 +142,10 @@ GENTPROT( invertv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( scalv ) GENTPROT( setv ) @@ -156,10 +156,10 @@ GENTPROT( setv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( swapv ) @@ -169,9 +169,9 @@ GENTPROT( swapv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index 1d12b42ebd..d9e247978f 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -45,10 +45,10 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -83,9 +83,9 @@ INSERT_GENTFUNC_BASIC( subv, BLIS_SUBV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ + dim_t n, \ + const ctype* x, inc_t incx, \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -117,12 +117,12 @@ INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + 
const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -157,11 +157,11 @@ INSERT_GENTFUNC_BASIC( axpbyv, BLIS_AXPBYV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -197,12 +197,12 @@ INSERT_GENTFUNC_BASIC( scal2v, BLIS_SCAL2V_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -237,14 +237,14 @@ INSERT_GENTFUNC_BASIC( dotv, BLIS_DOTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + const ctype* beta, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -313,10 +313,10 @@ INSERT_GENTFUNC_BASIC( invertv, BLIS_INVERTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ + conj_t conjalpha, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -383,11 +383,11 @@ INSERT_GENTFUNC_BASIC( swapv, BLIS_SWAPV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ diff 
--git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h index 5cb3295ef1..7514f617e0 100644 --- a/frame/1/bli_l1v_tapi.h +++ b/frame/1/bli_l1v_tapi.h @@ -42,10 +42,10 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -59,9 +59,9 @@ INSERT_GENTPROT_BASIC0( subv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - dim_t* index \ + dim_t n, \ + const ctype* x, inc_t incx, \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -73,12 +73,12 @@ INSERT_GENTPROT_BASIC0( amaxv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -90,11 +90,11 @@ INSERT_GENTPROT_BASIC0( axpbyv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -107,12 +107,12 @@ INSERT_GENTPROT_BASIC0( scal2v ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -124,14 +124,14 @@ INSERT_GENTPROT_BASIC0( dotv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, 
\ - ctype* y, inc_t incy, \ - ctype* beta, \ - ctype* rho \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + const ctype* beta, \ + ctype* rho \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -156,10 +156,10 @@ INSERT_GENTPROT_BASIC0( invertv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx \ + conj_t conjalpha, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -186,11 +186,11 @@ INSERT_GENTPROT_BASIC0( swapv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); \ diff --git a/frame/1d/bli_l1d_check.c b/frame/1d/bli_l1d_check.c index 908a410ad0..fcc62a7576 100644 --- a/frame/1d/bli_l1d_check.c +++ b/frame/1d/bli_l1d_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1d_xy_check( x, y ); \ @@ -60,9 +60,9 @@ GENFRONT( subd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1d_axy_check( alpha, x, y ); \ @@ -77,7 +77,7 @@ GENFRONT( scal2d ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ) \ { \ bli_l1d_x_check( x ); \ @@ -91,8 +91,8 @@ GENFRONT( invertd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ) \ { \ bli_l1d_ax_check( alpha, x ); \ @@ -109,9 +109,9 @@ GENFRONT( shiftd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ) \ { \ bli_l1d_axy_check( beta, 
x, y ); \ @@ -124,8 +124,8 @@ GENFRONT( xpbyd ) void bli_l1d_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -165,9 +165,9 @@ void bli_l1d_xy_check void bli_l1d_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -216,7 +216,7 @@ void bli_l1d_axy_check void bli_l1d_x_check ( - obj_t* x + const obj_t* x ) { err_t e_val; @@ -239,8 +239,8 @@ void bli_l1d_x_check void bli_l1d_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ) { err_t e_val; diff --git a/frame/1d/bli_l1d_check.h b/frame/1d/bli_l1d_check.h index 6d000d3145..1ef57e2367 100644 --- a/frame/1d/bli_l1d_check.h +++ b/frame/1d/bli_l1d_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( addd ) @@ -56,9 +56,9 @@ GENTPROT( subd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( axpyd ) @@ -70,7 +70,7 @@ GENTPROT( scal2d ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ); GENTPROT( invertd ) @@ -81,8 +81,8 @@ GENTPROT( invertd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ); GENTPROT( scald ) @@ -96,9 +96,9 @@ GENTPROT( shiftd ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENTPROT( xpbyd ) @@ -108,25 +108,25 @@ GENTPROT( xpbyd ) void bli_l1d_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); void bli_l1d_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ); void bli_l1d_x_check ( - obj_t* x + const obj_t* x ); void bli_l1d_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ); diff --git 
a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h index 53e296616f..5eecdafe1a 100644 --- a/frame/1d/bli_l1d_ft.h +++ b/frame/1d/bli_l1d_ft.h @@ -44,13 +44,13 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -65,14 +65,14 @@ INSERT_GENTDEF( subd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -102,12 +102,12 @@ INSERT_GENTDEF( invertd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -121,11 +121,11 @@ INSERT_GENTDEF( setd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype_r* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -138,11 +138,11 @@ INSERT_GENTDEFR( setid ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t 
diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -155,14 +155,14 @@ INSERT_GENTDEF( shiftd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1d/bli_l1d_oapi.c b/frame/1d/bli_l1d_oapi.c index 15e68cf50f..cc1df29fcd 100644 --- a/frame/1d/bli_l1d_oapi.c +++ b/frame/1d/bli_l1d_oapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -100,9 +100,9 @@ GENFRONT( subd ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -166,7 +166,7 @@ GENFRONT( scal2d ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -210,8 +210,8 @@ GENFRONT( invertd ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -269,8 +269,8 @@ GENFRONT( setd ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -317,8 +317,8 @@ GENFRONT( setid ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -373,9 +373,9 @@ GENFRONT( shiftd ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ 
BLIS_OAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/1d/bli_l1d_oapi.h b/frame/1d/bli_l1d_oapi.h index 47129b7719..5ace224f10 100644 --- a/frame/1d/bli_l1d_oapi.h +++ b/frame/1d/bli_l1d_oapi.h @@ -42,10 +42,10 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( addd ) GENTPROT( copyd ) @@ -57,11 +57,11 @@ GENTPROT( subd ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( axpyd ) GENTPROT( scal2d ) @@ -72,9 +72,9 @@ GENTPROT( scal2d ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( invertd ) @@ -84,10 +84,10 @@ GENTPROT( invertd ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ - ); + ); GENTPROT( scald ) GENTPROT( setd ) @@ -100,9 +100,9 @@ GENTPROT( shiftd ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index cfaf5150fe..560cb40efc 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -45,13 +45,13 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -61,7 +61,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ + const ctype* 
x1; \ ctype* y1; \ conj_t conjx; \ dim_t n_elem; \ @@ -124,14 +124,14 @@ INSERT_GENTFUNC_BASIC2( subd, subv, BLIS_SUBV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -141,7 +141,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ + const ctype* x1; \ ctype* y1; \ conj_t conjx; \ dim_t n_elem; \ @@ -260,12 +260,12 @@ INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -321,11 +321,11 @@ INSERT_GENTFUNC_BASIC2( setd, setv, BLIS_SETV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype_r* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -397,11 +397,11 @@ INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -456,14 +456,14 @@ INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t 
diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -473,7 +473,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ \ const num_t dt = PASTEMAC(ch,type); \ \ - ctype* x1; \ + const ctype* x1; \ ctype* y1; \ conj_t conjx; \ dim_t n_elem; \ diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h index 35d093e865..d063db3617 100644 --- a/frame/1d/bli_l1d_tapi.h +++ b/frame/1d/bli_l1d_tapi.h @@ -42,13 +42,13 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -62,14 +62,14 @@ INSERT_GENTPROT_BASIC0( subd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -97,12 +97,12 @@ INSERT_GENTPROT_BASIC0( invertd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, 
inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -115,11 +115,11 @@ INSERT_GENTPROT_BASIC0( setd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype_r* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -131,11 +131,11 @@ INSERT_GENTPROTR_BASIC0( setid ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -147,14 +147,14 @@ INSERT_GENTPROT_BASIC0( shiftd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1f/bli_l1f_check.c b/frame/1f/bli_l1f_check.c index c880237c1e..f3c6ef0b7a 100644 --- a/frame/1f/bli_l1f_check.c +++ b/frame/1f/bli_l1f_check.c @@ -40,11 +40,11 @@ void bli_axpy2v_check ( - obj_t* alphax, - obj_t* alphay, - obj_t* x, - obj_t* y, - obj_t* z + const obj_t* alphax, + const obj_t* alphay, + const obj_t* x, + const obj_t* y, + const obj_t* z ) { err_t e_val; @@ -118,10 +118,10 @@ void bli_axpy2v_check void bli_axpyf_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -186,12 +186,12 @@ void bli_axpyf_check void bli_dotaxpyv_check ( - obj_t* alpha, - obj_t* xt, - obj_t* x, - obj_t* y, - obj_t* rho, - obj_t* 
z + const obj_t* alpha, + const obj_t* xt, + const obj_t* x, + const obj_t* y, + const obj_t* rho, + const obj_t* z ) { err_t e_val; @@ -288,14 +288,14 @@ void bli_dotaxpyv_check void bli_dotxaxpyf_check ( - obj_t* alpha, - obj_t* at, - obj_t* a, - obj_t* w, - obj_t* x, - obj_t* beta, - obj_t* y, - obj_t* z + const obj_t* alpha, + const obj_t* at, + const obj_t* a, + const obj_t* w, + const obj_t* x, + const obj_t* beta, + const obj_t* y, + const obj_t* z ) { err_t e_val; @@ -425,11 +425,11 @@ void bli_dotxaxpyf_check void bli_dotxf_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; diff --git a/frame/1f/bli_l1f_check.h b/frame/1f/bli_l1f_check.h index d630f32058..6ba13322c1 100644 --- a/frame/1f/bli_l1f_check.h +++ b/frame/1f/bli_l1f_check.h @@ -42,11 +42,11 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alphax, \ + const obj_t* alphay, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* z \ ); GENTPROT( axpy2v ) @@ -57,10 +57,10 @@ GENTPROT( axpy2v ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* y \ ); GENTPROT( axpyf ) @@ -71,12 +71,12 @@ GENTPROT( axpyf ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* xt, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho, \ + const obj_t* z \ ); GENTPROT( dotaxpyv ) @@ -87,14 +87,14 @@ GENTPROT( dotaxpyv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* at, \ + const obj_t* a, \ + const obj_t* w, \ + const 
obj_t* x, \ + const obj_t* beta, \ + const obj_t* y, \ + const obj_t* z \ ); GENTPROT( dotxaxpyf ) @@ -105,11 +105,11 @@ GENTPROT( dotxaxpyf ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENTPROT( dotxf ) diff --git a/frame/1f/bli_l1f_ft.h b/frame/1f/bli_l1f_ft.h index 1c7bfd9b67..76f036dbfa 100644 --- a/frame/1f/bli_l1f_ft.h +++ b/frame/1f/bli_l1f_ft.h @@ -44,14 +44,14 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha1, \ - ctype* alpha2, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha1, \ + const ctype* alpha2, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -64,14 +64,14 @@ INSERT_GENTDEF( axpy2v ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -84,15 +84,15 @@ INSERT_GENTDEF( axpyf ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -105,15 +105,15 @@ 
INSERT_GENTDEF( dotaxpyv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -126,19 +126,19 @@ INSERT_GENTDEF( dotxf ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* w, inc_t incw, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1f/bli_l1f_oapi.c b/frame/1f/bli_l1f_oapi.c index db8fdfb68c..0d879d686e 100644 --- a/frame/1f/bli_l1f_oapi.c +++ b/frame/1f/bli_l1f_oapi.c @@ -45,11 +45,11 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alphax, \ + const obj_t* alphay, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -115,10 +115,10 @@ GENFRONT( axpy2v ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -184,12 +184,12 @@ GENFRONT( axpyf ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - 
obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* xt, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -253,14 +253,14 @@ GENFRONT( dotaxpyv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* at, \ + const obj_t* a, \ + const obj_t* w, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -342,11 +342,11 @@ GENFRONT( dotxaxpyf ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/1f/bli_l1f_oapi.h b/frame/1f/bli_l1f_oapi.h index 0348c48714..fa00414c7f 100644 --- a/frame/1f/bli_l1f_oapi.h +++ b/frame/1f/bli_l1f_oapi.h @@ -42,11 +42,11 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alphax, \ - obj_t* alphay, \ - obj_t* x, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alphax, \ + const obj_t* alphay, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); @@ -58,10 +58,10 @@ GENTPROT( axpy2v ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -73,12 +73,12 @@ GENTPROT( axpyf ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* xt, \ - obj_t* x, \ - obj_t* y, \ - obj_t* rho, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* xt, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* rho, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); @@ -90,14 +90,14 @@ GENTPROT( dotaxpyv ) \ 
BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* at, \ - obj_t* a, \ - obj_t* w, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y, \ - obj_t* z \ + const obj_t* alpha, \ + const obj_t* at, \ + const obj_t* a, \ + const obj_t* w, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y, \ + const obj_t* z \ BLIS_OAPI_EX_PARAMS \ ); @@ -109,11 +109,11 @@ GENTPROT( dotxaxpyf ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/1f/bli_l1f_tapi.c b/frame/1f/bli_l1f_tapi.c index a543792998..31d2553ba9 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -45,14 +45,14 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alphax, \ - ctype* alphay, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alphax, \ + const ctype* alphay, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -89,14 +89,14 @@ INSERT_GENTFUNC_BASIC( axpy2v, BLIS_AXPY2V_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -133,15 +133,15 @@ INSERT_GENTFUNC_BASIC( axpyf, BLIS_AXPYF_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* 
y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -179,19 +179,19 @@ INSERT_GENTFUNC_BASIC( dotaxpyv, BLIS_DOTAXPYV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* w, inc_t incw, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -233,15 +233,15 @@ INSERT_GENTFUNC_BASIC( dotxaxpyf, BLIS_DOTXAXPYF_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/1f/bli_l1f_tapi.h b/frame/1f/bli_l1f_tapi.h index 2138b989df..93ef982933 100644 --- a/frame/1f/bli_l1f_tapi.h +++ b/frame/1f/bli_l1f_tapi.h @@ -42,14 +42,14 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alphax, \ - ctype* alphay, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjx, \ + conj_t conjy, \ + 
dim_t n, \ + const ctype* alphax, \ + const ctype* alphay, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -61,14 +61,14 @@ INSERT_GENTPROT_BASIC0( axpy2v ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -80,15 +80,15 @@ INSERT_GENTPROT_BASIC0( axpyf ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* rho, \ - ctype* z, inc_t incz \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* rho, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -100,19 +100,19 @@ INSERT_GENTPROT_BASIC0( dotaxpyv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* w, inc_t incw, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy, \ - ctype* z, inc_t incz \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* w, inc_t incw, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy, \ + ctype* z, inc_t incz \ BLIS_TAPI_EX_PARAMS \ ); @@ -124,15 +124,15 @@ INSERT_GENTPROT_BASIC0( dotxaxpyf ) \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ - ctype* alpha, \ - ctype* a, inc_t inca, inc_t lda, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ + const ctype* alpha, \ + const ctype* a, inc_t inca, inc_t lda, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1m/bli_l1m_check.c b/frame/1m/bli_l1m_check.c index 8914e43b1d..f5d4bf1b4b 100644 --- a/frame/1m/bli_l1m_check.c +++ b/frame/1m/bli_l1m_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1m_xy_check( x, y ); \ @@ -60,9 +60,9 @@ GENFRONT( subm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ) \ { \ bli_l1m_axy_check( alpha, x, y ); \ @@ -77,8 +77,8 @@ GENFRONT( scal2m ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ) \ { \ bli_l1m_ax_check( alpha, x ); \ @@ -93,9 +93,9 @@ GENFRONT( setm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ) \ { \ bli_l1m_axy_check( beta, x, y ); \ @@ -108,8 +108,8 @@ GENFRONT( xpbym ) void bli_l1m_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -149,9 +149,9 @@ void bli_l1m_xy_check void bli_l1m_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ) { err_t e_val; @@ -200,8 +200,8 @@ void bli_l1m_axy_check void bli_l1m_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ) { err_t e_val; diff --git a/frame/1m/bli_l1m_check.h b/frame/1m/bli_l1m_check.h index 030c0e2191..6089dfa177 100644 --- 
a/frame/1m/bli_l1m_check.h +++ b/frame/1m/bli_l1m_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ ); GENPROT( addm ) @@ -56,9 +56,9 @@ GENPROT( subm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ ); GENPROT( axpym ) @@ -70,8 +70,8 @@ GENPROT( scal2m ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ ); GENPROT( scalm ) @@ -83,9 +83,9 @@ GENPROT( setm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ ); GENPROT( xpbym ) @@ -95,20 +95,20 @@ GENPROT( xpbym ) void bli_l1m_xy_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); void bli_l1m_axy_check ( - obj_t* alpha, - obj_t* x, - obj_t* y + const obj_t* alpha, + const obj_t* x, + const obj_t* y ); void bli_l1m_ax_check ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + const obj_t* x ); diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index af6c384e53..fd5c023cb1 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -44,14 +44,14 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -66,15 +66,15 @@ INSERT_GENTDEF( copym ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, 
inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -87,15 +87,15 @@ INSERT_GENTDEF( axpym ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -108,14 +108,14 @@ INSERT_GENTDEF( scal2m ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -129,15 +129,15 @@ INSERT_GENTDEF( setm ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1m/bli_l1m_oapi.c b/frame/1m/bli_l1m_oapi.c index 840b058d4a..04d1477d0c 100644 --- a/frame/1m/bli_l1m_oapi.c +++ b/frame/1m/bli_l1m_oapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + 
const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -102,9 +102,9 @@ GENFRONT( subm ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -170,8 +170,8 @@ GENFRONT( scal2m ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -245,8 +245,8 @@ GENFRONT( scalm ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -307,9 +307,9 @@ GENFRONT( setm ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -377,9 +377,9 @@ GENFRONT( xpbym ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/1m/bli_l1m_oapi.h b/frame/1m/bli_l1m_oapi.h index a6a94cf9f6..184f7c0cd1 100644 --- a/frame/1m/bli_l1m_oapi.h +++ b/frame/1m/bli_l1m_oapi.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -57,9 +57,9 @@ GENPROT( subm ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -72,8 +72,8 @@ GENPROT( scal2m ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); @@ -86,9 +86,9 @@ GENPROT( setm ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ 
BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 0b60d4e2f6..496f79e0b2 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -45,12 +45,12 @@ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ - obj_t* a, \ + const obj_t* a, \ obj_t* p, \ - cntx_t* cntx, \ + const cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ - thrinfo_t* thread \ + const thrinfo_t* thread \ ); GENTDEF( packm ) @@ -61,11 +61,11 @@ GENTDEF( packm ) \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ - obj_t* p, \ - obj_t* a, \ - cntx_t* cntx, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* p, \ + const obj_t* a, \ + const cntx_t* cntx, \ + const cntl_t* cntl, \ + const thrinfo_t* thread \ ); GENTDEF( unpackm ) diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index 2b3c4bb4ab..ff84f03827 100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -45,14 +45,14 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -110,14 +110,14 @@ INSERT_GENTFUNC_BASIC( subm, subd ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -179,15 +179,15 @@ INSERT_GENTFUNC_BASIC0( copym ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, 
\ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -249,15 +249,15 @@ INSERT_GENTFUNC_BASIC0( axpym ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -341,14 +341,14 @@ INSERT_GENTFUNC_BASIC0( scal2m ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -387,15 +387,15 @@ INSERT_GENTFUNC_BASIC0( setm ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -474,15 +474,15 @@ INSERT_GENTFUNC_BASIC0( xpbym ) \ void PASTEMAC3(chx,chy,opname,EX_SUF) \ 
( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype_x* x, inc_t rs_x, inc_t cs_x, \ - ctype_y* beta, \ - ctype_y* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype_x* x, inc_t rs_x, inc_t cs_x, \ + const ctype_y* beta, \ + ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/1m/bli_l1m_tapi.h b/frame/1m/bli_l1m_tapi.h index 03a1196edd..ff99047ebc 100644 --- a/frame/1m/bli_l1m_tapi.h +++ b/frame/1m/bli_l1m_tapi.h @@ -42,14 +42,14 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -63,15 +63,15 @@ INSERT_GENTPROT_BASIC0( subm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -84,14 +84,14 @@ INSERT_GENTPROT_BASIC0( scal2m ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const 
ctype* alpha, \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -104,15 +104,15 @@ INSERT_GENTPROT_BASIC0( setm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* beta, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* beta, \ + ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); @@ -124,15 +124,15 @@ INSERT_GENTPROT_BASIC0( xpbym ) \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype_x* x, inc_t rs_x, inc_t cs_x, \ - ctype_y* beta, \ - ctype_y* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype_x* x, inc_t rs_x, inc_t cs_x, \ + const ctype_y* beta, \ + ctype_y* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index b12a93ddc0..bd292c9249 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -40,7 +40,7 @@ void* bli_packm_alloc siz_t size_needed, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ) { // Query the pack buffer type from the control tree node. @@ -62,7 +62,7 @@ void* bli_packm_alloc_ex packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ) { // Query the address of the mem_t entry within the control tree node. 
diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h index 5a5cf126b1..c7d0325aed 100644 --- a/frame/1m/packm/bli_packm_alloc.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -37,7 +37,7 @@ BLIS_EXPORT_BLIS void* bli_packm_alloc siz_t size_needed, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex @@ -46,6 +46,6 @@ BLIS_EXPORT_BLIS void* bli_packm_alloc_ex packbuf_t pack_buf_type, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index e133911510..7296423c8e 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -54,12 +54,12 @@ static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md); void bli_packm_blk_var1 ( - obj_t* c, - obj_t* p, - cntx_t* cntx, + const obj_t* c, + obj_t* p, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ) { // Extract various fields from the control tree. 
diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 9cda5828b5..f35bf3cf99 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -49,11 +49,11 @@ typedef struct BLIS_EXPORT_BLIS void bli_packm_blk_var1 ( - obj_t* c, - obj_t* p, - cntx_t* cntx, + const obj_t* c, + obj_t* p, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* t + const thrinfo_t* t ); diff --git a/frame/1m/packm/bli_packm_check.c b/frame/1m/packm/bli_packm_check.c index e662a85df0..15bd032cab 100644 --- a/frame/1m/packm/bli_packm_check.c +++ b/frame/1m/packm/bli_packm_check.c @@ -37,9 +37,9 @@ void bli_packm_init_check ( - obj_t* a, - obj_t* p, - cntx_t* cntx + const obj_t* a, + const obj_t* p, + const cntx_t* cntx ) { err_t e_val; @@ -59,9 +59,9 @@ void bli_packm_init_check void bli_packm_int_check ( - obj_t* a, - obj_t* p, - cntx_t* cntx + const obj_t* a, + const obj_t* p, + const cntx_t* cntx ) { err_t e_val; diff --git a/frame/1m/packm/bli_packm_check.h b/frame/1m/packm/bli_packm_check.h index be375fcf76..da9399b313 100644 --- a/frame/1m/packm/bli_packm_check.h +++ b/frame/1m/packm/bli_packm_check.h @@ -34,15 +34,15 @@ void bli_packm_init_check ( - obj_t* a, - obj_t* p, - cntx_t* cntx + const obj_t* a, + const obj_t* p, + const cntx_t* cntx ); void bli_packm_int_check ( - obj_t* a, - obj_t* p, - cntx_t* cntx + const obj_t* a, + const obj_t* p, + const cntx_t* cntx ); diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index 14bfe1ce85..b923682df6 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -46,37 +46,37 @@ struct packm_params_s }; typedef struct packm_params_s packm_params_t; -BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( cntl_t* cntl ) +BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; } -BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( 
cntl_t* cntl ) +BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; } -BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; } -BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; } -BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_lower( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; } -BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( cntl_t* cntl ) +BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; } -BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( cntl_t* cntl ) +BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl ) { packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; } diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 5a7d716fe6..e4cbcc49cf 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -37,12 +37,12 @@ bool bli_packm_init ( - obj_t* c, - obj_t* p, - cntx_t* cntx, + const obj_t* c, + obj_t* p, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ) { bli_init_once(); diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 152c6f15cd..6f0997bc3b 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -34,11 +34,11 @@ 
BLIS_EXPORT_BLIS bool bli_packm_init ( - obj_t* a, - obj_t* p, - cntx_t* cntx, + const obj_t* a, + obj_t* p, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index c9a2bb9db2..45872ebb07 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -36,12 +36,12 @@ void bli_packm_int ( - obj_t* a, - obj_t* p, - cntx_t* cntx, + const obj_t* a, + obj_t* p, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ) { bli_init_once(); diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 16a5c2c34d..389c49ad59 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -34,10 +34,10 @@ void bli_packm_int ( - obj_t* a, - obj_t* p, - cntx_t* cntx, + const obj_t* a, + obj_t* p, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, - thrinfo_t* thread + const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_part.c b/frame/1m/packm/bli_packm_part.c index 2fff4b7c87..feaaaeea80 100644 --- a/frame/1m/packm/bli_packm_part.c +++ b/frame/1m/packm/bli_packm_part.c @@ -38,11 +38,11 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_packm_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_packm_acquire_mpart_t2b( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m, n; @@ -110,11 +110,11 @@ void bli_packm_acquire_mpart_t2b( subpart_t requested_part, -void bli_packm_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_packm_acquire_mpart_l2r( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m, n; @@ -186,18 +186,18 @@ void bli_packm_acquire_mpart_l2r( subpart_t requested_part, -void 
bli_packm_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); } -dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ) +dim_t bli_packm_offset_to_panel_for( dim_t offmn, const obj_t* p ) { dim_t panel_off; diff --git a/frame/1m/packm/bli_packm_part.h b/frame/1m/packm/bli_packm_part.h index 5930d312ec..39ee69a2c3 100644 --- a/frame/1m/packm/bli_packm_part.h +++ b/frame/1m/packm/bli_packm_part.h @@ -34,23 +34,23 @@ // -- Matrix partitioning ------------------------------------------------------ -void bli_packm_acquire_mpart_t2b( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_packm_acquire_mpart_l2r( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -dim_t bli_packm_offset_to_panel_for( dim_t offmn, obj_t* p ); +void bli_packm_acquire_mpart_t2b( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ); + +void bli_packm_acquire_mpart_l2r( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ); + +void bli_packm_acquire_mpart_tl2br( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ); + +dim_t bli_packm_offset_to_panel_for( dim_t offmn, const obj_t* p ); diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index cbd9045d9d..d8193fbc1a 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -49,7 +49,7 @@ typedef void (*FUNCPTR_T)( void* p, inc_t rs_p, inc_t cs_p, dim_t pd_p, inc_t ps_p, void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx + const cntx_t* cntx ); 
static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); @@ -57,11 +57,11 @@ static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); void bli_unpackm_blk_var1 ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* p, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + const thrinfo_t* thread ) { num_t dt_cp = bli_obj_dt( c ); @@ -140,7 +140,7 @@ void PASTEMAC(ch,varname) \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ + const cntx_t* cntx \ ) \ { \ ctype* restrict one = PASTEMAC(ch,1); \ diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.h b/frame/1m/unpackm/bli_unpackm_blk_var1.h index abd0445493..a9c9f5548f 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.h +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.h @@ -34,11 +34,11 @@ void bli_unpackm_blk_var1 ( - obj_t* p, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* p, + const obj_t* c, + const cntx_t* cntx, + const cntl_t* cntl, + const thrinfo_t* thread ); @@ -59,7 +59,7 @@ void PASTEMAC(ch,varname) \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx \ + const cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_blk_var1 ) diff --git a/frame/1m/unpackm/bli_unpackm_check.c b/frame/1m/unpackm/bli_unpackm_check.c index 5bce60ed3b..e397c311f9 100644 --- a/frame/1m/unpackm/bli_unpackm_check.c +++ b/frame/1m/unpackm/bli_unpackm_check.c @@ -1,4 +1,4 @@ -/* +/* BLIS An object-based framework for developing high-performance BLAS-like @@ -36,9 +36,9 @@ void bli_unpackm_int_check ( - obj_t* p, - obj_t* a, - cntx_t* cntx + const obj_t* p, + const obj_t* a, + const cntx_t* cntx ) { err_t e_val; diff --git a/frame/1m/unpackm/bli_unpackm_check.h b/frame/1m/unpackm/bli_unpackm_check.h index d2a976dd8d..697010fa70 100644 --- a/frame/1m/unpackm/bli_unpackm_check.h +++ b/frame/1m/unpackm/bli_unpackm_check.h @@ -34,8 +34,8 @@ void 
bli_unpackm_int_check ( - obj_t* p, - obj_t* a, - cntx_t* cntx + const obj_t* p, + const obj_t* a, + const cntx_t* cntx ); diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index 550a8fb870..f67cae084a 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -36,11 +36,11 @@ void bli_unpackm_int ( - obj_t* p, - obj_t* a, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* p, + const obj_t* a, + const cntx_t* cntx, + const cntl_t* cntl, + const thrinfo_t* thread ) { bli_init_once(); diff --git a/frame/1m/unpackm/bli_unpackm_int.h b/frame/1m/unpackm/bli_unpackm_int.h index cb66d09753..8258ea3676 100644 --- a/frame/1m/unpackm/bli_unpackm_int.h +++ b/frame/1m/unpackm/bli_unpackm_int.h @@ -34,10 +34,10 @@ void bli_unpackm_int ( - obj_t* p, - obj_t* a, - cntx_t* cntx, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* p, + const obj_t* a, + const cntx_t* cntx, + const cntl_t* cntl, + const thrinfo_t* thread ); diff --git a/frame/2/bli_l2_check.c b/frame/2/bli_l2_check.c index fac91fec42..7c93a860d3 100644 --- a/frame/2/bli_l2_check.c +++ b/frame/2/bli_l2_check.c @@ -36,11 +36,11 @@ void bli_gemv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -66,11 +66,11 @@ void bli_gemv_check void bli_hemv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -101,11 +101,11 @@ void bli_hemv_check void bli_symv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -136,9 +136,9 @@ void bli_symv_check void bli_trmv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x + const obj_t* alpha, + const 
obj_t* a, + const obj_t* x ) { err_t e_val; @@ -166,9 +166,9 @@ void bli_trmv_check void bli_trsv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x + const obj_t* alpha, + const obj_t* a, + const obj_t* x ) { err_t e_val; @@ -196,10 +196,10 @@ void bli_trsv_check void bli_ger_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ) { err_t e_val; @@ -225,9 +225,9 @@ void bli_ger_check void bli_her_check ( - obj_t* alpha, - obj_t* x, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* a ) { err_t e_val; @@ -255,10 +255,10 @@ void bli_her_check void bli_her2_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ) { err_t e_val; @@ -289,9 +289,9 @@ void bli_her2_check void bli_syr_check ( - obj_t* alpha, - obj_t* x, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* a ) { err_t e_val; @@ -319,10 +319,10 @@ void bli_syr_check void bli_syr2_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ) { err_t e_val; @@ -355,11 +355,11 @@ void bli_syr2_check void bli_xxmv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ) { err_t e_val; @@ -424,10 +424,10 @@ void bli_xxmv_check void bli_xxr_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ) { err_t e_val; diff --git a/frame/2/bli_l2_check.h b/frame/2/bli_l2_check.h index af9388753f..690abf8616 100644 --- a/frame/2/bli_l2_check.h +++ b/frame/2/bli_l2_check.h @@ -42,11 +42,11 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* 
beta, \ + const obj_t* y \ ); GENPROT( gemv ) @@ -59,10 +59,10 @@ GENPROT( symv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* a \ ); GENPROT( ger ) @@ -75,9 +75,9 @@ GENPROT( syr2 ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* a \ ); GENPROT( her ) @@ -89,9 +89,9 @@ GENPROT( syr ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x \ ); GENPROT( trmv ) @@ -102,17 +102,17 @@ GENPROT( trsv ) void bli_xxmv_check ( - obj_t* alpha, - obj_t* a, - obj_t* x, - obj_t* beta, - obj_t* y + const obj_t* alpha, + const obj_t* a, + const obj_t* x, + const obj_t* beta, + const obj_t* y ); void bli_xxr_check ( - obj_t* alpha, - obj_t* x, - obj_t* y, - obj_t* a + const obj_t* alpha, + const obj_t* x, + const obj_t* y, + const obj_t* a ); diff --git a/frame/2/bli_l2_ft.h b/frame/2/bli_l2_ft.h index 73aa4dd489..410b796658 100644 --- a/frame/2/bli_l2_ft.h +++ b/frame/2/bli_l2_ft.h @@ -44,15 +44,15 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -65,14 +65,14 @@ INSERT_GENTDEF( gemv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + conj_t conjx, \ + conj_t conjy, \ + 
dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -85,15 +85,15 @@ INSERT_GENTDEF( ger ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -107,12 +107,12 @@ INSERT_GENTDEF( symv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype_r* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -125,12 +125,12 @@ INSERT_GENTDEFR( her ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -143,14 +143,14 @@ INSERT_GENTDEF( syr ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ 
-164,13 +164,13 @@ INSERT_GENTDEF( syr2 ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/2/bli_l2_oapi.c b/frame/2/bli_l2_oapi.c index cc32fb61e6..0d26fecfd0 100644 --- a/frame/2/bli_l2_oapi.c +++ b/frame/2/bli_l2_oapi.c @@ -45,11 +45,11 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -118,10 +118,10 @@ GENFRONT( gemv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -184,11 +184,11 @@ GENFRONT( ger ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -258,9 +258,9 @@ GENFRONT( symv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -319,10 +319,10 @@ GENFRONT( syr ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -386,9 +386,9 @@ GENFRONT( syr2 ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ + const obj_t* alpha, \ + const 
obj_t* a, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/2/bli_l2_oapi.h b/frame/2/bli_l2_oapi.h index 6b6a1d77ec..317277f222 100644 --- a/frame/2/bli_l2_oapi.h +++ b/frame/2/bli_l2_oapi.h @@ -42,11 +42,11 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x, \ - obj_t* beta, \ - obj_t* y \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x, \ + const obj_t* beta, \ + const obj_t* y \ BLIS_OAPI_EX_PARAMS \ ); @@ -60,10 +60,10 @@ GENPROT( symv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* y, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* y, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); @@ -77,9 +77,9 @@ GENPROT( syr2 ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* x, \ - obj_t* a \ + const obj_t* alpha, \ + const obj_t* x, \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); @@ -92,9 +92,9 @@ GENPROT( syr ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* x \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c index f6eb6c7d93..17f50ac7bc 100644 --- a/frame/2/bli_l2_tapi.c +++ b/frame/2/bli_l2_tapi.c @@ -45,15 +45,15 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -128,14 +128,14 @@ INSERT_GENTFUNC_BASIC3( gemv, gemv, gemv_unf_var1, gemv_unf_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ 
- dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -180,15 +180,15 @@ INSERT_GENTFUNC_BASIC3( ger, ger, ger_unb_var1, ger_unb_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -257,12 +257,12 @@ INSERT_GENTFUNC_BASIC4( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_v \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype_r* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -321,12 +321,12 @@ INSERT_GENTFUNCR_BASIC4( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -378,14 +378,14 @@ INSERT_GENTFUNC_BASIC4( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t 
conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -440,13 +440,13 @@ INSERT_GENTFUNC_BASIC4( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_v \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ) \ { \ diff --git a/frame/2/bli_l2_tapi.h b/frame/2/bli_l2_tapi.h index 4b45236e23..072c87a2f1 100644 --- a/frame/2/bli_l2_tapi.h +++ b/frame/2/bli_l2_tapi.h @@ -42,15 +42,15 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -62,14 +62,14 @@ INSERT_GENTPROT_BASIC0( gemv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -81,15 +81,15 
@@ INSERT_GENTPROT_BASIC0( ger ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx, \ - ctype* beta, \ - ctype* y, inc_t incy \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* x, inc_t incx, \ + const ctype* beta, \ + ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ ); @@ -102,12 +102,12 @@ INSERT_GENTPROT_BASIC0( symv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype_r* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype_r* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -119,12 +119,12 @@ INSERT_GENTPROTR_BASIC0( her ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -136,14 +136,14 @@ INSERT_GENTPROT_BASIC0( syr ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - ctype* alpha, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - ctype* a, inc_t rs_a, inc_t cs_a \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + ctype* a, inc_t rs_a, inc_t cs_a \ BLIS_TAPI_EX_PARAMS \ ); @@ -156,13 +156,13 @@ INSERT_GENTPROT_BASIC0( syr2 ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - 
diag_t diaga, \ - dim_t m, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* x, inc_t incx \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ ); diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 1986b3b0f6..284c92b011 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -40,11 +40,11 @@ dim_t bli_l3_determine_kc dir_t direct, dim_t i, dim_t dim, - obj_t* a, - obj_t* b, + const obj_t* a, + const obj_t* b, bszid_t bszid, - cntx_t* cntx, - cntl_t* cntl + const cntx_t* cntx, + const cntl_t* cntl ) { opid_t family = bli_cntl_family( cntl ); @@ -78,10 +78,10 @@ dim_t PASTEMAC0(opname) \ dir_t direct, \ dim_t i, \ dim_t dim, \ - obj_t* a, \ - obj_t* b, \ + const obj_t* a, \ + const obj_t* b, \ bszid_t bszid, \ - cntx_t* cntx \ + const cntx_t* cntx \ ) \ { \ if ( direct == BLIS_FWD ) \ @@ -104,14 +104,14 @@ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ - obj_t* a, \ - obj_t* b, \ + const obj_t* a, \ + const obj_t* b, \ bszid_t bszid, \ - cntx_t* cntx \ + const cntx_t* cntx \ ) \ { \ num_t dt; \ - blksz_t* bsize; \ + const blksz_t* bsize; \ dim_t mnr; \ dim_t b_alg, b_max; \ dim_t b_use; \ @@ -171,14 +171,14 @@ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ - obj_t* a, \ - obj_t* b, \ + const obj_t* a, \ + const obj_t* b, \ bszid_t bszid, \ - cntx_t* cntx \ + const cntx_t* cntx \ ) \ { \ num_t dt; \ - blksz_t* bsize; \ + const blksz_t* bsize; \ dim_t b_alg, b_max; \ dim_t b_use; \ \ @@ -223,14 +223,14 @@ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ - obj_t* a, \ - obj_t* b, \ + const obj_t* a, \ + const obj_t* b, \ bszid_t bszid, \ - cntx_t* cntx \ + const cntx_t* cntx \ ) \ { \ num_t dt; \ - blksz_t* bsize; \ + const blksz_t* bsize; \ dim_t mnr; \ dim_t b_alg, b_max; \ dim_t b_use; \ @@ -284,14 +284,14 @@ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t 
dim, \ - obj_t* a, \ - obj_t* b, \ + const obj_t* a, \ + const obj_t* b, \ bszid_t bszid, \ - cntx_t* cntx \ + const cntx_t* cntx \ ) \ { \ num_t dt; \ - blksz_t* bsize; \ + const blksz_t* bsize; \ dim_t mnr; \ dim_t b_alg, b_max; \ dim_t b_use; \ diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 3ea3c5aa02..cae6c85199 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -37,11 +37,11 @@ dim_t bli_l3_determine_kc dir_t direct, dim_t i, dim_t dim, - obj_t* a, - obj_t* b, + const obj_t* a, + const obj_t* b, bszid_t bszid, - cntx_t* cntx, - cntl_t* cntl + const cntx_t* cntx, + const cntl_t* cntl ); @@ -53,10 +53,10 @@ dim_t PASTEMAC0(opname) \ dir_t direct, \ dim_t i, \ dim_t dim, \ - obj_t* a, \ - obj_t* b, \ + const obj_t* a, \ + const obj_t* b, \ bszid_t bszid, \ - cntx_t* cntx \ + const cntx_t* cntx \ ); GENPROT( gemm_determine_kc ) @@ -72,10 +72,10 @@ dim_t PASTEMAC0(opname) \ ( \ dim_t i, \ dim_t dim, \ - obj_t* a, \ - obj_t* b, \ + const obj_t* a, \ + const obj_t* b, \ bszid_t bszid, \ - cntx_t* cntx \ + const cntx_t* cntx \ ); GENPROT( gemm_determine_kc_f ) diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 3e7882bc39..9c25922aae 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -36,12 +36,12 @@ void bli_gemm_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { //err_t e_val; @@ -65,12 +65,12 @@ void bli_gemm_check void bli_gemmt_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -88,12 +88,12 @@ void bli_gemmt_check void bli_hemm_check ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* 
alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -110,15 +110,15 @@ void bli_hemm_check void bli_herk_check ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; - obj_t ah; + obj_t ah; // Alias A to A^H so we can perform dimension checks. bli_obj_alias_with_trans( BLIS_CONJ_TRANSPOSE, a, &ah ); @@ -143,12 +143,12 @@ void bli_herk_check void bli_her2k_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -176,12 +176,12 @@ void bli_her2k_check void bli_symm_check ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -198,11 +198,11 @@ void bli_symm_check void bli_syrk_check ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -223,12 +223,12 @@ void bli_syrk_check void bli_syr2k_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -251,12 +251,12 @@ void bli_syr2k_check void bli_trmm3_check ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -274,10 +274,10 @@ void bli_trmm3_check void bli_trmm_check ( side_t side, - obj_t* 
alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx ) { err_t e_val; @@ -295,10 +295,10 @@ void bli_trmm_check void bli_trsm_check ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx ) { err_t e_val; @@ -317,12 +317,12 @@ void bli_trsm_check void bli_gemm_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -367,12 +367,12 @@ void bli_gemm_basic_check void bli_gemmt_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -390,12 +390,12 @@ void bli_gemmt_basic_check void bli_hemm_basic_check ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -433,12 +433,12 @@ void bli_hemm_basic_check void bli_herk_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* ah, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -476,14 +476,14 @@ void bli_herk_basic_check void bli_her2k_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* bh, - obj_t* b, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* bh, + const obj_t* b, + const obj_t* ah, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; @@ -537,12 +537,12 @@ void bli_her2k_basic_check void bli_l3_basic_check ( - obj_t* 
alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ) { err_t e_val; diff --git a/frame/3/bli_l3_check.h b/frame/3/bli_l3_check.h index c600d60b9a..b8ea6661d9 100644 --- a/frame/3/bli_l3_check.h +++ b/frame/3/bli_l3_check.h @@ -42,12 +42,12 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx \ ); GENPROT( gemm ) @@ -62,12 +62,12 @@ GENPROT( syr2k ) void PASTEMAC(opname,_check) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx \ ); GENPROT( hemm ) @@ -80,11 +80,11 @@ GENPROT( trmm3 ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx \ ); GENPROT( herk ) @@ -97,10 +97,10 @@ GENPROT( syrk ) void PASTEMAC(opname,_check) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - cntx_t* cntx \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const cntx_t* cntx \ ); GENPROT( trmm ) @@ -111,63 +111,63 @@ GENPROT( trsm ) void bli_gemm_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_gemmt_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const 
obj_t* c, + const cntx_t* cntx ); void bli_hemm_basic_check ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_herk_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* ah, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_her2k_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* bh, - obj_t* b, - obj_t* ah, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* bh, + const obj_t* b, + const obj_t* ah, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); void bli_l3_basic_check ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx ); diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 83ff8e5af5..025e5a7248 100644 --- a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -41,9 +41,9 @@ void bli_l3_cntl_create_if opid_t family, pack_t schema_a, pack_t schema_b, - obj_t* a, - obj_t* b, - obj_t* c, + const obj_t* a, + const obj_t* b, + const obj_t* c, rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index c308c8a964..2ba68feca2 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -43,9 +43,9 @@ void bli_l3_cntl_create_if opid_t family, pack_t schema_a, pack_t schema_b, - obj_t* a, - obj_t* b, - obj_t* c, + const obj_t* a, + const obj_t* b, + const obj_t* c, rntm_t* rntm, cntl_t* cntl_orig, cntl_t** cntl_use diff --git a/frame/3/bli_l3_direct.c b/frame/3/bli_l3_direct.c index 0d0a719214..bbc4af7a0c 100644 --- a/frame/3/bli_l3_direct.c +++ b/frame/3/bli_l3_direct.c @@ -36,10 +36,10 @@ dir_t 
bli_l3_direct ( - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl ) { // Query the operation family. @@ -58,9 +58,9 @@ dir_t bli_l3_direct dir_t bli_gemm_direct ( - obj_t* a, - obj_t* b, - obj_t* c + const obj_t* a, + const obj_t* b, + const obj_t* c ) { // For gemm, movement may be forwards (or backwards). @@ -70,9 +70,9 @@ dir_t bli_gemm_direct dir_t bli_gemmt_direct ( - obj_t* a, - obj_t* b, - obj_t* c + const obj_t* a, + const obj_t* b, + const obj_t* c ) { // For gemmt, movement may be forwards (or backwards). @@ -82,9 +82,9 @@ dir_t bli_gemmt_direct dir_t bli_trmm_direct ( - obj_t* a, - obj_t* b, - obj_t* c + const obj_t* a, + const obj_t* b, + const obj_t* c ) { dir_t direct; @@ -111,9 +111,9 @@ dir_t bli_trmm_direct dir_t bli_trsm_direct ( - obj_t* a, - obj_t* b, - obj_t* c + const obj_t* a, + const obj_t* b, + const obj_t* c ) { dir_t direct; diff --git a/frame/3/bli_l3_direct.h b/frame/3/bli_l3_direct.h index 39798407a2..1d644317df 100644 --- a/frame/3/bli_l3_direct.h +++ b/frame/3/bli_l3_direct.h @@ -34,10 +34,10 @@ dir_t bli_l3_direct ( - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl ); // ----------------------------------------------------------------------------- @@ -47,9 +47,9 @@ dir_t bli_l3_direct \ dir_t PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c \ ); GENPROT( gemm_direct ) diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index d4b974030c..a88bd5249a 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -36,12 +36,12 @@ void bli_l3_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread diff --git 
a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h index d76b0ac3e2..9648670feb 100644 --- a/frame/3/bli_l3_int.h +++ b/frame/3/bli_l3_int.h @@ -34,12 +34,12 @@ void bli_l3_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 1df8e80123..52e8234028 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -43,11 +43,11 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -67,11 +67,11 @@ GENFRONT( syr2k ) void PASTEMAC0(opname) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -89,10 +89,10 @@ GENFRONT( trmm3 ) \ void PASTEMAC0(opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -110,9 +110,9 @@ GENFRONT( syrk ) void PASTEMAC0(opname) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h index e00f238add..bdef0217ab 100644 --- a/frame/3/bli_l3_oapi.h +++ b/frame/3/bli_l3_oapi.h @@ -43,11 +43,11 @@ \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ 
( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ ); GENPROT( gemm ) @@ -62,11 +62,11 @@ GENPROT( syr2k ) BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c \ ); GENPROT( hemm ) @@ -79,10 +79,10 @@ GENPROT( trmm3 ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c \ ); GENPROT( herk ) @@ -95,9 +95,9 @@ GENPROT( syrk ) BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b \ ); GENPROT( trmm ) diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index cd0df7017c..342131ecde 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -44,11 +44,11 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -117,11 +117,11 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -166,11 +166,11 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -212,11 +212,11 @@ 
void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -245,11 +245,11 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -295,11 +295,11 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) void PASTEMAC(symm,BLIS_OAPI_EX_SUF) ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -345,11 +345,11 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -394,10 +394,10 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) void PASTEMAC(herk,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -428,10 +428,10 @@ void PASTEMAC(herk,BLIS_OAPI_EX_SUF) void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) ( - obj_t* alpha, - obj_t* a, - obj_t* beta, - obj_t* c, + const obj_t* alpha, + const obj_t* a, + const obj_t* beta, + const obj_t* c, cntx_t* cntx, rntm_t* rntm ) @@ -454,9 +454,9 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, cntx_t* cntx, rntm_t* rntm ) @@ -501,9 +501,9 @@ void 
PASTEMAC(trmm,BLIS_OAPI_EX_SUF) void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) ( side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, cntx_t* cntx, rntm_t* rntm ) diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h index 946a7aa175..68b98aa064 100644 --- a/frame/3/bli_l3_oapi_ex.h +++ b/frame/3/bli_l3_oapi_ex.h @@ -43,11 +43,11 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); @@ -64,11 +64,11 @@ GENPROT( syr2k ) BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); @@ -83,10 +83,10 @@ GENPROT( trmm3 ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c, \ cntx_t* cntx, \ rntm_t* rntm \ ); @@ -101,9 +101,9 @@ GENPROT( syrk ) BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ cntx_t* cntx, \ rntm_t* rntm \ ); diff --git a/frame/3/bli_l3_oft.h b/frame/3/bli_l3_oft.h index e7c8dcca31..22496faefa 100644 --- a/frame/3/bli_l3_oft.h +++ b/frame/3/bli_l3_oft.h @@ -48,12 +48,12 @@ \ typedef void (*PASTECH(opname,_oft)) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ rntm_t* 
rntm \ ); @@ -71,12 +71,12 @@ GENTDEF( syr2k ) typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ rntm_t* rntm \ ); @@ -92,11 +92,11 @@ GENTDEF( trmm3 ) \ typedef void (*PASTECH(opname,_oft)) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ rntm_t* rntm \ ); @@ -112,10 +112,10 @@ GENTDEF( syrk ) typedef void (*PASTECH(opname,_oft)) \ ( \ side_t side, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - cntx_t* cntx, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const cntx_t* cntx, \ rntm_t* rntm \ ); diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index ea10d80904..016fe79418 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -45,10 +45,10 @@ \ typedef void (*PASTECH(opname,_var_oft)) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ rntm_t* rntm, \ cntl_t* cntl, \ thrinfo_t* thread \ diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c index d911819429..1ea97beade 100644 --- a/frame/3/bli_l3_packab.c +++ b/frame/3/bli_l3_packab.c @@ -36,10 +36,10 @@ void bli_l3_packa ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread @@ -84,10 +84,10 @@ void bli_l3_packa void bli_l3_packb ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread diff --git a/frame/3/bli_l3_packab.h 
b/frame/3/bli_l3_packab.h index 380ca72123..1901eea434 100644 --- a/frame/3/bli_l3_packab.h +++ b/frame/3/bli_l3_packab.h @@ -34,10 +34,10 @@ void bli_l3_packa ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread @@ -45,10 +45,10 @@ void bli_l3_packa void bli_l3_packb ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, cntl_t* cntl, thrinfo_t* thread diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index 6ca8244cbb..d227a3e7d8 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -40,7 +40,7 @@ void bli_l3_prune_unref_mparts_m obj_t* a, obj_t* b, obj_t* c, - cntl_t* cntl + const cntl_t* cntl ) { // Query the operation family. @@ -61,7 +61,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ obj_t* a, \ obj_t* b, \ obj_t* c, \ - cntl_t* cntl \ + const cntl_t* cntl \ ) \ { \ /* Query the operation family. */ \ diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index ad8f07dc43..887f2b5e6d 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -41,7 +41,7 @@ void PASTEMAC(l3_prune_unref_mparts_,dim) \ obj_t* a, \ obj_t* b, \ obj_t* c, \ - cntl_t* cntl \ + const cntl_t* cntl \ ); GENPROT( m ) diff --git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c index 1d46087997..1b03468776 100644 --- a/frame/3/bli_l3_schema.c +++ b/frame/3/bli_l3_schema.c @@ -39,7 +39,7 @@ void bli_l3_set_schemas obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + const cntx_t* cntx ) { // Begin with pack schemas for native execution. 
diff --git a/frame/3/bli_l3_schema.h b/frame/3/bli_l3_schema.h index c6a12ce520..5ec5be3ccc 100644 --- a/frame/3/bli_l3_schema.h +++ b/frame/3/bli_l3_schema.h @@ -37,5 +37,5 @@ void bli_l3_set_schemas obj_t* a, obj_t* b, obj_t* c, - cntx_t* cntx + const cntx_t* cntx ); diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index 3da3954fa7..b95fa1368b 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -36,12 +36,12 @@ err_t bli_gemmsup_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) @@ -240,12 +240,12 @@ err_t bli_gemmsup_int err_t bli_gemmtsup_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h index c6cb88056e..6d3abdf5c5 100644 --- a/frame/3/bli_l3_sup_int.h +++ b/frame/3/bli_l3_sup_int.h @@ -34,24 +34,24 @@ err_t bli_gemmsup_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); err_t bli_gemmtsup_int ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup_oft.h b/frame/3/bli_l3_sup_oft.h index 98a06cf57e..6d9cb09e49 100644 --- a/frame/3/bli_l3_sup_oft.h +++ b/frame/3/bli_l3_sup_oft.h @@ -47,12 +47,12 @@ \ typedef err_t (*PASTECH(opname,_oft)) \ ( \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ 
- obj_t* beta, \ - obj_t* c, \ - cntx_t* cntx, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + const cntx_t* cntx, \ rntm_t* rntm \ ); diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c index 56726c5f8c..e89cc15601 100644 --- a/frame/3/bli_l3_sup_packm_a.c +++ b/frame/3/bli_l3_sup_packm_a.c @@ -45,10 +45,10 @@ void PASTEMAC(ch,opname) \ dim_t m, \ dim_t k, \ dim_t mr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ) \ { \ /* Inspect whether we are going to be packing matrix A. */ \ @@ -175,9 +175,9 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a ) void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ) \ { \ /* Inspect whether we previously packed matrix A. 
*/ \ diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h index 95c9582e79..6b40f950a6 100644 --- a/frame/3/bli_l3_sup_packm_a.h +++ b/frame/3/bli_l3_sup_packm_a.h @@ -43,10 +43,10 @@ void PASTEMAC(ch,opname) \ dim_t m, \ dim_t k, \ dim_t mr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) @@ -58,9 +58,9 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + rntm_t* rntm, \ + mem_t* mem, \ + const thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) diff --git a/frame/base/bli_apool.h b/frame/base/bli_apool.h index e6e91958af..e7ea722d6f 100644 --- a/frame/base/bli_apool.h +++ b/frame/base/bli_apool.h @@ -61,16 +61,14 @@ BLIS_INLINE bli_pthread_mutex_t* bli_apool_mutex( apool_t* apool ) return &(apool->mutex); } -BLIS_INLINE siz_t bli_apool_def_array_len( apool_t* pool ) +BLIS_INLINE siz_t bli_apool_def_array_len( const apool_t* pool ) { return pool->def_array_len; } -BLIS_INLINE bool bli_apool_is_exhausted( apool_t* apool ) +BLIS_INLINE bool bli_apool_is_exhausted( const apool_t* apool ) { - pool_t* restrict pool = bli_apool_pool( apool ); - - return bli_pool_is_exhausted( pool ); + return bli_pool_is_exhausted( &apool->pool ); } // apool action diff --git a/frame/base/bli_arch.c b/frame/base/bli_arch.c index 54aa64d42c..48b50a7748 100644 --- a/frame/base/bli_arch.c +++ b/frame/base/bli_arch.c @@ -121,7 +121,7 @@ void bli_arch_set_id( void ) // initialized. Query the address of an internal context data structure // corresponding to req_id. This pointer will be NULL if the associated // subconfig is not available. 
- cntx_t** req_cntx = bli_gks_lookup_id( req_id ); + const cntx_t* const * req_cntx = bli_gks_lookup_id( req_id ); // This function checks the context pointer and aborts with a useful // error message if the pointer is found to be NULL. @@ -253,7 +253,7 @@ void bli_arch_set_id( void ) // enumeration that is typedef'ed in bli_type_defs.h. That is, the // index order of each string should correspond to the implied/assigned // enum value given to the corresponding BLIS_ARCH_ value. -static char* config_name[ BLIS_NUM_ARCHS ] = +static const char* config_name[ BLIS_NUM_ARCHS ] = { "skx", "knl", @@ -283,11 +283,11 @@ static char* config_name[ BLIS_NUM_ARCHS ] = "power9", "power7", "bgq", - + "generic" }; -char* bli_arch_string( arch_t id ) +const char* bli_arch_string( arch_t id ) { return config_name[ id ]; } @@ -306,9 +306,9 @@ bool bli_arch_get_logging( void ) return arch_dolog; } -void bli_arch_log( char* fmt, ... ) +void bli_arch_log( const char* fmt, ... ) { - char prefix[] = "libblis: "; + const char prefix[] = "libblis: "; int n_chars = strlen( prefix ) + strlen( fmt ) + 1; if ( bli_arch_get_logging() && fmt ) diff --git a/frame/base/bli_arch.h b/frame/base/bli_arch.h index 0cd55dace3..08af7ae79d 100644 --- a/frame/base/bli_arch.h +++ b/frame/base/bli_arch.h @@ -40,11 +40,11 @@ BLIS_EXPORT_BLIS arch_t bli_arch_query_id( void ); void bli_arch_set_id_once( void ); void bli_arch_set_id( void ); -BLIS_EXPORT_BLIS char* bli_arch_string( arch_t id ); +BLIS_EXPORT_BLIS const char* bli_arch_string( arch_t id ); void bli_arch_set_logging( bool dolog ); bool bli_arch_get_logging( void ); -void bli_arch_log( char*, ... ); +void bli_arch_log( const char*, ... 
); #endif diff --git a/frame/base/bli_array.c b/frame/base/bli_array.c index 3844cd52f7..ae46eb4e17 100644 --- a/frame/base/bli_array.c +++ b/frame/base/bli_array.c @@ -38,9 +38,9 @@ void bli_array_init ( - const siz_t num_elem, - const siz_t elem_size, - array_t* restrict array + siz_t num_elem, + siz_t elem_size, + array_t* array ) { err_t r_val; @@ -54,7 +54,7 @@ void bli_array_init const size_t array_size = num_elem * elem_size; // Allocate the array buffer. - void* restrict buf = bli_malloc_intl( array_size, &r_val ); + void* buf = bli_malloc_intl( array_size, &r_val ); // Initialize the array elements to zero. THIS IS IMPORANT because // consumer threads will use the NULL-ness of the array elements to @@ -70,8 +70,8 @@ void bli_array_init void bli_array_resize ( - const siz_t num_elem_new, - array_t* restrict array + siz_t num_elem_new, + array_t* array ) { err_t r_val; @@ -94,7 +94,7 @@ void bli_array_resize const size_t array_size_new = num_elem_new * elem_size; // Query the previous array buffer. - void* restrict buf_prev = bli_array_buf( array ); + void* buf_prev = bli_array_buf( array ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_array_resize(): allocating array [%d * %d]: ", @@ -102,7 +102,7 @@ void bli_array_resize #endif // Allocate a new array buffer. - char* restrict buf_new = bli_malloc_intl( array_size_new, &r_val ); + char* buf_new = bli_malloc_intl( array_size_new, &r_val ); // Copy the previous array contents to the new array. memcpy( buf_new, buf_prev, array_size_prev ); @@ -129,7 +129,7 @@ void bli_array_resize void bli_array_finalize ( - array_t* restrict array + array_t* array ) { #ifdef BLIS_ENABLE_MEM_TRACING @@ -138,7 +138,7 @@ void bli_array_finalize #endif // Query the buffer from the array. - void* restrict buf = bli_array_buf( array ); + void* buf = bli_array_buf( array ); // Free the buffer. 
bli_free_intl( buf ); @@ -146,8 +146,8 @@ void bli_array_finalize void* bli_array_elem ( - const siz_t index, - array_t* restrict array + siz_t index, + const array_t* array ) { // Query the number of elements in the array. @@ -161,7 +161,7 @@ void* bli_array_elem // Query the buffer from the array, but store it as a char* so we can use // it to easily perform byte pointer arithmetic. - char* restrict buf = bli_array_buf( array ); + char* buf = bli_array_buf( array ); // Advance the pointer by (index * elem_size) bytes. buf += index * elem_size; @@ -172,17 +172,19 @@ void* bli_array_elem void bli_array_set_elem ( - void* restrict elem, - const siz_t index, - array_t* restrict array + void* elem, + siz_t index, + array_t* array ) { // Query the size of each element in the array. const siz_t elem_size = bli_array_elem_size( array ); // Query the buffer from the array as a char*. - char* restrict buf = bli_array_buf( array ); + char* buf = bli_array_buf( array ); +// memcpy() is the only safe way to copy data of unknown type +#if 0 if ( elem_size == sizeof( void* ) ) { #ifdef BLIS_ENABLE_MEM_TRACING @@ -193,16 +195,19 @@ void bli_array_set_elem // Special case: Handle elem_size = sizeof( void* ) without calling // memcpy(). - void** restrict buf_vvp = ( void** )buf; - void** restrict elem_vvp = ( void** )elem; + void** buf_vvp = ( void** )buf; + void** elem_vvp = ( void** )elem; buf_vvp[ index ] = *elem_vvp; } else { +#endif // General case: Copy the elem_size bytes from elem to buf at the // element index specified by index. 
memcpy( &buf[ index * elem_size ], elem, ( size_t )elem_size ); +#if 0 } +#endif } diff --git a/frame/base/bli_array.h b/frame/base/bli_array.h index 4cb00496b2..d05801f27f 100644 --- a/frame/base/bli_array.h +++ b/frame/base/bli_array.h @@ -51,17 +51,17 @@ typedef struct // Array entry query -BLIS_INLINE void* bli_array_buf( array_t* array ) +BLIS_INLINE void* bli_array_buf( const array_t* array ) { return array->buf; } -BLIS_INLINE siz_t bli_array_num_elem( array_t* array ) +BLIS_INLINE siz_t bli_array_num_elem( const array_t* array ) { return array->num_elem; } -BLIS_INLINE siz_t bli_array_elem_size( array_t* array ) +BLIS_INLINE siz_t bli_array_elem_size( const array_t* array ) { return array->elem_size; } @@ -87,30 +87,30 @@ BLIS_INLINE void bli_array_set_elem_size( siz_t elem_size, array_t* array ) \ void bli_array_init ( - const siz_t num_elem, - const siz_t elem_size, - array_t* restrict array + siz_t num_elem, + siz_t elem_size, + array_t* array ); void bli_array_resize ( - const siz_t num_elem_new, - array_t* restrict array + siz_t num_elem_new, + array_t* array ); void bli_array_finalize ( - array_t* restrict array + array_t* array ); void* bli_array_elem ( - const siz_t index, - array_t* restrict array + siz_t index, + const array_t* array ); void bli_array_set_elem ( - void* restrict elem, - const siz_t index, - array_t* restrict array + void* elem, + siz_t index, + array_t* array ); #endif diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h index d8c6cbb13f..e1e34c8816 100644 --- a/frame/base/bli_auxinfo.h +++ b/frame/base/bli_auxinfo.h @@ -38,47 +38,47 @@ // auxinfo_t field query -BLIS_INLINE pack_t bli_auxinfo_schema_a( auxinfo_t* ai ) +BLIS_INLINE pack_t bli_auxinfo_schema_a( const auxinfo_t* ai ) { return ai->schema_a; } -BLIS_INLINE pack_t bli_auxinfo_schema_b( auxinfo_t* ai ) +BLIS_INLINE pack_t bli_auxinfo_schema_b( const auxinfo_t* ai ) { return ai->schema_b; } -BLIS_INLINE void* bli_auxinfo_next_a( auxinfo_t* ai ) +BLIS_INLINE 
void* bli_auxinfo_next_a( const auxinfo_t* ai ) { return ai->a_next; } -BLIS_INLINE void* bli_auxinfo_next_b( auxinfo_t* ai ) +BLIS_INLINE void* bli_auxinfo_next_b( const auxinfo_t* ai ) { return ai->b_next; } -BLIS_INLINE inc_t bli_auxinfo_is_a( auxinfo_t* ai ) +BLIS_INLINE inc_t bli_auxinfo_is_a( const auxinfo_t* ai ) { return ai->is_a; } -BLIS_INLINE inc_t bli_auxinfo_is_b( auxinfo_t* ai ) +BLIS_INLINE inc_t bli_auxinfo_is_b( const auxinfo_t* ai ) { return ai->is_b; } -BLIS_INLINE inc_t bli_auxinfo_ps_a( auxinfo_t* ai ) +BLIS_INLINE inc_t bli_auxinfo_ps_a( const auxinfo_t* ai ) { return ai->ps_a; } -BLIS_INLINE inc_t bli_auxinfo_ps_b( auxinfo_t* ai ) +BLIS_INLINE inc_t bli_auxinfo_ps_b( const auxinfo_t* ai ) { return ai->ps_b; } -BLIS_INLINE void_fp bli_auxinfo_ukr( auxinfo_t* ai ) +BLIS_INLINE void_fp bli_auxinfo_ukr( const auxinfo_t* ai ) { return ai->ukr; } -BLIS_INLINE void* bli_auxinfo_params( auxinfo_t* ai ) +BLIS_INLINE void* bli_auxinfo_params( const auxinfo_t* ai ) { return ai->params; } diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 524653d743..8168bc2656 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -235,12 +235,12 @@ void bli_blksz_reduce_max_to dim_t bli_determine_blocksize ( - dir_t direct, - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dir_t direct, + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ) { if ( direct == BLIS_FWD ) @@ -251,15 +251,15 @@ dim_t bli_determine_blocksize dim_t bli_determine_blocksize_f ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ) { num_t dt; - blksz_t* bsize; + const blksz_t* bsize; dim_t b_alg, b_max; dim_t b_use; @@ -277,15 +277,15 @@ dim_t bli_determine_blocksize_f dim_t bli_determine_blocksize_b ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + 
bszid_t bszid, + const cntx_t* cntx ) { num_t dt; - blksz_t* bsize; + const blksz_t* bsize; dim_t b_alg, b_max; dim_t b_use; diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index 2e0fefeae9..63864a186b 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -36,8 +36,8 @@ BLIS_INLINE dim_t bli_blksz_get_def ( - num_t dt, - blksz_t* b + num_t dt, + const blksz_t* b ) { return b->v[ dt ]; @@ -45,8 +45,8 @@ BLIS_INLINE dim_t bli_blksz_get_def BLIS_INLINE dim_t bli_blksz_get_max ( - num_t dt, - blksz_t* b + num_t dt, + const blksz_t* b ) { return b->e[ dt ]; @@ -77,8 +77,8 @@ BLIS_INLINE void bli_blksz_set_max BLIS_INLINE void bli_blksz_copy ( - blksz_t* b_src, - blksz_t* b_dst + const blksz_t* b_src, + blksz_t* b_dst ) { *b_dst = *b_src; @@ -86,8 +86,8 @@ BLIS_INLINE void bli_blksz_copy BLIS_INLINE void bli_blksz_copy_if_pos ( - blksz_t* b_src, - blksz_t* b_dst + const blksz_t* b_src, + blksz_t* b_dst ) { // Copy the blocksize values over to b_dst one-by-one so that @@ -116,8 +116,8 @@ BLIS_INLINE void bli_blksz_copy_if_pos BLIS_INLINE void bli_blksz_copy_def_dt ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst + num_t dt_src, const blksz_t* b_src, + num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_def( dt_src, b_src ); @@ -127,8 +127,8 @@ BLIS_INLINE void bli_blksz_copy_def_dt BLIS_INLINE void bli_blksz_copy_max_dt ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst + num_t dt_src, const blksz_t* b_src, + num_t dt_dst, blksz_t* b_dst ) { const dim_t val = bli_blksz_get_max( dt_src, b_src ); @@ -138,8 +138,8 @@ BLIS_INLINE void bli_blksz_copy_max_dt BLIS_INLINE void bli_blksz_copy_dt ( - num_t dt_src, blksz_t* b_src, - num_t dt_dst, blksz_t* b_dst + num_t dt_src, const blksz_t* b_src, + num_t dt_dst, blksz_t* b_dst ) { bli_blksz_copy_def_dt( dt_src, b_src, dt_dst, b_dst ); @@ -252,30 +252,30 @@ void bli_blksz_reduce_max_to dim_t bli_determine_blocksize ( - dir_t direct, - dim_t i, - dim_t dim, - 
obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dir_t direct, + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ); dim_t bli_determine_blocksize_f ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ); dim_t bli_determine_blocksize_b ( - dim_t i, - dim_t dim, - obj_t* obj, - bszid_t bszid, - cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + bszid_t bszid, + const cntx_t* cntx ); dim_t bli_determine_blocksize_f_sub diff --git a/frame/base/bli_check.c b/frame/base/bli_check.c index e76314036f..16c418b49e 100644 --- a/frame/base/bli_check.c +++ b/frame/base/bli_check.c @@ -37,7 +37,7 @@ // -- General stuff ------------------------------------------------------------ -err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ) +err_t bli_check_error_code_helper( gint_t code, const char* file, guint_t line ) { if ( code == BLIS_SUCCESS ) return code; @@ -68,7 +68,7 @@ err_t bli_check_valid_error_level( errlev_t level ) return e_val; } -err_t bli_check_null_pointer( void* ptr ) +err_t bli_check_null_pointer( const void* ptr ) { err_t e_val = BLIS_SUCCESS; @@ -128,7 +128,7 @@ err_t bli_check_valid_diag( diag_t diag ) return e_val; } -err_t bli_check_nonunit_diag( obj_t* a ) +err_t bli_check_nonunit_diag( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -155,7 +155,7 @@ err_t bli_check_valid_datatype( num_t dt ) return e_val; } -err_t bli_check_object_valid_datatype( obj_t* a ) +err_t bli_check_object_valid_datatype( const obj_t* a ) { err_t e_val; num_t dt; @@ -176,7 +176,7 @@ err_t bli_check_noninteger_datatype( num_t dt ) return e_val; } -err_t bli_check_noninteger_object( obj_t* a ) +err_t bli_check_noninteger_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -197,7 +197,7 @@ err_t bli_check_nonconstant_datatype( num_t dt ) return e_val; } -err_t bli_check_nonconstant_object( obj_t* a ) +err_t 
bli_check_nonconstant_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -221,7 +221,7 @@ err_t bli_check_floating_datatype( num_t dt ) return e_val; } -err_t bli_check_floating_object( obj_t* a ) +err_t bli_check_floating_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -243,7 +243,7 @@ err_t bli_check_real_datatype( num_t dt ) return e_val; } -err_t bli_check_real_object( obj_t* a ) +err_t bli_check_real_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -264,7 +264,7 @@ err_t bli_check_integer_datatype( num_t dt ) return e_val; } -err_t bli_check_integer_object( obj_t* a ) +err_t bli_check_integer_object( const obj_t* a ) { err_t e_val; num_t dt; @@ -287,7 +287,7 @@ err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ) return e_val; } -err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ) +err_t bli_check_consistent_object_datatypes( const obj_t* a, const obj_t* b ) { err_t e_val; num_t dt_a; @@ -315,7 +315,7 @@ err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ) return e_val; } -err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ) +err_t bli_check_object_real_proj_of( const obj_t* c, const obj_t* r ) { err_t e_val; num_t dt_c; @@ -329,7 +329,7 @@ err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ) return e_val; } -err_t bli_check_real_valued_object( obj_t* a ) +err_t bli_check_real_valued_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; double a_real; @@ -363,7 +363,7 @@ err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ) return e_val; } -err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ) +err_t bli_check_consistent_object_precisions( const obj_t* a, const obj_t* b ) { err_t e_val; num_t dt_a; @@ -379,7 +379,7 @@ err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ) // -- Dimension-related checks ------------------------------------------------- -err_t bli_check_conformal_dims( obj_t* a, obj_t* b ) +err_t bli_check_conformal_dims( const obj_t* a, const obj_t* b ) 
{ err_t e_val = BLIS_SUCCESS; dim_t m_a, n_a; @@ -396,7 +396,7 @@ err_t bli_check_conformal_dims( obj_t* a, obj_t* b ) return e_val; } -err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ) +err_t bli_check_level3_dims( const obj_t* a, const obj_t* b, const obj_t* c ) { err_t e_val = BLIS_SUCCESS; dim_t m_c, n_c; @@ -420,7 +420,7 @@ err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ) return e_val; } -err_t bli_check_scalar_object( obj_t* a ) +err_t bli_check_scalar_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -435,7 +435,7 @@ err_t bli_check_scalar_object( obj_t* a ) return e_val; } -err_t bli_check_vector_object( obj_t* a ) +err_t bli_check_vector_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -449,7 +449,7 @@ err_t bli_check_vector_object( obj_t* a ) return e_val; } -err_t bli_check_matrix_object( obj_t* a ) +err_t bli_check_matrix_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -460,7 +460,7 @@ err_t bli_check_matrix_object( obj_t* a ) return e_val; } -err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ) +err_t bli_check_equal_vector_lengths( const obj_t* x, const obj_t* y ) { err_t e_val = BLIS_SUCCESS; dim_t dim_x; @@ -475,7 +475,7 @@ err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ) return e_val; } -err_t bli_check_square_object( obj_t* a ) +err_t bli_check_square_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -485,7 +485,7 @@ err_t bli_check_square_object( obj_t* a ) return e_val; } -err_t bli_check_object_length_equals( obj_t* a, dim_t m ) +err_t bli_check_object_length_equals( const obj_t* a, dim_t m ) { err_t e_val = BLIS_SUCCESS; @@ -495,7 +495,7 @@ err_t bli_check_object_length_equals( obj_t* a, dim_t m ) return e_val; } -err_t bli_check_object_width_equals( obj_t* a, dim_t n ) +err_t bli_check_object_width_equals( const obj_t* a, dim_t n ) { err_t e_val = BLIS_SUCCESS; @@ -505,7 +505,7 @@ err_t bli_check_object_width_equals( obj_t* a, dim_t n ) return e_val; } -err_t 
bli_check_vector_dim_equals( obj_t* a, dim_t n ) +err_t bli_check_vector_dim_equals( const obj_t* a, dim_t n ) { err_t e_val = BLIS_SUCCESS; @@ -515,7 +515,7 @@ err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ) return e_val; } -err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ) +err_t bli_check_object_diag_offset_equals( const obj_t* a, doff_t offset ) { err_t e_val = BLIS_SUCCESS; @@ -612,7 +612,7 @@ err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ) // -- Structure-related checks ------------------------------------------------- -err_t bli_check_general_object( obj_t* a ) +err_t bli_check_general_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -622,7 +622,7 @@ err_t bli_check_general_object( obj_t* a ) return e_val; } -err_t bli_check_hermitian_object( obj_t* a ) +err_t bli_check_hermitian_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -632,7 +632,7 @@ err_t bli_check_hermitian_object( obj_t* a ) return e_val; } -err_t bli_check_symmetric_object( obj_t* a ) +err_t bli_check_symmetric_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -642,7 +642,7 @@ err_t bli_check_symmetric_object( obj_t* a ) return e_val; } -err_t bli_check_triangular_object( obj_t* a ) +err_t bli_check_triangular_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -652,7 +652,7 @@ err_t bli_check_triangular_object( obj_t* a ) return e_val; } -err_t bli_check_object_struc( obj_t* a, struc_t struc ) +err_t bli_check_object_struc( const obj_t* a, struc_t struc ) { err_t e_val = BLIS_SUCCESS; @@ -666,7 +666,7 @@ err_t bli_check_object_struc( obj_t* a, struc_t struc ) // -- Storage-related checks --------------------------------------------------- -err_t bli_check_upper_or_lower_object( obj_t* a ) +err_t bli_check_upper_or_lower_object( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -731,7 +731,7 @@ err_t bli_check_valid_3x3_subpart( subpart_t part ) // -- Control tree-related checks 
---------------------------------------------- -err_t bli_check_valid_cntl( void* cntl ) +err_t bli_check_valid_cntl( const void* cntl ) { err_t e_val = BLIS_SUCCESS; @@ -743,7 +743,7 @@ err_t bli_check_valid_cntl( void* cntl ) // -- Packing-related checks --------------------------------------------------- -err_t bli_check_packm_schema_on_unpack( obj_t* a ) +err_t bli_check_packm_schema_on_unpack( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -756,7 +756,7 @@ err_t bli_check_packm_schema_on_unpack( obj_t* a ) return e_val; } -err_t bli_check_packv_schema_on_unpack( obj_t* a ) +err_t bli_check_packv_schema_on_unpack( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -768,7 +768,7 @@ err_t bli_check_packv_schema_on_unpack( obj_t* a ) // -- Buffer-related checks ---------------------------------------------------- -err_t bli_check_object_buffer( obj_t* a ) +err_t bli_check_object_buffer( const obj_t* a ) { err_t e_val = BLIS_SUCCESS; @@ -783,7 +783,7 @@ err_t bli_check_object_buffer( obj_t* a ) // -- Memory checks ------------------------------------------------------------ -err_t bli_check_valid_malloc_buf( void* ptr ) +err_t bli_check_valid_malloc_buf( const void* ptr ) { err_t e_val = BLIS_SUCCESS; @@ -809,7 +809,7 @@ err_t bli_check_valid_packbuf( packbuf_t buf_type ) return e_val; } -err_t bli_check_if_exhausted_pool( pool_t* pool ) +err_t bli_check_if_exhausted_pool( const pool_t* pool ) { err_t e_val = BLIS_SUCCESS; @@ -819,7 +819,7 @@ err_t bli_check_if_exhausted_pool( pool_t* pool ) return e_val; } -err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ) +err_t bli_check_sufficient_stack_buf_size( const cntx_t* cntx ) { err_t e_val = BLIS_SUCCESS; num_t dt; @@ -873,7 +873,7 @@ err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ) // -- Object-related errors ---------------------------------------------------- -err_t bli_check_object_alias_of( obj_t* a, obj_t* b ) +err_t bli_check_object_alias_of( const obj_t* a, const obj_t* b ) { err_t 
e_val = BLIS_SUCCESS; @@ -895,7 +895,7 @@ err_t bli_check_valid_arch_id( arch_t id ) return e_val; } -err_t bli_check_initialized_gks_cntx( cntx_t** cntx ) +err_t bli_check_initialized_gks_cntx( const cntx_t* const * cntx ) { err_t e_val = BLIS_SUCCESS; @@ -907,7 +907,7 @@ err_t bli_check_initialized_gks_cntx( cntx_t** cntx ) // -- Architecture-related errors ---------------------------------------------- -err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ) +err_t bli_check_valid_mc_mod_mult( const blksz_t* mc, const blksz_t* mr ) { num_t dt; @@ -924,7 +924,7 @@ err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ) return BLIS_SUCCESS; } -err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ) +err_t bli_check_valid_nc_mod_mult( const blksz_t* nc, const blksz_t* nr ) { num_t dt; @@ -941,7 +941,7 @@ err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ) return BLIS_SUCCESS; } -err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ) +err_t bli_check_valid_kc_mod_mult( const blksz_t* kc, const blksz_t* kr ) { num_t dt; diff --git a/frame/base/bli_check.h b/frame/base/bli_check.h index 276d276897..f1e2201a7e 100644 --- a/frame/base/bli_check.h +++ b/frame/base/bli_check.h @@ -34,85 +34,85 @@ */ -BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, char* file, guint_t line ); +BLIS_EXPORT_BLIS err_t bli_check_error_code_helper( gint_t code, const char* file, guint_t line ); err_t bli_check_valid_error_level( errlev_t level ); -err_t bli_check_null_pointer( void* ptr ); +err_t bli_check_null_pointer( const void* ptr ); err_t bli_check_valid_side( side_t side ); err_t bli_check_valid_uplo( uplo_t uplo ); err_t bli_check_valid_trans( trans_t trans ); err_t bli_check_valid_diag( diag_t diag ); -err_t bli_check_nonunit_diag( obj_t* a ); +err_t bli_check_nonunit_diag( const obj_t* a ); err_t bli_check_valid_datatype( num_t dt ); -err_t bli_check_object_valid_datatype( obj_t* a ); +err_t bli_check_object_valid_datatype( const 
obj_t* a ); err_t bli_check_noninteger_datatype( num_t dt ); -err_t bli_check_noninteger_object( obj_t* a ); +err_t bli_check_noninteger_object( const obj_t* a ); err_t bli_check_nonconstant_datatype( num_t dt ); -err_t bli_check_nonconstant_object( obj_t* a ); +err_t bli_check_nonconstant_object( const obj_t* a ); err_t bli_check_floating_datatype( num_t dt ); -err_t bli_check_floating_object( obj_t* a ); +err_t bli_check_floating_object( const obj_t* a ); err_t bli_check_real_datatype( num_t dt ); -err_t bli_check_real_object( obj_t* a ); +err_t bli_check_real_object( const obj_t* a ); err_t bli_check_integer_datatype( num_t dt ); -err_t bli_check_integer_object( obj_t* a ); +err_t bli_check_integer_object( const obj_t* a ); err_t bli_check_consistent_datatypes( num_t dt_a, num_t dt_b ); -err_t bli_check_consistent_object_datatypes( obj_t* a, obj_t* b ); +err_t bli_check_consistent_object_datatypes( const obj_t* a, const obj_t* b ); err_t bli_check_datatype_real_proj_of( num_t dt_c, num_t dt_r ); -err_t bli_check_object_real_proj_of( obj_t* c, obj_t* r ); -err_t bli_check_real_valued_object( obj_t* a ); +err_t bli_check_object_real_proj_of( const obj_t* c, const obj_t* r ); +err_t bli_check_real_valued_object( const obj_t* a ); err_t bli_check_consistent_precisions( num_t dt_a, num_t dt_b ); -err_t bli_check_consistent_object_precisions( obj_t* a, obj_t* b ); - -err_t bli_check_conformal_dims( obj_t* a, obj_t* b ); -err_t bli_check_level3_dims( obj_t* a, obj_t* b, obj_t* c ); -err_t bli_check_scalar_object( obj_t* a ); -err_t bli_check_vector_object( obj_t* a ); -err_t bli_check_matrix_object( obj_t* a ); -err_t bli_check_equal_vector_lengths( obj_t* x, obj_t* y ); -err_t bli_check_square_object( obj_t* a ); -err_t bli_check_object_length_equals( obj_t* a, dim_t m ); -err_t bli_check_object_width_equals( obj_t* a, dim_t n ); -err_t bli_check_vector_dim_equals( obj_t* a, dim_t n ); -err_t bli_check_object_diag_offset_equals( obj_t* a, doff_t offset ); +err_t 
bli_check_consistent_object_precisions( const obj_t* a, const obj_t* b ); + +err_t bli_check_conformal_dims( const obj_t* a, const obj_t* b ); +err_t bli_check_level3_dims( const obj_t* a, const obj_t* b, const obj_t* c ); +err_t bli_check_scalar_object( const obj_t* a ); +err_t bli_check_vector_object( const obj_t* a ); +err_t bli_check_matrix_object( const obj_t* a ); +err_t bli_check_equal_vector_lengths( const obj_t* x, const obj_t* y ); +err_t bli_check_square_object( const obj_t* a ); +err_t bli_check_object_length_equals( const obj_t* a, dim_t m ); +err_t bli_check_object_width_equals( const obj_t* a, dim_t n ); +err_t bli_check_vector_dim_equals( const obj_t* a, dim_t n ); +err_t bli_check_object_diag_offset_equals( const obj_t* a, doff_t offset ); err_t bli_check_matrix_strides( dim_t m, dim_t n, inc_t rs, inc_t cs, inc_t is ); -err_t bli_check_general_object( obj_t* a ); -err_t bli_check_hermitian_object( obj_t* a ); -err_t bli_check_symmetric_object( obj_t* a ); -err_t bli_check_triangular_object( obj_t* a ); -err_t bli_check_object_struc( obj_t* a, struc_t struc ); +err_t bli_check_general_object( const obj_t* a ); +err_t bli_check_hermitian_object( const obj_t* a ); +err_t bli_check_symmetric_object( const obj_t* a ); +err_t bli_check_triangular_object( const obj_t* a ); +err_t bli_check_object_struc( const obj_t* a, struc_t struc ); -err_t bli_check_upper_or_lower_object( obj_t* a ); +err_t bli_check_upper_or_lower_object( const obj_t* a ); err_t bli_check_valid_3x1_subpart( subpart_t part ); err_t bli_check_valid_1x3_subpart( subpart_t part ); err_t bli_check_valid_3x3_subpart( subpart_t part ); -err_t bli_check_valid_cntl( void* cntl ); +err_t bli_check_valid_cntl( const void* cntl ); -err_t bli_check_packm_schema_on_unpack( obj_t* a ); -err_t bli_check_packv_schema_on_unpack( obj_t* a ); +err_t bli_check_packm_schema_on_unpack( const obj_t* a ); +err_t bli_check_packv_schema_on_unpack( const obj_t* a ); -err_t bli_check_object_buffer( obj_t* a ); 
+err_t bli_check_object_buffer( const obj_t* a ); -err_t bli_check_valid_malloc_buf( void* ptr ); +err_t bli_check_valid_malloc_buf( const void* ptr ); err_t bli_check_valid_packbuf( packbuf_t buf_type ); -err_t bli_check_if_exhausted_pool( pool_t* pool ); -err_t bli_check_sufficient_stack_buf_size( cntx_t* cntx ); +err_t bli_check_if_exhausted_pool( const pool_t* pool ); +err_t bli_check_sufficient_stack_buf_size( const cntx_t* cntx ); err_t bli_check_alignment_is_power_of_two( size_t align_size ); err_t bli_check_alignment_is_mult_of_ptr_size( size_t align_size ); -err_t bli_check_object_alias_of( obj_t* a, obj_t* b ); +err_t bli_check_object_alias_of( const obj_t* a, const obj_t* b ); err_t bli_check_valid_arch_id( arch_t id ); -err_t bli_check_initialized_gks_cntx( cntx_t** cntx ); +err_t bli_check_initialized_gks_cntx( const cntx_t* const * cntx ); -err_t bli_check_valid_mc_mod_mult( blksz_t* mc, blksz_t* mr ); -err_t bli_check_valid_nc_mod_mult( blksz_t* nc, blksz_t* nr ); -err_t bli_check_valid_kc_mod_mult( blksz_t* kc, blksz_t* kr ); +err_t bli_check_valid_mc_mod_mult( const blksz_t* mc, const blksz_t* mr ); +err_t bli_check_valid_nc_mod_mult( const blksz_t* nc, const blksz_t* nr ); +err_t bli_check_valid_kc_mod_mult( const blksz_t* kc, const blksz_t* kr ); diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index f8846198f1..b22ddbee0b 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -349,8 +349,8 @@ void bli_cntl_mark_family dim_t bli_cntl_calc_num_threads_in ( - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { dim_t n_threads_in = 1; diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 67dd02f0c1..406a350eec 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -119,45 +119,45 @@ BLIS_EXPORT_BLIS void bli_cntl_mark_family dim_t bli_cntl_calc_num_threads_in ( - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); // 
----------------------------------------------------------------------------- // cntl_t query (fields only) -BLIS_INLINE opid_t bli_cntl_family( cntl_t* cntl ) +BLIS_INLINE opid_t bli_cntl_family( const cntl_t* cntl ) { return cntl->family; } -BLIS_INLINE bszid_t bli_cntl_bszid( cntl_t* cntl ) +BLIS_INLINE bszid_t bli_cntl_bszid( const cntl_t* cntl ) { return cntl->bszid; } -BLIS_INLINE void_fp bli_cntl_var_func( cntl_t* cntl ) +BLIS_INLINE void_fp bli_cntl_var_func( const cntl_t* cntl ) { return cntl->var_func; } -BLIS_INLINE cntl_t* bli_cntl_sub_prenode( cntl_t* cntl ) +BLIS_INLINE cntl_t* bli_cntl_sub_prenode( const cntl_t* cntl ) { return cntl->sub_prenode; } -BLIS_INLINE cntl_t* bli_cntl_sub_node( cntl_t* cntl ) +BLIS_INLINE cntl_t* bli_cntl_sub_node( const cntl_t* cntl ) { return cntl->sub_node; } -BLIS_INLINE void* bli_cntl_params( cntl_t* cntl ) +BLIS_INLINE void* bli_cntl_params( const cntl_t* cntl ) { return cntl->params; } -BLIS_INLINE uint64_t bli_cntl_params_size( cntl_t* cntl ) +BLIS_INLINE uint64_t bli_cntl_params_size( const cntl_t* cntl ) { // The first 64 bytes is always the size of the params structure. return *( ( uint64_t* )(cntl->params) ); @@ -170,19 +170,19 @@ BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) // cntl_t query (complex) -BLIS_INLINE bool bli_cntl_is_null( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_is_null( const cntl_t* cntl ) { return ( bool ) ( cntl == NULL ); } -BLIS_INLINE bool bli_cntl_is_leaf( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_is_leaf( const cntl_t* cntl ) { return ( bool ) ( bli_cntl_sub_node( cntl ) == NULL ); } -BLIS_INLINE bool bli_cntl_does_part( cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_does_part( const cntl_t* cntl ) { return ( bool ) ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 5ce04b5025..bfc82d1f9e 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -70,8 +70,8 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... 
) // Query the context for the addresses of: // - the blocksize object array // - the blocksize multiple array - blksz_t* cntx_blkszs = bli_cntx_blkszs_buf( cntx ); - bszid_t* cntx_bmults = bli_cntx_bmults_buf( cntx ); + blksz_t* cntx_blkszs = cntx->blkszs; + bszid_t* cntx_bmults = cntx->bmults; // Initialize variable argument environment. va_list args; @@ -165,7 +165,7 @@ void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* cntx, ... ) // Query the context for the blksz_t object assoicated with the // current blocksize id, and also query the object corresponding // to the blocksize multiple. - blksz_t* cntx_blksz = bli_cntx_get_blksz( bs_id, cntx ); + blksz_t* cntx_blksz = ( blksz_t* )bli_cntx_get_blksz( bs_id, cntx ); // Copy the real domain value of the blksz_t object into the // corresponding complex domain slot of the same object. @@ -218,7 +218,7 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) */ // Query the context for the address of the ukernel func_t array - func_t* cntx_ukrs = bli_cntx_ukrs_buf( cntx ); + func_t* cntx_ukrs = cntx->ukrs; // Initialize variable argument environment. va_list args; @@ -297,7 +297,7 @@ void bli_cntx_set_ukr_prefs( cntx_t* cntx , ... ) */ // Query the context for the address of the ukernel preference mbool_t array - mbool_t* cntx_ukr_prefs = bli_cntx_ukr_prefs_buf( cntx ); + mbool_t* cntx_ukr_prefs = cntx->ukr_prefs; // Initialize variable argument environment. va_list args; @@ -355,7 +355,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ) */ // Query the context for the address of the l3 sup handlers array. - void_fp* cntx_l3_sup_handlers = bli_cntx_l3_sup_handlers_buf( cntx ); + void_fp* cntx_l3_sup_handlers = cntx->l3_sup_handlers; // Initialize variable argument environment. va_list args; @@ -386,7 +386,7 @@ void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... 
) // ----------------------------------------------------------------------------- -void bli_cntx_print( cntx_t* cntx ) +void bli_cntx_print( const cntx_t* cntx ) { dim_t i; @@ -410,7 +410,7 @@ void bli_cntx_print( cntx_t* cntx ) for ( i = 0; i < BLIS_NUM_UKRS; ++i ) { - func_t* ukr = bli_cntx_get_ukrs( i, cntx ); + const func_t* ukr = bli_cntx_get_ukrs( i, cntx ); printf( "ukr %2lu: %16p %16p %16p %16p\n", ( unsigned long )i, @@ -423,7 +423,7 @@ void bli_cntx_print( cntx_t* cntx ) for ( i = 0; i < BLIS_NUM_UKR_PREFS; ++i ) { - mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx ); + const mbool_t* ukr_pref = bli_cntx_get_ukr_prefs( i, cntx ); printf( "ukr pref %2lu: %d %d %d %d\n", ( unsigned long )i, diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 412430e9b2..3af16a7c51 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -62,27 +62,7 @@ typedef struct cntx_s // -- cntx_t query (fields only) ----------------------------------------------- // -BLIS_INLINE blksz_t* bli_cntx_blkszs_buf( cntx_t* cntx ) -{ - return cntx->blkszs; -} -BLIS_INLINE bszid_t* bli_cntx_bmults_buf( cntx_t* cntx ) -{ - return cntx->bmults; -} -BLIS_INLINE func_t* bli_cntx_ukrs_buf( cntx_t* cntx ) -{ - return cntx->ukrs; -} -BLIS_INLINE mbool_t* bli_cntx_ukr_prefs_buf( cntx_t* cntx ) -{ - return cntx->ukr_prefs; -} -BLIS_INLINE void_fp* bli_cntx_l3_sup_handlers_buf( cntx_t* cntx ) -{ - return cntx->l3_sup_handlers; -} -BLIS_INLINE ind_t bli_cntx_method( cntx_t* cntx ) +BLIS_INLINE ind_t bli_cntx_method( const cntx_t* cntx ) { return cntx->method; } @@ -104,75 +84,66 @@ BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) // -- cntx_t query (complex) --------------------------------------------------- // -BLIS_INLINE blksz_t* bli_cntx_get_blksz( bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE const blksz_t* bli_cntx_get_blksz( bszid_t bs_id, const cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - // 
Return the address of the blksz_t identified by bs_id. - return blksz; + return &cntx->blkszs[ bs_id ]; } -BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { - blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_def( dt, blksz ); + const blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); + dim_t bs_dt = bli_blksz_get_def( dt, blksz ); // Return the main (default) blocksize value for the datatype given. return bs_dt; } -BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { - blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); - dim_t bs_dt = bli_blksz_get_max( dt, blksz ); + const blksz_t* blksz = bli_cntx_get_blksz( bs_id, cntx ); + dim_t bs_dt = bli_blksz_get_max( dt, blksz ); // Return the auxiliary (maximum) blocksize value for the datatype given. 
return bs_dt; } -BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, const cntx_t* cntx ) { - bszid_t* restrict bmults = bli_cntx_bmults_buf( cntx ); - bszid_t bm_id = bmults[ bs_id ]; - - return bm_id; + return cntx->bmults[ bs_id ]; } -BLIS_INLINE blksz_t* bli_cntx_get_bmult( bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE const blksz_t* bli_cntx_get_bmult( bszid_t bs_id, const cntx_t* cntx ) { - bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); - blksz_t* restrict bmult = bli_cntx_get_blksz( bm_id, cntx ); + bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); + const blksz_t* bmult = bli_cntx_get_blksz( bm_id, cntx ); return bmult; } -BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { - blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); - dim_t bm_dt = bli_blksz_get_def( dt, bmult ); + const blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); + dim_t bm_dt = bli_blksz_get_def( dt, bmult ); return bm_dt; } // ----------------------------------------------------------------------------- -BLIS_INLINE func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE const func_t* bli_cntx_get_ukrs( ukr_t ukr_id, const cntx_t* cntx ) { - func_t* funcs = bli_cntx_ukrs_buf( cntx ); - func_t* func = &funcs[ ukr_id ]; - - return func; + return &cntx->ukrs[ ukr_id ]; } -BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) { - func_t* func = bli_cntx_get_ukrs( ukr_id, cntx ); + const func_t* func = bli_cntx_get_ukrs( ukr_id, cntx ); return bli_func_get_dt( dt, func ); } -BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) { switch ( 
ukr_id ) { @@ -189,24 +160,21 @@ BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, cntx_t* // ----------------------------------------------------------------------------- -BLIS_INLINE mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t ukr_id, cntx_t* cntx ) +BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t pref_id, const cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); - mbool_t* mbool = &mbools[ ukr_id ]; - - return mbool; + return &cntx->ukr_prefs[ pref_id ]; } -BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_get_ukr_prefs_dt( num_t dt, ukr_pref_t ukr_id, const cntx_t* cntx ) { - mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx ); + const mbool_t* mbool = bli_cntx_get_ukr_prefs( ukr_id, cntx ); return ( bool )bli_mbool_get_dt( dt, mbool ); } // ----------------------------------------------------------------------------- -BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_t k, const cntx_t* cntx ) { if ( m < bli_cntx_get_blksz_def_dt( dt, BLIS_MT, cntx ) ) return TRUE; if ( n < bli_cntx_get_blksz_def_dt( dt, BLIS_NT, cntx ) ) return TRUE; @@ -217,17 +185,14 @@ BLIS_INLINE bool bli_cntx_l3_sup_thresh_is_met( num_t dt, dim_t m, dim_t n, dim_ // ----------------------------------------------------------------------------- -BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, const cntx_t* cntx ) { - void_fp* funcs = bli_cntx_l3_sup_handlers_buf( cntx ); - void_fp func = funcs[ op ]; - - return func; + return cntx->l3_sup_handlers[ op ]; } // ----------------------------------------------------------------------------- -BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool 
bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) { // This initial value will get overwritten during the switch statement below. ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; @@ -275,12 +240,12 @@ BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, cntx_t* c return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx ); } -BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) { return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) { const bool ukr_prefers_rows = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); @@ -291,7 +256,7 @@ BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* return FALSE; } -BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) { return ! 
bli_cntx_prefers_storage_of( obj, ukr_id, cntx ); } @@ -307,58 +272,43 @@ BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, cntx_t* BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - bszid_t* bmults = bli_cntx_bmults_buf( cntx ); - - blkszs[ bs_id ] = *blksz; - bmults[ bs_id ] = mult_id; + cntx->blkszs[ bs_id ] = *blksz; + cntx->bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - bli_blksz_set_def( bs, dt, blksz ); + bli_blksz_set_def( bs, dt, &cntx->blkszs[ bs_id ] ); } BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) { - blksz_t* blkszs = bli_cntx_blkszs_buf( cntx ); - blksz_t* blksz = &blkszs[ bs_id ]; - - bli_blksz_set_max( bs, dt, blksz ); + bli_blksz_set_max( bs, dt, &cntx->blkszs[ bs_id ]); } BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, func_t* func, cntx_t* cntx ) { - func_t* funcs = bli_cntx_ukrs_buf( cntx ); - - funcs[ ukr_id ] = *func; + cntx->ukrs[ ukr_id ] = *func; } BLIS_INLINE void bli_cntx_set_ukr_dt( void_fp fp, num_t dt, ukr_t ker_id, cntx_t* cntx ) { - func_t* func = bli_cntx_get_ukrs( ker_id, cntx ); - - bli_func_set_dt( fp, dt, func ); + bli_func_set_dt( fp, dt, &cntx->ukrs[ ker_id ] ); } BLIS_INLINE void bli_cntx_set_ukr_pref( ukr_pref_t ukr_id, mbool_t* prefs, cntx_t* cntx ) { - mbool_t* mbools = bli_cntx_ukr_prefs_buf( cntx ); - - mbools[ ukr_id ] = *prefs; + cntx->ukr_prefs[ ukr_id ] = *prefs; } -BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, cntx_t* cntx ) +BLIS_INLINE void_fp bli_cntx_get_l3_sup_ker_dt( num_t dt, stor3_t stor_id, const cntx_t* cntx ) { ukr_t ukr_id = bli_stor3_ukr( stor_id ); return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); } -BLIS_INLINE dim_t 
bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { switch ( bs_id ) { @@ -374,7 +324,7 @@ BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_def_dt( num_t dt, bszid_t bs_id, cnt return bli_cntx_get_blksz_def_dt( dt, bs_id, cntx ); } -BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, cntx_t* cntx ) +BLIS_INLINE dim_t bli_cntx_get_l3_sup_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) { switch ( bs_id ) { @@ -403,7 +353,7 @@ BLIS_EXPORT_BLIS void bli_cntx_set_ind_blkszs( ind_t method, num_t dt, cntx_t* c BLIS_EXPORT_BLIS void bli_cntx_set_ukrs( cntx_t* cntx, ... ); BLIS_EXPORT_BLIS void bli_cntx_set_ukr_prefs( cntx_t* cntx, ... ); -BLIS_EXPORT_BLIS void bli_cntx_print( cntx_t* cntx ); +BLIS_EXPORT_BLIS void bli_cntx_print( const cntx_t* cntx ); BLIS_EXPORT_BLIS void bli_cntx_set_l3_sup_handlers( cntx_t* cntx, ... ); diff --git a/frame/base/bli_const.c b/frame/base/bli_const.c index f20bc84473..210d6ae77d 100644 --- a/frame/base/bli_const.c +++ b/frame/base/bli_const.c @@ -44,11 +44,11 @@ static constdata_t bli_mtwo_buffer = bli_obj_init_constdata( -2.0 ); // Statically initialize global scalar constants, attaching the addresses // of the corresponding structs above. 
-obj_t BLIS_TWO = bli_obj_init_const( &bli_two_buffer ); -obj_t BLIS_ONE = bli_obj_init_const( &bli_one_buffer ); -obj_t BLIS_ZERO = bli_obj_init_const( &bli_zero_buffer ); -obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer ); -obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer ); +const obj_t BLIS_TWO = bli_obj_init_const( &bli_two_buffer ); +const obj_t BLIS_ONE = bli_obj_init_const( &bli_one_buffer ); +const obj_t BLIS_ZERO = bli_obj_init_const( &bli_zero_buffer ); +const obj_t BLIS_MINUS_ONE = bli_obj_init_const( &bli_mone_buffer ); +const obj_t BLIS_MINUS_TWO = bli_obj_init_const( &bli_mtwo_buffer ); #if 0 obj_t BLIS_TWO = {}; diff --git a/frame/base/bli_env.c b/frame/base/bli_env.c index 92aba69700..a4d54e4e7a 100644 --- a/frame/base/bli_env.c +++ b/frame/base/bli_env.c @@ -68,7 +68,7 @@ gint_t bli_env_get_var( const char* env, gint_t fallback ) { gint_t r_val; - char* str; + const char* str; // Query the environment variable and store the result in str. str = getenv( env ); diff --git a/frame/base/bli_error.c b/frame/base/bli_error.c index 37add3b674..f4933d9629 100644 --- a/frame/base/bli_error.c +++ b/frame/base/bli_error.c @@ -36,7 +36,7 @@ #include "blis.h" // Internal array to hold error strings. 
-static char *bli_error_string[-BLIS_ERROR_CODE_MAX] = +static const char *bli_error_string[-BLIS_ERROR_CODE_MAX] = { [-BLIS_INVALID_ERROR_CHECKING_LEVEL] = "Invalid error checking level.", [-BLIS_UNDEFINED_ERROR_CODE] = "Undefined error code.", @@ -116,7 +116,7 @@ static char *bli_error_string[-BLIS_ERROR_CODE_MAX] = // ----------------------------------------------------------------------------- -void bli_print_msg( char* str, char* file, guint_t line ) +void bli_print_msg( const char* str, const char* file, guint_t line ) { fprintf( stderr, "\n" ); fprintf( stderr, "libblis: %s (line %lu):\n", file, ( long unsigned int )line ); @@ -156,7 +156,7 @@ bool bli_error_checking_is_enabled( void ) return bli_error_checking_level() != BLIS_NO_ERROR_CHECKING; } -char* bli_error_string_for_code( gint_t code ) +const char* bli_error_string_for_code( gint_t code ) { return bli_error_string[-code]; } diff --git a/frame/base/bli_error.h b/frame/base/bli_error.h index e6e6f35dde..f3037e2c21 100644 --- a/frame/base/bli_error.h +++ b/frame/base/bli_error.h @@ -39,8 +39,8 @@ BLIS_EXPORT_BLIS void bli_error_checking_level_set( errlev_t new_level ); BLIS_EXPORT_BLIS bool bli_error_checking_is_enabled( void ); -void bli_print_msg( char* str, char* file, guint_t line ); +void bli_print_msg( const char* str, const char* file, guint_t line ); BLIS_EXPORT_BLIS void bli_abort( void ); -char* bli_error_string_for_code( gint_t code ); +const char* bli_error_string_for_code( gint_t code ); diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c index 477710ff00..7b462cd850 100644 --- a/frame/base/bli_func.c +++ b/frame/base/bli_func.c @@ -93,13 +93,13 @@ void bli_func_free( func_t* f ) // ----------------------------------------------------------------------------- -bool bli_func_is_null_dt( num_t dt, - func_t* f ) +bool bli_func_is_null_dt( num_t dt, + const func_t* f ) { return ( bli_func_get_dt( dt, f ) == NULL ); } -bool bli_func_is_null( func_t* f ) +bool bli_func_is_null( const 
func_t* f ) { bool r_val = TRUE; num_t dt; diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h index 7bdd1ab10e..9094d56f86 100644 --- a/frame/base/bli_func.h +++ b/frame/base/bli_func.h @@ -38,8 +38,8 @@ BLIS_INLINE void_fp bli_func_get_dt ( - num_t dt, - func_t* func + num_t dt, + const func_t* func ) { return func->ptr[ dt ]; @@ -59,8 +59,8 @@ BLIS_INLINE void bli_func_set_dt BLIS_INLINE void bli_func_copy_dt ( - num_t dt_src, func_t* func_src, - num_t dt_dst, func_t* func_dst + num_t dt_src, const func_t* func_src, + num_t dt_dst, func_t* func_dst ) { void_fp fp = bli_func_get_dt( dt_src, func_src ); @@ -97,6 +97,6 @@ void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- bool bli_func_is_null_dt( num_t dt, - func_t* f ); -bool bli_func_is_null( func_t* f ); + const func_t* f ); +bool bli_func_is_null( const func_t* f ); diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index 1372a055ab..f932650cb1 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -226,7 +226,7 @@ void bli_gks_finalize( void ) // Iterate over the architectures in the gks array. for ( id = 0; id < BLIS_NUM_ARCHS; ++id ) { - cntx_t** restrict gks_id = gks[ id ]; + cntx_t** gks_id = gks[ id ]; // Only consider context arrays for architectures that were allocated // in the first place. @@ -236,7 +236,7 @@ void bli_gks_finalize( void ) // referenced by cntx_pp. for ( ind = 0; ind < BLIS_NUM_IND_METHODS; ++ind ) { - cntx_t* restrict gks_id_ind = gks_id[ ind ]; + cntx_t* gks_id_ind = gks_id[ ind ]; // If the current context was allocated, free it. 
if ( gks_id_ind != NULL ) @@ -282,7 +282,7 @@ void bli_gks_init_index( void ) // ----------------------------------------------------------------------------- -cntx_t* bli_gks_lookup_nat_cntx +const cntx_t* bli_gks_lookup_nat_cntx ( arch_t id ) @@ -295,7 +295,7 @@ cntx_t* bli_gks_lookup_nat_cntx // ----------------------------------------------------------------------------- -cntx_t* bli_gks_lookup_ind_cntx +const cntx_t* bli_gks_lookup_ind_cntx ( arch_t id, ind_t ind @@ -325,7 +325,7 @@ cntx_t* bli_gks_lookup_ind_cntx // ----------------------------------------------------------------------------- -cntx_t** bli_gks_lookup_id +const cntx_t* const * bli_gks_lookup_id ( arch_t id ) @@ -336,10 +336,10 @@ cntx_t** bli_gks_lookup_id // initialized. // Index into the array of context pointers for the given architecture id. - cntx_t** restrict gks_id = gks[ id ]; + cntx_t** gks_id = gks[ id ]; // Return the context pointer at gks_id_ind. - return gks_id; + return ( const cntx_t* const * )gks_id; } // ----------------------------------------------------------------------------- @@ -440,12 +440,12 @@ void bli_gks_register_cntx // kernel is called. 
err_t e_val; - blksz_t* restrict mc = bli_cntx_get_blksz( BLIS_MC, gks_id_nat ); - blksz_t* restrict nc = bli_cntx_get_blksz( BLIS_NC, gks_id_nat ); - blksz_t* restrict kc = bli_cntx_get_blksz( BLIS_KC, gks_id_nat ); - blksz_t* restrict mr = bli_cntx_get_blksz( BLIS_MR, gks_id_nat ); - blksz_t* restrict nr = bli_cntx_get_blksz( BLIS_NR, gks_id_nat ); - blksz_t* restrict kr = bli_cntx_get_blksz( BLIS_KR, gks_id_nat ); + const blksz_t* mc = bli_cntx_get_blksz( BLIS_MC, gks_id_nat ); + const blksz_t* nc = bli_cntx_get_blksz( BLIS_NC, gks_id_nat ); + const blksz_t* kc = bli_cntx_get_blksz( BLIS_KC, gks_id_nat ); + const blksz_t* mr = bli_cntx_get_blksz( BLIS_MR, gks_id_nat ); + const blksz_t* nr = bli_cntx_get_blksz( BLIS_NR, gks_id_nat ); + const blksz_t* kr = bli_cntx_get_blksz( BLIS_KR, gks_id_nat ); e_val = bli_check_valid_mc_mod_mult( mc, mr ); bli_check_error_code( e_val ); e_val = bli_check_valid_nc_mod_mult( nc, nr ); bli_check_error_code( e_val ); @@ -463,12 +463,12 @@ void bli_gks_register_cntx // ----------------------------------------------------------------------------- -cntx_t* bli_gks_query_cntx( void ) +const cntx_t* bli_gks_query_cntx( void ) { return bli_gks_query_nat_cntx(); } -cntx_t* bli_gks_query_nat_cntx( void ) +const cntx_t* bli_gks_query_nat_cntx( void ) { bli_init_once(); @@ -480,14 +480,14 @@ cntx_t* bli_gks_query_nat_cntx( void ) arch_t id = bli_arch_query_id(); // Use the architecture id to look up a pointer to its context. - cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); + const cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); return cntx; } // ----------------------------------------------------------------------------- -cntx_t* bli_gks_query_cntx_noinit( void ) +const cntx_t* bli_gks_query_cntx_noinit( void ) { // This function is identical to bli_gks_query_cntx(), except that it // does not call bli_init_once(). 
@@ -496,7 +496,7 @@ cntx_t* bli_gks_query_cntx_noinit( void ) arch_t id = bli_arch_query_id(); // Use the architecture id to look up a pointer to its context. - cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); + const cntx_t* cntx = bli_gks_lookup_nat_cntx( id ); return cntx; } @@ -507,7 +507,7 @@ cntx_t* bli_gks_query_cntx_noinit( void ) // with a new entry corresponding to a context for an ind_t value. static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; -cntx_t* bli_gks_query_ind_cntx +const cntx_t* bli_gks_query_ind_cntx ( ind_t ind, num_t dt @@ -547,8 +547,8 @@ cntx_t* bli_gks_query_ind_cntx // Query the gks for the array of context pointers corresponding to the // given architecture id. - cntx_t** restrict gks_id = gks[ id ]; - cntx_t* restrict gks_id_nat = gks_id[ BLIS_NAT ]; + cntx_t** gks_id = gks[ id ]; + cntx_t* gks_id_nat = gks_id[ BLIS_NAT ]; // If for some reason the native context was requested, we can return // its address early. @@ -634,9 +634,9 @@ void bli_gks_init_ref_cntx bool bli_gks_cntx_l3_nat_ukr_is_ref ( - num_t dt, - ukr_t ukr_id, - cntx_t* cntx + num_t dt, + ukr_t ukr_id, + const cntx_t* cntx ) { cntx_t ref_cntx; @@ -658,7 +658,7 @@ bool bli_gks_cntx_l3_nat_ukr_is_ref // -- level-3 micro-kernel implementation strings ------------------------------ // -static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = +static const char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = { "refrnce", "virtual", @@ -668,15 +668,15 @@ static char* bli_gks_l3_ukr_impl_str[BLIS_NUM_UKR_IMPL_TYPES] = // ----------------------------------------------------------------------------- -char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) +const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) { kimpl_t ki; // Query the context for the current induced method and datatype, and // then query the ukernel function pointer for the given datatype from // that context. 
- cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); - void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); + const cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); + void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given // datatype. If it is NULL, return the string for not applicable. @@ -742,7 +742,7 @@ kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ) } // Query the native context from the gks. - cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); + const cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) ) return BLIS_REFERENCE_UKERNEL; diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index b8e4c4fe08..4a5c519880 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -40,24 +40,24 @@ void bli_gks_finalize( void ); void bli_gks_init_index( void ); -cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); -cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); -cntx_t** bli_gks_lookup_id( arch_t id ); -void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); +const cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); +const cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); +const cntx_t* const * bli_gks_lookup_id( arch_t id ); +void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_cntx( void ); -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_nat_cntx( void ); +BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_cntx( void ); +BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void ); -cntx_t* bli_gks_query_cntx_noinit( void ); +const cntx_t* bli_gks_query_cntx_noinit( void ); -BLIS_EXPORT_BLIS cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); +BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); -BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); +BLIS_EXPORT_BLIS void 
bli_gks_init_ref_cntx( cntx_t* cntx ); -bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, cntx_t* cntx ); +bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, const cntx_t* cntx ); -BLIS_EXPORT_BLIS char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); -BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ); //char* bli_gks_l3_ukr_avail_impl_string( ukr_t ukr, num_t dt ); diff --git a/frame/base/bli_ind.c b/frame/base/bli_ind.c index a359e89a38..7a94c656b6 100644 --- a/frame/base/bli_ind.c +++ b/frame/base/bli_ind.c @@ -34,7 +34,7 @@ #include "blis.h" -static char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = +static const char* bli_ind_impl_str[BLIS_NUM_IND_METHODS] = { /* 1m */ "1m", /* nat */ "native", @@ -46,7 +46,7 @@ void bli_ind_init( void ) { // NOTE: Instead of calling bli_gks_query_cntx(), we call // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). 
- cntx_t* cntx = bli_gks_query_cntx_noinit(); + const cntx_t* cntx = bli_gks_query_cntx_noinit(); // For each precision, enable the default induced method (1m) if both of // the following conditions are met: @@ -176,7 +176,7 @@ ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ) return method; } -char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ) +const char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ) { ind_t method = bli_ind_oper_find_avail( oper, dt ); @@ -185,7 +185,7 @@ char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ) // ----------------------------------------------------------------------------- -char* bli_ind_get_impl_string( ind_t method ) +const char* bli_ind_get_impl_string( ind_t method ) { return bli_ind_impl_str[ method ]; } diff --git a/frame/base/bli_ind.h b/frame/base/bli_ind.h index 85cad648e9..a49d35a05a 100644 --- a/frame/base/bli_ind.h +++ b/frame/base/bli_ind.h @@ -41,22 +41,22 @@ void bli_ind_init( void ); void bli_ind_finalize( void ); -BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); -BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); -BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); +BLIS_EXPORT_BLIS void bli_ind_enable( ind_t method ); +BLIS_EXPORT_BLIS void bli_ind_disable( ind_t method ); +BLIS_EXPORT_BLIS void bli_ind_disable_all( void ); -BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); +BLIS_EXPORT_BLIS void bli_ind_enable_dt( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS void bli_ind_disable_dt( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS void bli_ind_disable_all_dt( num_t dt ); -BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); +BLIS_EXPORT_BLIS void bli_ind_oper_enable_only( opid_t oper, ind_t method, num_t dt ); -BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); 
-BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); -BLIS_EXPORT_BLIS char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); +BLIS_EXPORT_BLIS bool bli_ind_oper_is_impl( opid_t oper, ind_t method ); +BLIS_EXPORT_BLIS ind_t bli_ind_oper_find_avail( opid_t oper, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_ind_oper_get_avail_impl_string( opid_t oper, num_t dt ); -char* bli_ind_get_impl_string( ind_t method ); -num_t bli_ind_map_cdt_to_index( num_t dt ); +const char* bli_ind_get_impl_string( ind_t method ); +num_t bli_ind_map_cdt_to_index( num_t dt ); #endif diff --git a/frame/base/bli_info.c b/frame/base/bli_info.c index bfa5ca9a38..a1f8eaf5a7 100644 --- a/frame/base/bli_info.c +++ b/frame/base/bli_info.c @@ -40,11 +40,11 @@ // This string gets defined via -D on the command line when BLIS is compiled. // This string is (or rather, should be) only used here. -static char* bli_version_str = BLIS_VERSION_STRING; -static char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE ); +static const char* bli_version_str = BLIS_VERSION_STRING; +static const char* bli_int_type_size_str = STRINGIFY_INT( BLIS_INT_TYPE_SIZE ); -char* bli_info_get_version_str( void ) { return bli_version_str; } -char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; } +const char* bli_info_get_version_str( void ) { return bli_version_str; } +const char* bli_info_get_int_type_size_str( void ) { return bli_int_type_size_str; } @@ -164,30 +164,30 @@ gint_t bli_info_get_enable_sandbox( void ) // -- Level-3 kernel definitions -- -char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMM_UKR, method, dt ); } -char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return 
bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_L_UKR, method, dt ); } -char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_GEMMTRSM_U_UKR, method, dt ); } -char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_TRSM_L_UKR, method, dt ); } -char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) +const char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ) { bli_init_once(); return bli_gks_l3_ukr_impl_string( BLIS_TRSM_U_UKR, method, dt ); } // -- BLIS implementation query (level-3) -------------------------------------- -char* bli_info_get_gemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt ); } -char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_hemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt ); } -char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } -char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } -char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } -char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } -char* 
bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } +const char* bli_info_get_gemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMM, dt ); } +const char* bli_info_get_gemmt_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_hemm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_HEMM, dt ); } +const char* bli_info_get_herk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_her2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_symm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_SYMM, dt ); } +const char* bli_info_get_syrk_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_syr2k_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_GEMMT, dt ); } +const char* bli_info_get_trmm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM, dt ); } +const char* bli_info_get_trmm3_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRMM3, dt ); } +const char* bli_info_get_trsm_impl_string( num_t dt ) { return bli_ind_oper_get_avail_impl_string( BLIS_TRSM, dt ); } diff --git a/frame/base/bli_info.h b/frame/base/bli_info.h index 99c7d000db..250504c231 100644 --- a/frame/base/bli_info.h +++ b/frame/base/bli_info.h @@ -36,8 +36,8 @@ // -- General library information ---------------------------------------------- -BLIS_EXPORT_BLIS char* bli_info_get_version_str( void ); -BLIS_EXPORT_BLIS char* bli_info_get_int_type_size_str( void ); +BLIS_EXPORT_BLIS const char* bli_info_get_version_str( void ); +BLIS_EXPORT_BLIS const char* bli_info_get_int_type_size_str( void ); // -- General configuration-related 
-------------------------------------------- @@ -81,24 +81,24 @@ BLIS_EXPORT_BLIS gint_t bli_info_get_enable_sandbox( void ); // -- Level-3 kernel definitions -- -BLIS_EXPORT_BLIS char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemm_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemmtrsm_l_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemmtrsm_u_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trsm_l_ukr_impl_string( ind_t method, num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trsm_u_ukr_impl_string( ind_t method, num_t dt ); // -- BLIS implementation query (level-3) -------------------------------------- -BLIS_EXPORT_BLIS char* bli_info_get_gemm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_gemmt_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_hemm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_herk_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_her2k_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_symm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_syrk_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_syr2k_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trmm_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trmm3_impl_string( num_t dt ); -BLIS_EXPORT_BLIS char* bli_info_get_trsm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_gemm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* 
bli_info_get_gemmt_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_hemm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_herk_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_her2k_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_symm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_syrk_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_syr2k_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trmm_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trmm3_impl_string( num_t dt ); +BLIS_EXPORT_BLIS const char* bli_info_get_trsm_impl_string( num_t dt ); diff --git a/frame/base/bli_mbool.h b/frame/base/bli_mbool.h index 6a989590b2..d004242734 100644 --- a/frame/base/bli_mbool.h +++ b/frame/base/bli_mbool.h @@ -36,7 +36,7 @@ // mbool_t query -BLIS_INLINE bool bli_mbool_get_dt( num_t dt, mbool_t* mb ) +BLIS_INLINE bool bli_mbool_get_dt( num_t dt, const mbool_t* mb ) { return ( bool )( mb->v[ dt ] ); } diff --git a/frame/base/bli_memsys.c b/frame/base/bli_memsys.c index ca3c46f998..7b62ded5c7 100644 --- a/frame/base/bli_memsys.c +++ b/frame/base/bli_memsys.c @@ -44,7 +44,7 @@ void bli_memsys_init( void ) // contexts for induced methods. // NOTE: Instead of calling bli_gks_query_cntx(), we call // bli_gks_query_cntx_noinit() to avoid the call to bli_init_once(). - cntx_t* cntx_p = bli_gks_query_cntx_noinit(); + const cntx_t* cntx_p = bli_gks_query_cntx_noinit(); // Initialize the packing block allocator and its data structures. 
bli_pba_init( cntx_p ); diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index 23fbb4cd10..f7946b90e6 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -264,8 +264,8 @@ void bli_obj_create_1x1_with_attached_buffer void bli_obj_create_conf_to ( - obj_t* s, - obj_t* d + const obj_t* s, + obj_t* d ) { const num_t dt = bli_obj_dt( s ); @@ -552,7 +552,7 @@ static char* dt_names[ BLIS_NUM_FP_TYPES+1 ] = "int" }; -char* bli_dt_string +const char* bli_dt_string ( num_t dt ) @@ -600,8 +600,8 @@ dim_t bli_align_dim_to_size dim_t bli_align_ptr_to_size ( - void* p, - size_t align_size + const void* p, + size_t align_size ) { dim_t dim; @@ -634,8 +634,8 @@ num_t bli_dt_union( num_t dt1, num_t dt2 ) void bli_obj_print ( - char* label, - obj_t* obj + const char* label, + const obj_t* obj ) { bli_init_once(); diff --git a/frame/base/bli_obj.h b/frame/base/bli_obj.h index 4436d2cd8e..d806563fdd 100644 --- a/frame/base/bli_obj.h +++ b/frame/base/bli_obj.h @@ -95,8 +95,8 @@ BLIS_EXPORT_BLIS void bli_obj_create_1x1_with_attached_buffer BLIS_EXPORT_BLIS void bli_obj_create_conf_to ( - obj_t* s, - obj_t* d + const obj_t* s, + obj_t* d ); BLIS_EXPORT_BLIS void bli_obj_free @@ -119,7 +119,7 @@ BLIS_EXPORT_BLIS siz_t bli_dt_size num_t dt ); -BLIS_EXPORT_BLIS char* bli_dt_string +BLIS_EXPORT_BLIS const char* bli_dt_string ( num_t dt ); @@ -139,13 +139,13 @@ BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( - void* p, - size_t align_size + const void* p, + size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print ( - char* label, - obj_t* obj + const char* label, + const obj_t* obj ); diff --git a/frame/base/bli_obj_scalar.c b/frame/base/bli_obj_scalar.c index e28d4fda98..8efe22d7d4 100644 --- a/frame/base/bli_obj_scalar.c +++ b/frame/base/bli_obj_scalar.c @@ -59,10 +59,10 @@ void bli_obj_scalar_init_detached void bli_obj_scalar_init_detached_copy_of ( - num_t dt, - conj_t conj, - obj_t* alpha, - obj_t* beta + num_t dt, + conj_t 
conj, + const obj_t* alpha, + obj_t* beta ) { obj_t alpha_local; @@ -81,8 +81,8 @@ void bli_obj_scalar_init_detached_copy_of void bli_obj_scalar_detach ( - obj_t* a, - obj_t* alpha + const obj_t* a, + obj_t* alpha ) { // Use the scalar datatype of A as the storage datatype of the detached @@ -165,8 +165,8 @@ void bli_obj_scalar_cast_to void bli_obj_scalar_apply_scalar ( - obj_t* alpha, - obj_t* a + const obj_t* alpha, + obj_t* a ) { obj_t alpha_cast; @@ -193,9 +193,9 @@ void bli_obj_scalar_reset obj_t* a ) { - num_t dt = bli_obj_scalar_dt( a ); - void* scalar_a = bli_obj_internal_scalar_buffer( a ); - void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + num_t dt = bli_obj_scalar_dt( a ); + void* scalar_a = bli_obj_internal_scalar_buffer( a ); + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); if ( bli_is_float( dt ) ) *(( float* )scalar_a) = *(( float* )one); else if ( bli_is_double( dt ) ) *(( double* )scalar_a) = *(( double* )one); @@ -236,8 +236,8 @@ bool bli_obj_scalar_has_nonzero_imag bool bli_obj_scalar_equals ( - obj_t* a, - obj_t* beta + const obj_t* a, + const obj_t* beta ) { obj_t scalar_a; diff --git a/frame/base/bli_obj_scalar.h b/frame/base/bli_obj_scalar.h index 86b699659b..753707f043 100644 --- a/frame/base/bli_obj_scalar.h +++ b/frame/base/bli_obj_scalar.h @@ -40,16 +40,16 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( - num_t dt, - conj_t conj, - obj_t* alpha, - obj_t* beta + num_t dt, + conj_t conj, + const obj_t* alpha, + obj_t* beta ); BLIS_EXPORT_BLIS void bli_obj_scalar_detach ( - obj_t* a, - obj_t* alpha + const obj_t* a, + obj_t* alpha ); BLIS_EXPORT_BLIS void bli_obj_scalar_attach @@ -67,8 +67,8 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to BLIS_EXPORT_BLIS void bli_obj_scalar_apply_scalar ( - obj_t* alpha, - obj_t* a + const obj_t* alpha, + obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_reset @@ -83,7 +83,7 @@ BLIS_EXPORT_BLIS bool 
bli_obj_scalar_has_nonzero_imag BLIS_EXPORT_BLIS bool bli_obj_scalar_equals ( - obj_t* a, - obj_t* beta + const obj_t* a, + const obj_t* beta ); diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index 95587e4a71..accab54e43 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -40,12 +40,12 @@ void bli_acquire_mpart ( - dim_t i, - dim_t j, - dim_t bm, - dim_t bn, - obj_t* parent, - obj_t* child + dim_t i, + dim_t j, + dim_t bm, + dim_t bn, + const obj_t* parent, + obj_t* child ) { // Query the dimensions of the parent object. @@ -83,11 +83,11 @@ void bli_acquire_mpart void bli_acquire_mpart_t2b ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mdim( BLIS_FWD, req_part, i, b, obj, sub_obj ); @@ -96,11 +96,11 @@ void bli_acquire_mpart_t2b void bli_acquire_mpart_b2t ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mdim( BLIS_BWD, req_part, i, b, obj, sub_obj ); @@ -109,12 +109,12 @@ void bli_acquire_mpart_b2t void bli_acquire_mpart_mdim ( - dir_t direct, - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -307,24 +307,24 @@ void bli_acquire_mpart_mdim void bli_acquire_mpart_l2r ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { - bli_acquire_mpart_ndim( BLIS_FWD, req_part, i, b, obj, sub_obj ); + bli_acquire_mpart_ndim( BLIS_FWD, req_part, j, b, obj, sub_obj ); } void bli_acquire_mpart_r2l ( - subpart_t req_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { 
bli_acquire_mpart_ndim( BLIS_BWD, req_part, j, b, obj, sub_obj ); @@ -333,12 +333,12 @@ void bli_acquire_mpart_r2l void bli_acquire_mpart_ndim ( - dir_t direct, - subpart_t req_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -530,11 +530,11 @@ void bli_acquire_mpart_ndim void bli_acquire_mpart_tl2br ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mndim( BLIS_FWD, req_part, i, b, obj, sub_obj ); @@ -543,11 +543,11 @@ void bli_acquire_mpart_tl2br void bli_acquire_mpart_br2tl ( - subpart_t req_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mndim( BLIS_BWD, req_part, j, b, obj, sub_obj ); @@ -556,12 +556,12 @@ void bli_acquire_mpart_br2tl void bli_acquire_mpart_mndim ( - dir_t direct, - subpart_t req_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t ij, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -798,11 +798,11 @@ void bli_acquire_mpart_mndim void bli_acquire_vpart_f2b ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) @@ -814,11 +814,11 @@ void bli_acquire_vpart_f2b void bli_acquire_vpart_b2f ( - subpart_t req_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) @@ -833,10 +833,10 @@ void bli_acquire_vpart_b2f void bli_acquire_mij ( - dim_t i, - dim_t j, - obj_t* obj, - obj_t* sub_obj + dim_t i, + dim_t j, + const obj_t* obj, + obj_t* sub_obj ) { obj_t 
tmp_obj; @@ -848,9 +848,9 @@ void bli_acquire_mij void bli_acquire_vi ( - dim_t i, - obj_t* obj, - obj_t* sub_obj + dim_t i, + const obj_t* obj, + obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h index 5e56a9fece..971887e787 100644 --- a/frame/base/bli_part.h +++ b/frame/base/bli_part.h @@ -38,12 +38,12 @@ BLIS_EXPORT_BLIS void bli_acquire_mpart ( - dim_t i, - dim_t j, - dim_t m, - dim_t n, - obj_t* obj, - obj_t* sub_obj + dim_t i, + dim_t j, + dim_t m, + dim_t n, + const obj_t* obj, + obj_t* sub_obj ); #undef GENPROT @@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_acquire_mpart \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) @@ -71,12 +71,12 @@ GENPROT( acquire_mpart_br2tl ) \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - dir_t direct, \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ + dir_t direct, \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); GENPROT( acquire_mpart_mdim ) @@ -91,11 +91,11 @@ GENPROT( acquire_mpart_mndim ) \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - obj_t* obj, \ - obj_t* sub_obj \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) @@ -105,16 +105,16 @@ GENPROT( acquire_vpart_b2f ) BLIS_EXPORT_BLIS void bli_acquire_mij ( - dim_t i, - dim_t j, - obj_t* obj, - obj_t* sub_obj + dim_t i, + dim_t j, + const obj_t* obj, + obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( - dim_t i, - obj_t* obj, - obj_t* sub_obj + dim_t i, + const obj_t* obj, + obj_t* sub_obj ); diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index f8835e5de0..bb62c18a74 100644 --- 
a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -48,10 +48,10 @@ pba_t* bli_pba_query( void ) void bli_pba_init ( - cntx_t* restrict cntx + const cntx_t* cntx ) { - pba_t* restrict pba = bli_pba_query(); + pba_t* pba = bli_pba_query(); const siz_t align_size = BLIS_POOL_ADDR_ALIGN_SIZE_GEN; malloc_ft malloc_fp = BLIS_MALLOC_POOL; @@ -284,8 +284,8 @@ void bli_pba_acquire_v siz_t bli_pba_pool_size ( - pba_t* pba, - packbuf_t buf_type + const pba_t* pba, + packbuf_t buf_type ) { siz_t r_val; @@ -304,7 +304,7 @@ siz_t bli_pba_pool_size // Acquire the pointer to the pool corresponding to the buf_type // provided. pool_index = bli_packbuf_index( buf_type ); - pool = bli_pba_pool( pool_index, pba ); + pool = bli_pba_pool( pool_index, ( pba_t* )pba ); // Compute the pool "size" as the product of the block size // and the number of blocks in the pool. @@ -319,8 +319,8 @@ siz_t bli_pba_pool_size void bli_pba_init_pools ( - cntx_t* cntx, - pba_t* pba + const cntx_t* cntx, + pba_t* pba ) { // Map each of the packbuf_t values to an index starting at zero. 
@@ -402,10 +402,10 @@ void bli_pba_finalize_pools void bli_pba_compute_pool_block_sizes ( - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + const cntx_t* cntx ) { const ind_t im = bli_cntx_method( cntx ); @@ -449,21 +449,21 @@ void bli_pba_compute_pool_block_sizes void bli_pba_compute_pool_block_sizes_dt ( - num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + const cntx_t* cntx ) { siz_t size_dt = bli_dt_size( dt ); - blksz_t* mr; - blksz_t* nr; + const blksz_t* mr; + const blksz_t* nr; - blksz_t* mc; - blksz_t* kc; - blksz_t* nc; + const blksz_t* mc; + const blksz_t* kc; + const blksz_t* nc; dim_t mr_dt; dim_t nr_dt; diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index 6431607ec9..bd56f9fc60 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -73,17 +73,17 @@ BLIS_INLINE pool_t* bli_pba_pool( dim_t pool_index, pba_t* pba ) return &(pba->pools[ pool_index ]); } -BLIS_INLINE siz_t bli_pba_align_size( pba_t* pba ) +BLIS_INLINE siz_t bli_pba_align_size( const pba_t* pba ) { return pba->align_size; } -BLIS_INLINE malloc_ft bli_pba_malloc_fp( pba_t* pba ) +BLIS_INLINE malloc_ft bli_pba_malloc_fp( const pba_t* pba ) { return pba->malloc_fp; } -BLIS_INLINE free_ft bli_pba_free_fp( pba_t* pba ) +BLIS_INLINE free_ft bli_pba_free_fp( const pba_t* pba ) { return pba->free_fp; } @@ -123,7 +123,7 @@ BLIS_EXPORT_BLIS pba_t* bli_pba_query( void ); void bli_pba_init ( - cntx_t* cntx + const cntx_t* cntx ); void bli_pba_finalize ( @@ -156,16 +156,16 @@ BLIS_INLINE void bli_pba_rntm_set_pba siz_t bli_pba_pool_size ( - pba_t* pba, - packbuf_t buf_type + const pba_t* pba, + packbuf_t buf_type ); // ---------------------------------------------------------------------------- void bli_pba_init_pools ( - cntx_t* cntx, - pba_t* pba + const cntx_t* cntx, + pba_t* pba ); void bli_pba_finalize_pools ( @@ -174,18 +174,18 @@ void 
bli_pba_finalize_pools void bli_pba_compute_pool_block_sizes ( - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + const cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( - num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, - cntx_t* cntx + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, + const cntx_t* cntx ); #endif diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 112ab68e80..684b0ef736 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -39,14 +39,14 @@ void bli_pool_init ( - siz_t num_blocks, - siz_t block_ptrs_len, - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - free_ft free_fp, - pool_t* restrict pool + siz_t num_blocks, + siz_t block_ptrs_len, + siz_t block_size, + siz_t align_size, + siz_t offset_size, + malloc_ft malloc_fp, + free_ft free_fp, + pool_t* pool ) { err_t r_val; @@ -67,7 +67,7 @@ void bli_pool_init // Allocate the block_ptrs array. // FGVZ: Do we want to call malloc_fp() for internal data structures as // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. - pblk_t* restrict block_ptrs + pblk_t* block_ptrs = bli_malloc_intl( block_ptrs_len * sizeof( pblk_t ), &r_val ); @@ -115,7 +115,7 @@ void bli_pool_init void bli_pool_finalize ( - pool_t* restrict pool + pool_t* pool ) { // NOTE: This implementation assumes that either: @@ -124,7 +124,7 @@ void bli_pool_finalize // is bli_pool_reinit(). // Query the block_ptrs array. - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Query the total number of blocks currently allocated. 
const siz_t num_blocks = bli_pool_num_blocks( pool ); @@ -196,12 +196,12 @@ void bli_pool_finalize void bli_pool_reinit ( - siz_t num_blocks_new, - siz_t block_ptrs_len_new, - siz_t block_size_new, - siz_t align_size_new, - siz_t offset_size_new, - pool_t* restrict pool + siz_t num_blocks_new, + siz_t block_ptrs_len_new, + siz_t block_size_new, + siz_t align_size_new, + siz_t offset_size_new, + pool_t* pool ) { // Preserve the pointers to malloc() and free() provided when the pool @@ -234,9 +234,9 @@ void bli_pool_reinit void bli_pool_checkout_block ( - siz_t req_size, - pblk_t* restrict block, - pool_t* restrict pool + siz_t req_size, + pblk_t* block, + pool_t* pool ) { // If the requested block size is smaller than what the pool was @@ -282,7 +282,7 @@ void bli_pool_checkout_block // At this point, at least one block is guaranteed to be available. // Query the block_ptrs array. - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -309,8 +309,8 @@ void bli_pool_checkout_block void bli_pool_checkin_block ( - pblk_t* restrict block, - pool_t* restrict pool + pblk_t* block, + pool_t* pool ) { // If the pblk_t being checked in was allocated with a different block @@ -330,7 +330,7 @@ void bli_pool_checkin_block } // Query the block_ptrs array. - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -353,8 +353,8 @@ void bli_pool_checkin_block void bli_pool_grow ( - siz_t num_blocks_add, - pool_t* restrict pool + siz_t num_blocks_add, + pool_t* pool ) { err_t r_val; @@ -394,12 +394,12 @@ void bli_pool_grow #endif // Query the current block_ptrs array. 
- pblk_t* restrict block_ptrs_cur = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs_cur = bli_pool_block_ptrs( pool ); // Allocate a new block_ptrs array. // FGVZ: Do we want to call malloc_fp() for internal data structures as // well as pool blocks? If so, don't forget to s/bli_free_intl/free_fp/g. - pblk_t* restrict block_ptrs_new + pblk_t* block_ptrs_new = bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val ); @@ -433,7 +433,7 @@ void bli_pool_grow // blocks. // Query the current block_ptrs array (which was mabye just resized). - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Query the block size and alignment size of the pool. const siz_t block_size = bli_pool_block_size( pool ); @@ -470,8 +470,8 @@ void bli_pool_grow void bli_pool_shrink ( - siz_t num_blocks_sub, - pool_t* restrict pool + siz_t num_blocks_sub, + pool_t* pool ) { // If the requested decrease is zero, return early. @@ -493,7 +493,7 @@ void bli_pool_shrink num_blocks_sub = bli_min( num_blocks_sub, num_blocks_avail ); // Query the block_ptrs array. - pblk_t* restrict block_ptrs = bli_pool_block_ptrs( pool ); + pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); // Compute the new total number of blocks. const siz_t num_blocks_new = num_blocks - num_blocks_sub; @@ -520,11 +520,11 @@ void bli_pool_shrink void bli_pool_alloc_block ( - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - pblk_t* restrict block + siz_t block_size, + siz_t align_size, + siz_t offset_size, + malloc_ft malloc_fp, + pblk_t* block ) { err_t r_val; @@ -540,7 +540,7 @@ void bli_pool_alloc_block // be recovered when it's time to free the block. Note that we have to // add offset_size to the number of bytes requested since we will skip // that many bytes at the beginning of the allocated memory. 
- void* restrict buf + void* buf = bli_fmalloc_align( malloc_fp, block_size + offset_size, align_size, &r_val ); @@ -579,7 +579,7 @@ void bli_pool_free_block ( siz_t offset_size, free_ft free_fp, - pblk_t* restrict block + pblk_t* block ) { #ifdef BLIS_ENABLE_MEM_TRACING @@ -590,7 +590,7 @@ void bli_pool_free_block // Extract the pblk_t buffer, which is the aligned address returned from // bli_fmalloc_align() when the block was allocated. - void* restrict buf = bli_pblk_buf( block ); + void* buf = bli_pblk_buf( block ); // Undo the pointer advancement by offset_size bytes performed previously // by bli_pool_alloc_block(). @@ -604,7 +604,7 @@ void bli_pool_free_block void bli_pool_print ( - pool_t* restrict pool + const pool_t* pool ) { pblk_t* block_ptrs = bli_pool_block_ptrs( pool ); @@ -633,7 +633,7 @@ void bli_pool_print void bli_pblk_print ( - pblk_t* restrict pblk + const pblk_t* pblk ) { void* buf = bli_pblk_buf( pblk ); diff --git a/frame/base/bli_pool.h b/frame/base/bli_pool.h index b4bb23feca..0b16ae8eea 100644 --- a/frame/base/bli_pool.h +++ b/frame/base/bli_pool.h @@ -70,12 +70,12 @@ typedef struct // Pool block query -BLIS_INLINE void* bli_pblk_buf( pblk_t* pblk ) +BLIS_INLINE void* bli_pblk_buf( const pblk_t* pblk ) { return pblk->buf; } -BLIS_INLINE siz_t bli_pblk_block_size( pblk_t* pblk ) +BLIS_INLINE siz_t bli_pblk_block_size( const pblk_t* pblk ) { return pblk->block_size; } @@ -115,52 +115,52 @@ BLIS_INLINE void bli_pblk_clear( pblk_t* pblk ) // Pool entry query -BLIS_INLINE void* bli_pool_block_ptrs( pool_t* pool ) +BLIS_INLINE void* bli_pool_block_ptrs( const pool_t* pool ) { return pool->block_ptrs; } -BLIS_INLINE siz_t bli_pool_block_ptrs_len( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_block_ptrs_len( const pool_t* pool ) { return pool->block_ptrs_len; } -BLIS_INLINE siz_t bli_pool_num_blocks( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_num_blocks( const pool_t* pool ) { return pool->num_blocks; } -BLIS_INLINE siz_t bli_pool_block_size( 
pool_t* pool ) +BLIS_INLINE siz_t bli_pool_block_size( const pool_t* pool ) { return pool->block_size; } -BLIS_INLINE siz_t bli_pool_align_size( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_align_size( const pool_t* pool ) { return pool->align_size; } -BLIS_INLINE siz_t bli_pool_offset_size( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_offset_size( const pool_t* pool ) { return pool->offset_size; } -BLIS_INLINE malloc_ft bli_pool_malloc_fp( pool_t* pool ) +BLIS_INLINE malloc_ft bli_pool_malloc_fp( const pool_t* pool ) { return pool->malloc_fp; } -BLIS_INLINE free_ft bli_pool_free_fp( pool_t* pool ) +BLIS_INLINE free_ft bli_pool_free_fp( const pool_t* pool ) { return pool->free_fp; } -BLIS_INLINE siz_t bli_pool_top_index( pool_t* pool ) +BLIS_INLINE siz_t bli_pool_top_index( const pool_t* pool ) { return pool->top_index; } -BLIS_INLINE bool bli_pool_is_exhausted( pool_t* pool ) +BLIS_INLINE bool bli_pool_is_exhausted( const pool_t* pool ) { return ( bool ) ( bli_pool_top_index( pool ) == bli_pool_num_blocks( pool ) ); @@ -217,74 +217,74 @@ BLIS_INLINE void bli_pool_set_top_index( siz_t top_index, pool_t* pool ) \ void bli_pool_init ( - siz_t num_blocks, - siz_t block_ptrs_len, - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - free_ft free_fp, - pool_t* restrict pool + siz_t num_blocks, + siz_t block_ptrs_len, + siz_t block_size, + siz_t align_size, + siz_t offset_size, + malloc_ft malloc_fp, + free_ft free_fp, + pool_t* pool ); void bli_pool_finalize ( - pool_t* restrict pool + pool_t* pool ); void bli_pool_reinit ( - siz_t num_blocks_new, - siz_t block_ptrs_len_new, - siz_t block_size_new, - siz_t align_size_new, - siz_t offset_size_new, - pool_t* restrict pool + siz_t num_blocks_new, + siz_t block_ptrs_len_new, + siz_t block_size_new, + siz_t align_size_new, + siz_t offset_size_new, + pool_t* pool ); void bli_pool_checkout_block ( - siz_t req_size, - pblk_t* restrict block, - pool_t* restrict pool + siz_t req_size, + pblk_t* block, 
+ pool_t* pool ); void bli_pool_checkin_block ( - pblk_t* restrict block, - pool_t* restrict pool + pblk_t* block, + pool_t* pool ); void bli_pool_grow ( - siz_t num_blocks_add, - pool_t* restrict pool + siz_t num_blocks_add, + pool_t* pool ); void bli_pool_shrink ( - siz_t num_blocks_sub, - pool_t* restrict pool + siz_t num_blocks_sub, + pool_t* pool ); void bli_pool_alloc_block ( - siz_t block_size, - siz_t align_size, - siz_t offset_size, - malloc_ft malloc_fp, - pblk_t* restrict block + siz_t block_size, + siz_t align_size, + siz_t offset_size, + malloc_ft malloc_fp, + pblk_t* block ); void bli_pool_free_block ( - siz_t offset_size, - free_ft free_fp, - pblk_t* restrict block + siz_t offset_size, + free_ft free_fp, + pblk_t* block ); void bli_pool_print ( - pool_t* restrict pool + const pool_t* pool ); void bli_pblk_print ( - pblk_t* restrict pblk + const pblk_t* pblk ); #endif diff --git a/frame/base/bli_query.c b/frame/base/bli_query.c index c62a30cccd..140fc2f978 100644 --- a/frame/base/bli_query.c +++ b/frame/base/bli_query.c @@ -34,7 +34,7 @@ #include "blis.h" -bool bli_obj_equals( obj_t* a, obj_t* b ) +bool bli_obj_equals( const obj_t* a, const obj_t* b ) { #if 0 bool r_val = FALSE; @@ -95,7 +95,7 @@ bool bli_obj_equals( obj_t* a, obj_t* b ) #endif } -bool bli_obj_imag_equals( obj_t* a, obj_t* b ) +bool bli_obj_imag_equals( const obj_t* a, const obj_t* b ) { #if 0 bool r_val = FALSE; @@ -165,7 +165,7 @@ bool bli_obj_imag_equals( obj_t* a, obj_t* b ) return r_val; } -bool bli_obj_imag_is_zero( obj_t* a ) +bool bli_obj_imag_is_zero( const obj_t* a ) { bool r_val = TRUE; diff --git a/frame/base/bli_query.h b/frame/base/bli_query.h index 65246050b5..d2decf928f 100644 --- a/frame/base/bli_query.h +++ b/frame/base/bli_query.h @@ -32,8 +32,8 @@ */ -BLIS_EXPORT_BLIS bool bli_obj_equals( obj_t* a, obj_t* b ); +BLIS_EXPORT_BLIS bool bli_obj_equals( const obj_t* a, const obj_t* b ); -BLIS_EXPORT_BLIS bool bli_obj_imag_equals( obj_t* a, obj_t* b ); +BLIS_EXPORT_BLIS 
bool bli_obj_imag_equals( const obj_t* a, const obj_t* b ); -BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( obj_t* a ); +BLIS_EXPORT_BLIS bool bli_obj_imag_is_zero( const obj_t* a ); diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index a6ded35b32..732dbb00d0 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -410,7 +410,7 @@ void bli_rntm_set_ways_from_rntm_sup void bli_rntm_print ( - rntm_t* rntm + const rntm_t* rntm ) { dim_t af = bli_rntm_auto_factor( rntm ); @@ -433,8 +433,8 @@ void bli_rntm_print dim_t bli_rntm_calc_num_threads_in ( - bszid_t* restrict bszid_cur, - rntm_t* restrict rntm + const bszid_t* restrict bszid_cur, + const rntm_t* restrict rntm ) { /* // bp algorithm: diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index 249a698051..e10c541542 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -61,56 +61,56 @@ typedef struct rntm_s // -- rntm_t query (public API) ------------------------------------------------ // -BLIS_INLINE bool bli_rntm_auto_factor( rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_auto_factor( const rntm_t* rntm ) { return rntm->auto_factor; } -BLIS_INLINE dim_t bli_rntm_num_threads( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_num_threads( const rntm_t* rntm ) { return rntm->num_threads; } -BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, const rntm_t* rntm ) { return rntm->thrloop[ bszid ]; } -BLIS_INLINE dim_t bli_rntm_jc_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_jc_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NC, rntm ); } -BLIS_INLINE dim_t bli_rntm_pc_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_pc_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KC, rntm ); } -BLIS_INLINE dim_t bli_rntm_ic_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_ic_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MC, rntm ); } -BLIS_INLINE dim_t bli_rntm_jr_ways( rntm_t* rntm ) +BLIS_INLINE 
dim_t bli_rntm_jr_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_NR, rntm ); } -BLIS_INLINE dim_t bli_rntm_ir_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_ir_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_MR, rntm ); } -BLIS_INLINE dim_t bli_rntm_pr_ways( rntm_t* rntm ) +BLIS_INLINE dim_t bli_rntm_pr_ways( const rntm_t* rntm ) { return bli_rntm_ways_for( BLIS_KR, rntm ); } -BLIS_INLINE bool bli_rntm_pack_a( rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_pack_a( const rntm_t* rntm ) { return ( bool )( rntm->pack_a ); } -BLIS_INLINE bool bli_rntm_pack_b( rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_pack_b( const rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } -BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_l3_sup( const rntm_t* rntm ) { return rntm->l3_sup; } @@ -119,12 +119,12 @@ BLIS_INLINE bool bli_rntm_l3_sup( rntm_t* rntm ) // -- rntm_t query (internal use only) ----------------------------------------- // -BLIS_INLINE pool_t* bli_rntm_sba_pool( rntm_t* rntm ) +BLIS_INLINE pool_t* bli_rntm_sba_pool( const rntm_t* rntm ) { return rntm->sba_pool; } -BLIS_INLINE pba_t* bli_rntm_pba( rntm_t* rntm ) +BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm ) { return rntm->pba; } @@ -334,7 +334,7 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) BLIS_INLINE dim_t bli_rntm_calc_num_threads ( - rntm_t* restrict rntm + const rntm_t* restrict rntm ) { dim_t n_threads; @@ -382,13 +382,13 @@ void bli_rntm_set_ways_from_rntm_sup void bli_rntm_print ( - rntm_t* rntm + const rntm_t* rntm ); dim_t bli_rntm_calc_num_threads_in ( - bszid_t* restrict bszid_cur, - rntm_t* restrict rntm + const bszid_t* restrict bszid_cur, + const rntm_t* restrict rntm ); #endif diff --git a/frame/base/bli_setgetijm.c b/frame/base/bli_setgetijm.c index 78ff58a29c..86f1c8845c 100644 --- a/frame/base/bli_setgetijm.c +++ b/frame/base/bli_setgetijm.c @@ -36,11 +36,11 @@ typedef void (*setijm_fp) ( - double ar, - double ai, - dim_t i, - dim_t j, - void* 
restrict b, inc_t rs, inc_t cs + double ar, + double ai, + dim_t i, + dim_t j, + void* b, inc_t rs, inc_t cs ); static setijm_fp GENARRAY(ftypes_setijm,setijm); @@ -90,16 +90,16 @@ err_t bli_setijm \ void PASTEMAC(ch,opname) \ ( \ - double ar, \ - double ai, \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs \ + double ar, \ + double ai, \ + dim_t i, \ + dim_t j, \ + void* b, inc_t rs, inc_t cs \ ) \ { \ - ctype* restrict b_cast = ( ctype* )b; \ + ctype* b_cast = ( ctype* )b; \ \ - ctype* restrict b_ij = b_cast + (i )*rs + (j )*cs; \ + ctype* b_ij = b_cast + (i )*rs + (j )*cs; \ \ PASTEMAC2(z,ch,sets)( ar, ai, *b_ij ); \ } @@ -110,21 +110,21 @@ INSERT_GENTFUNC_BASIC0( setijm ) typedef void (*getijm_fp) ( - dim_t i, - dim_t j, - void* restrict b, inc_t rs, inc_t cs, - double* ar, - double* ai + dim_t i, + dim_t j, + const void* b, inc_t rs, inc_t cs, + double* ar, + double* ai ); static getijm_fp GENARRAY(ftypes_getijm,getijm); err_t bli_getijm ( - dim_t i, - dim_t j, - obj_t* b, - double* ar, - double* ai + dim_t i, + dim_t j, + const obj_t* b, + double* ar, + double* ai ) { dim_t m = bli_obj_length( b ); @@ -164,16 +164,16 @@ err_t bli_getijm \ void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs, \ - double* ar, \ - double* ai \ + dim_t i, \ + dim_t j, \ + const void* b, inc_t rs, inc_t cs, \ + double* ar, \ + double* ai \ ) \ { \ - ctype* restrict b_cast = ( ctype* )b; \ + const ctype* b_cast = ( const ctype* )b; \ \ - ctype* restrict b_ij = b_cast + (i )*rs + (j )*cs; \ + const ctype* b_ij = b_cast + (i )*rs + (j )*cs; \ \ PASTEMAC2(ch,z,gets)( *b_ij, *ar, *ai ); \ } diff --git a/frame/base/bli_setgetijm.h b/frame/base/bli_setgetijm.h index 55ce0ee119..76b03a64b3 100644 --- a/frame/base/bli_setgetijm.h +++ b/frame/base/bli_setgetijm.h @@ -34,11 +34,11 @@ BLIS_EXPORT_BLIS err_t bli_setijm ( - double ar, - double ai, - dim_t i, - dim_t j, - obj_t* b + double ar, + double ai, + dim_t i, + dim_t j, + obj_t* 
b ); #undef GENTPROT @@ -46,11 +46,11 @@ BLIS_EXPORT_BLIS err_t bli_setijm \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - double ar, \ - double ai, \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs \ + double ar, \ + double ai, \ + dim_t i, \ + dim_t j, \ + void* b, inc_t rs, inc_t cs \ ); INSERT_GENTPROT_BASIC0( setijm ) @@ -59,11 +59,11 @@ INSERT_GENTPROT_BASIC0( setijm ) BLIS_EXPORT_BLIS err_t bli_getijm ( - dim_t i, - dim_t j, - obj_t* b, - double* ar, - double* ai + dim_t i, + dim_t j, + const obj_t* b, + double* ar, + double* ai ); #undef GENTPROT @@ -71,11 +71,11 @@ BLIS_EXPORT_BLIS err_t bli_getijm \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - dim_t j, \ - void* restrict b, inc_t rs, inc_t cs, \ - double* ar, \ - double* ai \ + dim_t i, \ + dim_t j, \ + const void* b, inc_t rs, inc_t cs, \ + double* ar, \ + double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) diff --git a/frame/base/bli_setgetijv.c b/frame/base/bli_setgetijv.c index 610f6f271c..3728daed75 100644 --- a/frame/base/bli_setgetijv.c +++ b/frame/base/bli_setgetijv.c @@ -36,19 +36,19 @@ typedef void (*setijv_fp) ( - double ar, - double ai, - dim_t i, - void* restrict x, inc_t incx + double ar, + double ai, + dim_t i, + void* x, inc_t incx ); static setijv_fp GENARRAY(ftypes_setijv,setijv); err_t bli_setijv ( - double ar, - double ai, - dim_t i, - obj_t* x + double ar, + double ai, + dim_t i, + obj_t* x ) { dim_t n = bli_obj_vector_dim( x ); @@ -84,10 +84,10 @@ err_t bli_setijv \ void PASTEMAC(ch,opname) \ ( \ - double ar, \ - double ai, \ - dim_t i, \ - void* restrict x, inc_t incx \ + double ar, \ + double ai, \ + dim_t i, \ + void* x, inc_t incx \ ) \ { \ ctype* restrict x_cast = ( ctype* )x; \ @@ -103,19 +103,19 @@ INSERT_GENTFUNC_BASIC0( setijv ) typedef void (*getijv_fp) ( - dim_t i, - void* restrict x, inc_t incx, - double* ar, - double* ai + dim_t i, + const void* x, inc_t incx, + double* ar, + double* ai ); static getijv_fp 
GENARRAY(ftypes_getijv,getijv); err_t bli_getijv ( - dim_t i, - obj_t* x, - double* ar, - double* ai + dim_t i, + const obj_t* x, + double* ar, + double* ai ) { dim_t n = bli_obj_vector_dim( x ); @@ -151,15 +151,15 @@ err_t bli_getijv \ void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - void* restrict x, inc_t incx, \ - double* ar, \ - double* ai \ + dim_t i, \ + const void* x, inc_t incx, \ + double* ar, \ + double* ai \ ) \ { \ - ctype* restrict x_cast = ( ctype* )x; \ + const ctype* restrict x_cast = ( const ctype* )x; \ \ - ctype* restrict x_i = x_cast + (i )*incx; \ + const ctype* restrict x_i = x_cast + (i )*incx; \ \ PASTEMAC2(ch,z,gets)( *x_i, *ar, *ai ); \ } diff --git a/frame/base/bli_setgetijv.h b/frame/base/bli_setgetijv.h index 703fe41aae..3b61179759 100644 --- a/frame/base/bli_setgetijv.h +++ b/frame/base/bli_setgetijv.h @@ -45,10 +45,10 @@ BLIS_EXPORT_BLIS err_t bli_setijv \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - double ar, \ - double ai, \ - dim_t i, \ - void* restrict x, inc_t incx \ + double ar, \ + double ai, \ + dim_t i, \ + void* x, inc_t incx \ ); INSERT_GENTPROT_BASIC0( setijv ) @@ -57,10 +57,10 @@ INSERT_GENTPROT_BASIC0( setijv ) BLIS_EXPORT_BLIS err_t bli_getijv ( - dim_t i, - obj_t* x, - double* ar, - double* ai + dim_t i, + const obj_t* x, + double* ar, + double* ai ); #undef GENTPROT @@ -68,10 +68,10 @@ BLIS_EXPORT_BLIS err_t bli_getijv \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - void* restrict b, inc_t incx, \ - double* ar, \ - double* ai \ + dim_t i, \ + const void* b, inc_t incx, \ + double* ar, \ + double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) diff --git a/frame/base/bli_setri.c b/frame/base/bli_setri.c index 7220571c03..e7e69c3391 100644 --- a/frame/base/bli_setri.c +++ b/frame/base/bli_setri.c @@ -38,8 +38,8 @@ void bli_setrm ( - obj_t* alpha, - obj_t* b + const obj_t* alpha, + obj_t* b ) { obj_t alpha_real; @@ -67,8 +67,8 @@ void bli_setrm void bli_setrv ( - obj_t* alpha, - obj_t* x + const obj_t* 
alpha, + obj_t* x ) { obj_t alpha_real; @@ -98,8 +98,8 @@ void bli_setrv void bli_setim ( - obj_t* alpha, - obj_t* b + const obj_t* alpha, + obj_t* b ) { obj_t alpha_real; @@ -130,8 +130,8 @@ void bli_setim void bli_setiv ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + obj_t* x ) { obj_t alpha_real; diff --git a/frame/base/bli_setri.h b/frame/base/bli_setri.h index dd6ce9f3f1..0beac1ec5b 100644 --- a/frame/base/bli_setri.h +++ b/frame/base/bli_setri.h @@ -36,27 +36,27 @@ BLIS_EXPORT_BLIS void bli_setrm ( - obj_t* alpha, - obj_t* b + const obj_t* alpha, + obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + obj_t* x ); // -- seti --------------------------------------------------------------------- BLIS_EXPORT_BLIS void bli_setim ( - obj_t* alpha, - obj_t* b + const obj_t* alpha, + obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( - obj_t* alpha, - obj_t* x + const obj_t* alpha, + obj_t* x ); diff --git a/frame/base/cast/bli_castm.c b/frame/base/cast/bli_castm.c index 64db75d240..e3ee3e097d 100644 --- a/frame/base/cast/bli_castm.c +++ b/frame/base/cast/bli_castm.c @@ -41,11 +41,11 @@ typedef void (*FUNCPTR_T) ( - trans_t transa, - dim_t m, - dim_t n, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b + trans_t transa, + dim_t m, + dim_t n, + const void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict b, inc_t rs_b, inc_t cs_b ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castm); @@ -56,8 +56,8 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castm); void bli_castm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { num_t dt_a = bli_obj_dt( a ); @@ -68,7 +68,7 @@ void bli_castm dim_t m = bli_obj_length( b ); dim_t n = bli_obj_width( b ); - void* buf_a = bli_obj_buffer_at_off( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a = bli_obj_row_stride( a ); inc_t cs_a = bli_obj_col_stride( a ); @@ -117,14 +117,14 @@ void bli_castm \ void PASTEMAC2(cha,chb,opname) \ ( \ - 
trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b \ ) \ { \ - ctype_a* restrict a_cast = a; \ + const ctype_a* restrict a_cast = a; \ ctype_b* restrict b_cast = b; \ conj_t conja; \ dim_t n_iter; \ @@ -150,7 +150,7 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ @@ -163,7 +163,7 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ @@ -182,7 +182,7 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ @@ -195,7 +195,7 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ @@ -221,8 +221,8 @@ INSERT_GENTFUNC2_MIXDP0( castm ) void bli_castm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { err_t e_val; diff --git a/frame/base/cast/bli_castm.h b/frame/base/cast/bli_castm.h index e9e1dee212..2cd784670f 100644 --- a/frame/base/cast/bli_castm.h +++ b/frame/base/cast/bli_castm.h @@ -38,8 +38,8 @@ 
BLIS_EXPORT_BLIS void bli_castm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); // @@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_castm \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) @@ -67,7 +67,7 @@ INSERT_GENTPROT2_MIXDP0( castm ) void bli_castm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c index a50bdfc159..238405a6ea 100644 --- a/frame/base/cast/bli_castnzm.c +++ b/frame/base/cast/bli_castnzm.c @@ -41,11 +41,11 @@ typedef void (*FUNCPTR_T) ( - trans_t transa, - dim_t m, - dim_t n, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b + trans_t transa, + dim_t m, + dim_t n, + const void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict b, inc_t rs_b, inc_t cs_b ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm); @@ -56,8 +56,8 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm); void bli_castnzm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { num_t dt_a = bli_obj_dt( a ); @@ -68,7 +68,7 @@ void bli_castnzm dim_t m = bli_obj_length( b ); dim_t n = bli_obj_width( b ); - void* buf_a = bli_obj_buffer_at_off( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a = bli_obj_row_stride( a ); inc_t cs_a = bli_obj_col_stride( a ); @@ -117,14 +117,14 @@ void bli_castnzm \ void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict b, inc_t rs_b, inc_t cs_b \ ) \ { \ - ctype_a* restrict a_cast 
= a; \ + const ctype_a* restrict a_cast = a; \ ctype_b* restrict b_cast = b; \ conj_t conja; \ dim_t n_iter; \ @@ -150,7 +150,7 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ @@ -163,7 +163,7 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ @@ -182,7 +182,7 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ @@ -195,7 +195,7 @@ void PASTEMAC2(cha,chb,opname) \ { \ for ( j = 0; j < n_iter; ++j ) \ { \ - ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ + const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ @@ -221,8 +221,8 @@ INSERT_GENTFUNC2_MIXDP0( castnzm ) void bli_castnzm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { err_t e_val; diff --git a/frame/base/cast/bli_castnzm.h b/frame/base/cast/bli_castnzm.h index 42cfef8c0d..9c351d3ea6 100644 --- a/frame/base/cast/bli_castnzm.h +++ b/frame/base/cast/bli_castnzm.h @@ -38,8 +38,8 @@ BLIS_EXPORT_BLIS void bli_castnzm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); // @@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_castnzm \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, 
inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) @@ -67,7 +67,7 @@ INSERT_GENTPROT2_MIXDP0( castnzm ) void bli_castnzm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); diff --git a/frame/base/cast/bli_castv.c b/frame/base/cast/bli_castv.c index 213c960d89..74d1f8757b 100644 --- a/frame/base/cast/bli_castv.c +++ b/frame/base/cast/bli_castv.c @@ -41,10 +41,10 @@ typedef void (*FUNCPTR_T) ( - conj_t conjx, - dim_t n, - void* restrict x, inc_t inc_x, - void* restrict y, inc_t inc_y + conj_t conjx, + dim_t n, + const void* restrict x, inc_t inc_x, + void* restrict y, inc_t inc_y ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castv); @@ -55,8 +55,8 @@ static FUNCPTR_T GENARRAY2_ALL(ftypes,castv); void bli_castv ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { num_t dt_x = bli_obj_dt( x ); @@ -66,7 +66,7 @@ void bli_castv dim_t n = bli_obj_vector_dim( x ); - void* buf_x = bli_obj_buffer_at_off( x ); + const void* buf_x = bli_obj_buffer_at_off( x ); inc_t inc_x = bli_obj_vector_inc( x ); void* buf_y = bli_obj_buffer_at_off( y ); @@ -112,13 +112,13 @@ void bli_castv \ void PASTEMAC2(chx,chy,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - void* restrict x, inc_t incx, \ - void* restrict y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const void* restrict x, inc_t incx, \ + void* restrict y, inc_t incy \ ) \ { \ - ctype_x* restrict x1 = x; \ + const ctype_x* restrict x1 = x; \ ctype_y* restrict y1 = y; \ dim_t i; \ \ @@ -175,8 +175,8 @@ INSERT_GENTFUNC2_MIXDP0( castv ) void bli_castv_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; diff --git a/frame/base/cast/bli_castv.h b/frame/base/cast/bli_castv.h index 9a82615143..542795ca5a 100644 --- a/frame/base/cast/bli_castv.h +++ b/frame/base/cast/bli_castv.h @@ -38,8 +38,8 @@ BLIS_EXPORT_BLIS void bli_castv ( - obj_t* x, - obj_t* y 
+ const obj_t* x, + const obj_t* y ); // @@ -51,10 +51,10 @@ BLIS_EXPORT_BLIS void bli_castv \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - void* x, inc_t incx, \ - void* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const void* x, inc_t incx, \ + void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) @@ -66,7 +66,7 @@ INSERT_GENTPROT2_MIXDP0( castv ) void bli_castv_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); diff --git a/frame/base/check/bli_obj_check.c b/frame/base/check/bli_obj_check.c index a971fa19a7..1739afc428 100644 --- a/frame/base/check/bli_obj_check.c +++ b/frame/base/check/bli_obj_check.c @@ -39,7 +39,7 @@ void bli_obj_create_check( num_t dt, dim_t n, inc_t rs, inc_t cs, - obj_t* obj ) + const obj_t* obj ) { err_t e_val; @@ -56,7 +56,7 @@ void bli_obj_create_check( num_t dt, void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, - obj_t* obj ) + const obj_t* obj ) { err_t e_val; @@ -70,7 +70,7 @@ void bli_obj_create_without_buffer_check( num_t dt, void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, - obj_t* obj ) + const obj_t* obj ) { err_t e_val; @@ -83,11 +83,11 @@ void bli_obj_alloc_buffer_check( inc_t rs, bli_check_error_code( e_val ); } -void bli_obj_attach_buffer_check( void* p, +void bli_obj_attach_buffer_check( const void* p, inc_t rs, inc_t cs, inc_t is, - obj_t* obj ) + const obj_t* obj ) { err_t e_val; @@ -110,7 +110,7 @@ void bli_obj_attach_buffer_check( void* p, } void bli_obj_create_scalar_check( num_t dt, - obj_t* obj ) + const obj_t* obj ) { err_t e_val; @@ -121,7 +121,7 @@ void bli_obj_create_scalar_check( num_t dt, bli_check_error_code( e_val ); } -void bli_obj_free_check( obj_t* obj ) +void bli_obj_free_check( const obj_t* obj ) { //err_t e_val; @@ -131,7 +131,7 @@ void bli_obj_free_check( obj_t* obj ) //bli_check_error_code( e_val ); } -void bli_obj_create_const_check( double value, obj_t* obj ) +void bli_obj_create_const_check( double 
value, const obj_t* obj ) { err_t e_val; @@ -185,7 +185,7 @@ void bli_dt_union_check( num_t dt1, num_t dt2 ) bli_check_error_code( e_val ); } -void bli_obj_print_check( char* label, obj_t* obj ) +void bli_obj_print_check( const char* label, const obj_t* obj ) { err_t e_val; diff --git a/frame/base/check/bli_obj_check.h b/frame/base/check/bli_obj_check.h index 2018428443..232fb02097 100644 --- a/frame/base/check/bli_obj_check.h +++ b/frame/base/check/bli_obj_check.h @@ -37,32 +37,32 @@ void bli_obj_create_check( num_t dt, dim_t n, inc_t rs, inc_t cs, - obj_t* obj ); + const obj_t* obj ); void bli_obj_create_without_buffer_check( num_t dt, dim_t m, dim_t n, - obj_t* obj ); + const obj_t* obj ); void bli_obj_alloc_buffer_check( inc_t rs, inc_t cs, inc_t is, - obj_t* obj ); + const obj_t* obj ); -void bli_obj_attach_buffer_check( void* p, +void bli_obj_attach_buffer_check( const void* p, inc_t rs, inc_t cs, inc_t is, - obj_t* obj ); + const obj_t* obj ); void bli_obj_create_scalar_check( num_t dt, - obj_t* obj ); + const obj_t* obj ); -void bli_obj_free_check( obj_t* obj ); +void bli_obj_free_check( const obj_t* obj ); -void bli_obj_create_const_check( double value, obj_t* obj ); +void bli_obj_create_const_check( double value, const obj_t* obj ); -void bli_obj_create_const_copy_of_check( obj_t* a, obj_t* b ); +void bli_obj_create_const_copy_of_check( const obj_t* a, const obj_t* b ); void bli_dt_size_check( num_t dt ); @@ -70,5 +70,5 @@ void bli_dt_string_check( num_t dt ); void bli_dt_union_check( num_t dt1, num_t dt2 ); -void bli_obj_print_check( char* label, obj_t* obj ); +void bli_obj_print_check( const char* label, const obj_t* obj ); diff --git a/frame/base/check/bli_part_check.c b/frame/base/check/bli_part_check.c index 6d9aa37b97..c8abb4b6e7 100644 --- a/frame/base/check/bli_part_check.c +++ b/frame/base/check/bli_part_check.c @@ -34,11 +34,11 @@ #include "blis.h" -void bli_acquire_mpart_t2b_check( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - 
obj_t* sub_obj ) +void bli_acquire_mpart_t2b_check( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; @@ -52,11 +52,11 @@ void bli_acquire_mpart_t2b_check( subpart_t requested_part, bli_check_error_code( e_val ); } -void bli_acquire_mpart_l2r_check( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_l2r_check( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; @@ -70,11 +70,11 @@ void bli_acquire_mpart_l2r_check( subpart_t requested_part, bli_check_error_code( e_val ); } -void bli_acquire_mpart_tl2br_check( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ) +void bli_acquire_mpart_tl2br_check( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; diff --git a/frame/base/check/bli_part_check.h b/frame/base/check/bli_part_check.h index 2905af0e4f..4576e09b5d 100644 --- a/frame/base/check/bli_part_check.h +++ b/frame/base/check/bli_part_check.h @@ -32,21 +32,21 @@ */ -void bli_acquire_mpart_t2b_check( subpart_t requested_part, - dim_t i, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_acquire_mpart_l2r_check( subpart_t requested_part, - dim_t j, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); - -void bli_acquire_mpart_tl2br_check( subpart_t requested_part, - dim_t ij, - dim_t b, - obj_t* obj, - obj_t* sub_obj ); +void bli_acquire_mpart_t2b_check( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); + +void bli_acquire_mpart_l2r_check( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); + +void bli_acquire_mpart_tl2br_check( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); diff --git a/frame/base/proj/bli_projm.c b/frame/base/proj/bli_projm.c index 949bc2cc94..c798970838 100644 --- 
a/frame/base/proj/bli_projm.c +++ b/frame/base/proj/bli_projm.c @@ -36,8 +36,8 @@ void bli_projm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { // Check parameters. @@ -88,8 +88,8 @@ void bli_projm void bli_projm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ) { err_t e_val; diff --git a/frame/base/proj/bli_projm.h b/frame/base/proj/bli_projm.h index e95f7f2f53..924924f9bc 100644 --- a/frame/base/proj/bli_projm.h +++ b/frame/base/proj/bli_projm.h @@ -34,13 +34,13 @@ BLIS_EXPORT_BLIS void bli_projm ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); void bli_projm_check ( - obj_t* a, - obj_t* b + const obj_t* a, + const obj_t* b ); diff --git a/frame/base/proj/bli_projv.c b/frame/base/proj/bli_projv.c index 9a6587e5b2..588ac39c36 100644 --- a/frame/base/proj/bli_projv.c +++ b/frame/base/proj/bli_projv.c @@ -36,8 +36,8 @@ void bli_projv ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { // Check parameters. @@ -88,8 +88,8 @@ void bli_projv void bli_projv_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ) { err_t e_val; diff --git a/frame/base/proj/bli_projv.h b/frame/base/proj/bli_projv.h index b738b2f973..abdf35522d 100644 --- a/frame/base/proj/bli_projv.h +++ b/frame/base/proj/bli_projv.h @@ -34,13 +34,13 @@ BLIS_EXPORT_BLIS void bli_projv ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); void bli_projv_check ( - obj_t* x, - obj_t* y + const obj_t* x, + const obj_t* y ); diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 9773e5e69d..4d8fbee1dc 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -35,16 +35,16 @@ #ifndef BLIS_EXTERN_DEFS_H #define BLIS_EXTERN_DEFS_H -BLIS_EXPORT_BLIS extern obj_t BLIS_TWO; -BLIS_EXPORT_BLIS extern obj_t BLIS_ONE; +BLIS_EXPORT_BLIS extern const obj_t BLIS_TWO; +BLIS_EXPORT_BLIS extern const obj_t BLIS_ONE; //BLIS_EXPORT_BLIS extern obj_t BLIS_ONE_HALF; -BLIS_EXPORT_BLIS extern 
obj_t BLIS_ZERO; +BLIS_EXPORT_BLIS extern const obj_t BLIS_ZERO; //BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE_HALF; -BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_ONE; -BLIS_EXPORT_BLIS extern obj_t BLIS_MINUS_TWO; +BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE; +BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO; -BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; +BLIS_EXPORT_BLIS extern const thrcomm_t BLIS_SINGLE_COMM; +BLIS_EXPORT_BLIS extern const thrinfo_t BLIS_PACKM_SINGLE_THREADED; +BLIS_EXPORT_BLIS extern const thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index fe174202cf..c076d41fb2 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -42,363 +42,363 @@ // Info query -BLIS_INLINE num_t bli_obj_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt( const obj_t* obj ) { return ( num_t ) ( obj->info & BLIS_DATATYPE_BITS ); } -BLIS_INLINE bool bli_obj_is_float( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_float( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_FLOAT_TYPE ); } -BLIS_INLINE bool bli_obj_is_double( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_double( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DOUBLE_TYPE ); } -BLIS_INLINE bool bli_obj_is_scomplex( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_scomplex( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_SCOMPLEX_TYPE ); } -BLIS_INLINE bool bli_obj_is_dcomplex( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_dcomplex( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_DCOMPLEX_TYPE ); } -BLIS_INLINE bool bli_obj_is_int( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_int( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_INT_TYPE ); } -BLIS_INLINE bool 
bli_obj_is_const( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_const( const obj_t* obj ) { return ( bool ) ( bli_obj_dt( obj ) == BLIS_BITVAL_CONST_TYPE ); } -BLIS_INLINE dom_t bli_obj_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_domain( const obj_t* obj ) { return ( dom_t ) ( obj->info & BLIS_DOMAIN_BIT ); } -BLIS_INLINE prec_t bli_obj_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_prec( const obj_t* obj ) { return ( prec_t ) ( obj->info & BLIS_PRECISION_BIT ); } -BLIS_INLINE bool bli_obj_is_single_prec( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_single_prec( const obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_SINGLE_PREC ); } -BLIS_INLINE bool bli_obj_is_double_prec( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_double_prec( const obj_t* obj ) { return ( bool ) ( bli_obj_prec( obj ) == BLIS_BITVAL_DOUBLE_PREC ); } -BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt_proj_to_single_prec( const obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_SINGLE_PREC ); } -BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt_proj_to_double_prec( const obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) | BLIS_BITVAL_DOUBLE_PREC ); } -BLIS_INLINE bool bli_obj_is_real( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_real( const obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_REAL && !bli_obj_is_const( obj ) ); } -BLIS_INLINE bool bli_obj_is_complex( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_complex( const obj_t* obj ) { return ( bool ) ( bli_obj_domain( obj ) == BLIS_BITVAL_COMPLEX && !bli_obj_is_const( obj ) ); } -BLIS_INLINE num_t bli_obj_dt_proj_to_real( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt_proj_to_real( const obj_t* obj ) { return ( num_t ) ( bli_obj_dt( obj ) & ~BLIS_BITVAL_COMPLEX ); } -BLIS_INLINE num_t bli_obj_dt_proj_to_complex( obj_t* obj ) +BLIS_INLINE num_t bli_obj_dt_proj_to_complex( const obj_t* obj ) { return ( num_t ) ( 
bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } -BLIS_INLINE num_t bli_obj_target_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_target_dt( const obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); } -BLIS_INLINE dom_t bli_obj_target_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_target_domain( const obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); } -BLIS_INLINE prec_t bli_obj_target_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_target_prec( const obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); } -BLIS_INLINE num_t bli_obj_exec_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_exec_dt( const obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); } -BLIS_INLINE dom_t bli_obj_exec_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_exec_domain( const obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); } -BLIS_INLINE prec_t bli_obj_exec_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_exec_prec( const obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); } -BLIS_INLINE num_t bli_obj_comp_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_comp_dt( const obj_t* obj ) { return ( num_t ) ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); } -BLIS_INLINE dom_t bli_obj_comp_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_comp_domain( const obj_t* obj ) { return ( dom_t ) ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); } -BLIS_INLINE prec_t bli_obj_comp_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_comp_prec( const obj_t* obj ) { return ( prec_t ) ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); } // NOTE: This function queries info2. 
-BLIS_INLINE num_t bli_obj_scalar_dt( obj_t* obj ) +BLIS_INLINE num_t bli_obj_scalar_dt( const obj_t* obj ) { return ( num_t ) ( ( obj->info2 & BLIS_SCALAR_DT_BITS ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. -BLIS_INLINE dom_t bli_obj_scalar_domain( obj_t* obj ) +BLIS_INLINE dom_t bli_obj_scalar_domain( const obj_t* obj ) { return ( dom_t ) ( ( obj->info2 & BLIS_SCALAR_DOMAIN_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } // NOTE: This function queries info2. -BLIS_INLINE prec_t bli_obj_scalar_prec( obj_t* obj ) +BLIS_INLINE prec_t bli_obj_scalar_prec( const obj_t* obj ) { return ( prec_t ) ( ( obj->info2 & BLIS_SCALAR_PREC_BIT ) >> BLIS_SCALAR_DT_SHIFT ); } -BLIS_INLINE trans_t bli_obj_conjtrans_status( obj_t* obj ) +BLIS_INLINE trans_t bli_obj_conjtrans_status( const obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_CONJTRANS_BITS ); } -BLIS_INLINE trans_t bli_obj_onlytrans_status( obj_t* obj ) +BLIS_INLINE trans_t bli_obj_onlytrans_status( const obj_t* obj ) { return ( trans_t ) ( obj->info & BLIS_TRANS_BIT ); } -BLIS_INLINE bool bli_obj_has_trans( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_trans( const obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_TRANS ); } -BLIS_INLINE bool bli_obj_has_notrans( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_notrans( const obj_t* obj ) { return ( bool ) ( bli_obj_onlytrans_status( obj ) == BLIS_BITVAL_NO_TRANS ); } -BLIS_INLINE conj_t bli_obj_conj_status( obj_t* obj ) +BLIS_INLINE conj_t bli_obj_conj_status( const obj_t* obj ) { return ( conj_t ) ( obj->info & BLIS_CONJ_BIT ); } -BLIS_INLINE bool bli_obj_has_conj( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_conj( const obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_CONJ ); } -BLIS_INLINE bool bli_obj_has_noconj( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_noconj( const obj_t* obj ) { return ( bool ) ( bli_obj_conj_status( obj ) == BLIS_BITVAL_NO_CONJ ); } -BLIS_INLINE uplo_t bli_obj_uplo( obj_t* obj ) 
+BLIS_INLINE uplo_t bli_obj_uplo( const obj_t* obj ) { return ( uplo_t ) ( obj->info & BLIS_UPLO_BITS ); } -BLIS_INLINE bool bli_obj_is_upper( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_upper( const obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_UPPER ); } -BLIS_INLINE bool bli_obj_is_lower( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_lower( const obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_LOWER ); } -BLIS_INLINE bool bli_obj_is_upper_or_lower( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_upper_or_lower( const obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( obj ) || bli_obj_is_lower( obj ) ); } -BLIS_INLINE bool bli_obj_is_dense( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_dense( const obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_DENSE ); } -BLIS_INLINE bool bli_obj_is_zeros( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_zeros( const obj_t* obj ) { return ( bool ) ( bli_obj_uplo( obj ) == BLIS_BITVAL_ZEROS ); } -BLIS_INLINE diag_t bli_obj_diag( obj_t* obj ) +BLIS_INLINE diag_t bli_obj_diag( const obj_t* obj ) { return ( diag_t ) ( obj->info & BLIS_UNIT_DIAG_BIT ); } -BLIS_INLINE bool bli_obj_has_nonunit_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_nonunit_diag( const obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_NONUNIT_DIAG ); } -BLIS_INLINE bool bli_obj_has_unit_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_unit_diag( const obj_t* obj ) { return ( bool ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } -BLIS_INLINE bool bli_obj_has_inverted_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_has_inverted_diag( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); } -BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); } -BLIS_INLINE bool 
bli_obj_is_pack_rev_if_lower( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); } -BLIS_INLINE pack_t bli_obj_pack_schema( obj_t* obj ) +BLIS_INLINE pack_t bli_obj_pack_schema( const obj_t* obj ) { return ( pack_t ) ( obj->info & BLIS_PACK_SCHEMA_BITS ); } -BLIS_INLINE bool bli_obj_is_packed( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_packed( const obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_BIT ); } -BLIS_INLINE bool bli_obj_is_row_packed( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_row_packed( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_ROWS ) ); } -BLIS_INLINE bool bli_obj_is_col_packed( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_col_packed( const obj_t* obj ) { return ( bool ) ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ BLIS_BITVAL_PACKED_COLUMNS ) ); } -BLIS_INLINE bool bli_obj_is_panel_packed( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_panel_packed( const obj_t* obj ) { return ( bool ) ( obj->info & BLIS_PACK_PANEL_BIT ); } -BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( obj_t* obj ) +BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( const obj_t* obj ) { return ( packbuf_t ) ( obj->info & BLIS_PACK_BUFFER_BITS ); } -BLIS_INLINE struc_t bli_obj_struc( obj_t* obj ) +BLIS_INLINE struc_t bli_obj_struc( const obj_t* obj ) { return ( struc_t ) ( obj->info & BLIS_STRUC_BITS ); } -BLIS_INLINE bool bli_obj_is_general( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_general( const obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_GENERAL ); } -BLIS_INLINE bool bli_obj_is_hermitian( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_hermitian( const obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_HERMITIAN ); } -BLIS_INLINE bool bli_obj_is_symmetric( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_symmetric( const 
obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_SYMMETRIC ); } -BLIS_INLINE bool bli_obj_is_triangular( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_triangular( const obj_t* obj ) { return ( bool ) ( bli_obj_struc( obj ) == BLIS_BITVAL_TRIANGULAR ); @@ -599,49 +599,49 @@ BLIS_INLINE void bli_obj_toggle_uplo( obj_t* obj ) // Root matrix query -BLIS_INLINE obj_t* bli_obj_root( obj_t* obj ) +BLIS_INLINE obj_t* bli_obj_root( const obj_t* obj ) { return ( obj_t* )( obj->root ); } -BLIS_INLINE bool bli_obj_root_is_general( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_general( const obj_t* obj ) { return ( bool ) ( bli_obj_is_general( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_hermitian( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_hermitian( const obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_symmetric( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_symmetric( const obj_t* obj ) { return ( bool ) ( bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_triangular( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_triangular( const obj_t* obj ) { return ( bool ) ( bli_obj_is_triangular( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_herm_or_symm( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_herm_or_symm( const obj_t* obj ) { return ( bool ) ( bli_obj_is_hermitian( bli_obj_root( obj ) ) || bli_obj_is_symmetric( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_upper( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_upper( const obj_t* obj ) { return ( bool ) ( bli_obj_is_upper( bli_obj_root( obj ) ) ); } -BLIS_INLINE bool bli_obj_root_is_lower( obj_t* obj ) +BLIS_INLINE bool bli_obj_root_is_lower( const obj_t* obj ) { return ( bool ) ( bli_obj_is_lower( bli_obj_root( obj ) ) ); @@ -656,13 +656,13 @@ BLIS_INLINE void bli_obj_set_as_root( obj_t* obj ) // Diagonal offset query -BLIS_INLINE doff_t bli_obj_diag_offset( 
obj_t* obj ) +BLIS_INLINE doff_t bli_obj_diag_offset( const obj_t* obj ) { return ( doff_t ) ( obj->diag_off ); } -BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( obj_t* obj ) +BLIS_INLINE doff_t bli_obj_diag_offset_after_trans( const obj_t* obj ) { return ( doff_t ) ( bli_obj_has_trans( obj ) ? -bli_obj_diag_offset( obj ) @@ -688,46 +688,46 @@ BLIS_INLINE void bli_obj_inc_diag_offset( doff_t offset, obj_t* obj ) // Dimension query -BLIS_INLINE dim_t bli_obj_length( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_length( const obj_t* obj ) { return ( obj->dim[ BLIS_M ] ); } -BLIS_INLINE dim_t bli_obj_width( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_width( const obj_t* obj ) { return ( obj->dim[ BLIS_N ] ); } -BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, obj_t* obj ) +BLIS_INLINE dim_t bli_obj_dim( mdim_t mdim, const obj_t* obj ) { return ( obj->dim[ mdim ] ); } -BLIS_INLINE dim_t bli_obj_min_dim( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_min_dim( const obj_t* obj ) { return bli_min( bli_obj_length( obj ), bli_obj_width( obj ) ); } -BLIS_INLINE dim_t bli_obj_max_dim( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_max_dim( const obj_t* obj ) { return bli_max( bli_obj_length( obj ), bli_obj_width( obj ) ); } -BLIS_INLINE dim_t bli_obj_length_after_trans( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_length_after_trans( const obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width( obj ) : bli_obj_length( obj ) ); } -BLIS_INLINE dim_t bli_obj_width_after_trans( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_width_after_trans( const obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? 
bli_obj_length( obj ) : bli_obj_width( obj ) ); } -BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) +BLIS_INLINE bool bli_obj_is_1x1( const obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 && @@ -736,34 +736,34 @@ BLIS_INLINE bool bli_obj_is_1x1( obj_t* x ) // Stride/increment query -BLIS_INLINE inc_t bli_obj_row_stride( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_row_stride( const obj_t* obj ) { return ( obj->rs ); } -BLIS_INLINE inc_t bli_obj_col_stride( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_col_stride( const obj_t* obj ) { return ( obj->cs ); } -BLIS_INLINE inc_t bli_obj_imag_stride( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_imag_stride( const obj_t* obj ) { return ( obj->is ); } -BLIS_INLINE inc_t bli_obj_row_stride_mag( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_row_stride_mag( const obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->rs ) ); } -BLIS_INLINE inc_t bli_obj_col_stride_mag( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_col_stride_mag( const obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->cs ) ); } -BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_imag_stride_mag( const obj_t* obj ) { return ( inc_t ) ( bli_abs( obj->is ) ); @@ -773,7 +773,7 @@ BLIS_INLINE inc_t bli_obj_imag_stride_mag( obj_t* obj ) // of the smallest submatrices of an object that could still encompass // the stored data above (if obj is upper) or below (if obj is lower) // the diagonal. 
-BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_length_stored( const obj_t* obj ) { return ( dim_t ) ( bli_obj_is_upper( obj ) @@ -784,7 +784,7 @@ BLIS_INLINE dim_t bli_obj_length_stored( obj_t* obj ) ); } -BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_width_stored( const obj_t* obj ) { return ( dim_t ) ( bli_obj_is_lower( obj ) @@ -795,25 +795,25 @@ BLIS_INLINE dim_t bli_obj_width_stored( obj_t* obj ) ); } -BLIS_INLINE dim_t bli_obj_length_stored_after_trans( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_length_stored_after_trans( const obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_width_stored( obj ) : bli_obj_length_stored( obj ) ); } -BLIS_INLINE dim_t bli_obj_width_stored_after_trans( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_width_stored_after_trans( const obj_t* obj ) { return ( bli_obj_has_trans( obj ) ? bli_obj_length_stored( obj ) : bli_obj_width_stored( obj ) ); } -BLIS_INLINE dim_t bli_obj_vector_dim( obj_t* x ) +BLIS_INLINE dim_t bli_obj_vector_dim( const obj_t* x ) { return ( bli_obj_length( x ) == 1 ? bli_obj_width( x ) : bli_obj_length( x ) ); } -BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) +BLIS_INLINE inc_t bli_obj_vector_inc( const obj_t* x ) { return ( bli_obj_is_1x1( x ) ? 1 : ( bli_obj_length( x ) == 1 ? 
bli_obj_col_stride( x ) @@ -821,26 +821,26 @@ BLIS_INLINE inc_t bli_obj_vector_inc( obj_t* x ) ); } -BLIS_INLINE bool bli_obj_is_vector( obj_t* x ) +BLIS_INLINE bool bli_obj_is_vector( const obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 || bli_obj_width( x ) == 1 ); } -BLIS_INLINE bool bli_obj_is_row_vector( obj_t* x ) +BLIS_INLINE bool bli_obj_is_row_vector( const obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 1 ); } -BLIS_INLINE bool bli_obj_is_col_vector( obj_t* x ) +BLIS_INLINE bool bli_obj_is_col_vector( const obj_t* x ) { return ( bool ) ( bli_obj_width( x ) == 1 ); } -BLIS_INLINE bool bli_obj_has_zero_dim( obj_t* x ) +BLIS_INLINE bool bli_obj_has_zero_dim( const obj_t* x ) { return ( bool ) ( bli_obj_length( x ) == 0 || @@ -894,32 +894,32 @@ BLIS_INLINE void bli_obj_set_dims_with_trans( trans_t trans, dim_t m, dim_t n, o // "obj" macros are used on packed matrices. // -BLIS_INLINE bool bli_obj_is_row_stored( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_row_stored( const obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) == 1 ); } -BLIS_INLINE bool bli_obj_is_col_stored( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_col_stored( const obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) == 1 ); } -BLIS_INLINE bool bli_obj_is_gen_stored( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_gen_stored( const obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) != 1 && bli_obj_col_stride_mag( obj ) != 1 ); } -BLIS_INLINE bool bli_obj_is_row_tilted( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_row_tilted( const obj_t* obj ) { return ( bool ) ( bli_obj_col_stride_mag( obj ) < bli_obj_row_stride_mag( obj ) ); } -BLIS_INLINE bool bli_obj_is_col_tilted( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_col_tilted( const obj_t* obj ) { return ( bool ) ( bli_obj_row_stride_mag( obj ) < bli_obj_col_stride_mag( obj ) ); @@ -950,17 +950,17 @@ BLIS_INLINE void bli_obj_set_imag_stride( inc_t is, obj_t* obj ) // Offset query -BLIS_INLINE dim_t 
bli_obj_row_off( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_row_off( const obj_t* obj ) { return ( obj->off[ BLIS_M ] ); } -BLIS_INLINE dim_t bli_obj_col_off( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_col_off( const obj_t* obj ) { return ( obj->off[ BLIS_N ] ); } -BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, obj_t* obj ) +BLIS_INLINE dim_t bli_obj_off( mdim_t mdim, const obj_t* obj ) { return ( obj->off[ mdim ] ); } @@ -991,33 +991,33 @@ BLIS_INLINE void bli_obj_inc_offs( dim_t offm, dim_t offn, obj_t* obj ) // Diagonal offset predicates -BLIS_INLINE bool bli_obj_is_strictly_above_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_strictly_above_diag( const obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_length( obj ) <= -bli_obj_diag_offset( obj ) ); } -BLIS_INLINE bool bli_obj_is_strictly_below_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_strictly_below_diag( const obj_t* obj ) { return ( bool ) ( ( doff_t )bli_obj_width( obj ) <= bli_obj_diag_offset( obj ) ); } -BLIS_INLINE bool bli_obj_is_outside_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_outside_diag( const obj_t* obj ) { return ( bool ) ( bli_obj_is_strictly_above_diag( obj ) || bli_obj_is_strictly_below_diag( obj ) ); } -BLIS_INLINE bool bli_obj_intersects_diag( obj_t* obj ) +BLIS_INLINE bool bli_obj_intersects_diag( const obj_t* obj ) { return ( bool ) ( !bli_obj_is_strictly_above_diag( obj ) && !bli_obj_is_strictly_below_diag( obj ) ); } -BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) +BLIS_INLINE bool bli_obj_is_unstored_subpart( const obj_t* obj ) { return ( bool ) ( ( bli_obj_root_is_lower( obj ) && bli_obj_is_strictly_above_diag( obj ) ) || @@ -1026,7 +1026,7 @@ BLIS_INLINE bool bli_obj_is_unstored_subpart( obj_t* obj ) // Buffer address query -BLIS_INLINE void* bli_obj_buffer( obj_t* obj ) +BLIS_INLINE void* bli_obj_buffer( const obj_t* obj ) { return ( void* ) ( obj->buffer ); @@ -1041,7 +1041,7 @@ BLIS_INLINE void bli_obj_set_buffer( void* p, obj_t* obj ) // Bufferless scalar field 
query -BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) +BLIS_INLINE void* bli_obj_internal_scalar_buffer( const obj_t* obj ) { return ( void* ) ( &( obj->scalar ) ); @@ -1049,14 +1049,14 @@ BLIS_INLINE void* bli_obj_internal_scalar_buffer( obj_t* obj ) // Bufferless scalar field modification -BLIS_INLINE void bli_obj_copy_internal_scalar( obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_copy_internal_scalar( const obj_t* a, obj_t* b ) { b->scalar = a->scalar; } // Element size query -BLIS_INLINE siz_t bli_obj_elem_size( obj_t* obj ) +BLIS_INLINE siz_t bli_obj_elem_size( const obj_t* obj ) { return ( siz_t ) ( obj->elem_size ); @@ -1071,12 +1071,12 @@ BLIS_INLINE void bli_obj_set_elem_size( siz_t size, obj_t* obj ) // Packed matrix info query -BLIS_INLINE dim_t bli_obj_padded_length( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_padded_length( const obj_t* obj ) { return ( obj->m_padded ); } -BLIS_INLINE dim_t bli_obj_padded_width( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_padded_width( const obj_t* obj ) { return ( obj->n_padded ); } @@ -1101,22 +1101,22 @@ BLIS_INLINE void bli_obj_set_padded_dims( dim_t m, dim_t n, obj_t* obj ) // Packed panel info query -BLIS_INLINE dim_t bli_obj_panel_length( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_panel_length( const obj_t* obj ) { return ( obj->m_panel ); } -BLIS_INLINE dim_t bli_obj_panel_width( obj_t* obj ) +BLIS_INLINE dim_t bli_obj_panel_width( const obj_t* obj ) { return ( obj->n_panel ); } -BLIS_INLINE inc_t bli_obj_panel_dim( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_panel_dim( const obj_t* obj ) { return ( obj->pd ); } -BLIS_INLINE inc_t bli_obj_panel_stride( obj_t* obj ) +BLIS_INLINE inc_t bli_obj_panel_stride( const obj_t* obj ) { return ( obj->ps ); } @@ -1151,7 +1151,7 @@ BLIS_INLINE void bli_obj_set_panel_stride( inc_t ps, obj_t* obj ) // stor3_t-related -BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) +BLIS_INLINE stor3_t bli_obj_stor3_from_strides( const obj_t* c, const obj_t* 
a, const obj_t* b ) { const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); @@ -1191,22 +1191,22 @@ BLIS_INLINE stor3_t bli_obj_stor3_from_strides( obj_t* c, obj_t* a, obj_t* b ) // Function pointer query -BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( obj_t* obj ) +BLIS_INLINE obj_pack_fn_t bli_obj_pack_fn( const obj_t* obj ) { return obj->pack_fn; } -BLIS_INLINE void* bli_obj_pack_params( obj_t* obj ) +BLIS_INLINE void* bli_obj_pack_params( const obj_t* obj ) { return obj->pack_params; } -BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( obj_t* obj ) +BLIS_INLINE obj_ker_fn_t bli_obj_ker_fn( const obj_t* obj ) { return obj->ker_fn; } -BLIS_INLINE void* bli_obj_ker_params( obj_t* obj ) +BLIS_INLINE void* bli_obj_ker_params( const obj_t* obj ) { return obj->ker_params; } @@ -1315,7 +1315,7 @@ BLIS_INLINE void bli_obj_set_defaults( obj_t* obj ) // Acquire buffer at object's submatrix offset (offset-aware buffer query). -BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) +BLIS_INLINE void* bli_obj_buffer_at_off( const obj_t* obj ) { return ( void* ) ( @@ -1330,9 +1330,9 @@ BLIS_INLINE void* bli_obj_buffer_at_off( obj_t* obj ) // Acquire buffer from BLIS_CONSTANT object. -BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) +BLIS_INLINE const void* bli_obj_buffer_for_const( num_t dt, const obj_t* obj ) { - void* p; + void* p; if ( dt == BLIS_FLOAT ) p = &((( constdata_t* )bli_obj_buffer( obj ))->s); else if ( dt == BLIS_DOUBLE ) p = &((( constdata_t* )bli_obj_buffer( obj ))->d); @@ -1345,7 +1345,7 @@ BLIS_INLINE void* bli_obj_buffer_for_const( num_t dt, obj_t* obj ) // Acquire buffer from scalar (1x1) object, including BLIS_CONSTANT objects. -BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, obj_t* obj ) +BLIS_INLINE void* bli_obj_buffer_for_1x1( num_t dt, const obj_t* obj ) { return ( void* ) ( bli_obj_is_const( obj ) ? 
bli_obj_buffer_for_const( dt, obj ) @@ -1367,14 +1367,14 @@ BLIS_INLINE void bli_obj_reset_origin( obj_t* obj ) // Make a full alias (shallow copy). -BLIS_INLINE void bli_obj_alias_to( obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_alias_to( const obj_t* a, obj_t* b ) { bli_obj_init_full_shallow_copy_of( a, b ); } // Check if two objects are aliases of one another. -BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) +BLIS_INLINE bool bli_obj_is_alias_of( const obj_t* a, const obj_t* b ) { return ( bool ) ( bli_obj_buffer( a ) == bli_obj_buffer( b ) ); @@ -1384,7 +1384,7 @@ BLIS_INLINE bool bli_obj_is_alias_of( obj_t* a, obj_t* b ) // Create an alias with a trans value applied. // (Note: trans may include a conj component.) -BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, const obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_trans( trans, b ); @@ -1392,7 +1392,7 @@ BLIS_INLINE void bli_obj_alias_with_trans( trans_t trans, obj_t* a, obj_t* b ) // Create an alias with a conj value applied. -BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, const obj_t* a, obj_t* b ) { bli_obj_alias_to( a, b ); bli_obj_apply_conj( conja, b ); @@ -1400,7 +1400,7 @@ BLIS_INLINE void bli_obj_alias_with_conj( conj_t conja, obj_t* a, obj_t* b ) // Alias only the real part. -BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) +BLIS_INLINE void bli_obj_real_part( const obj_t* c, obj_t* r ) { bli_obj_alias_to( c, r ); @@ -1433,7 +1433,7 @@ BLIS_INLINE void bli_obj_real_part( obj_t* c, obj_t* r ) // Alias only the imaginary part. 
-BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) +BLIS_INLINE void bli_obj_imag_part( const obj_t* c, obj_t* i ) { if ( bli_obj_is_complex( c ) ) { @@ -1472,7 +1472,7 @@ BLIS_INLINE void bli_obj_imag_part( obj_t* c, obj_t* i ) // chosen buffer (possibly using an auxiliary datatype if the object is // BLIS_CONSTANT). -BLIS_INLINE void bli_obj_scalar_set_dt_buffer( obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) +BLIS_INLINE void bli_obj_scalar_set_dt_buffer( const obj_t* obj, num_t dt_aux, num_t* dt, void** buf ) { if ( bli_obj_is_const( obj ) ) { diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index 4e64f37116..eb99875c5b 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1162,23 +1162,23 @@ struct thrinfo_s; typedef void (*obj_pack_fn_t) ( - struct obj_s* a, + const struct obj_s* a, struct obj_s* ap, - struct cntx_s* cntx, + const struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, - struct thrinfo_s* thread + const struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) ( - struct obj_s* a, - struct obj_s* b, - struct obj_s* c, - struct cntx_s* cntx, + const struct obj_s* a, + const struct obj_s* b, + const struct obj_s* c, + const struct cntx_s* cntx, struct rntm_s* rntm, struct cntl_s* cntl, - struct thrinfo_s* thread + const struct thrinfo_s* thread ); typedef struct obj_s @@ -1297,7 +1297,7 @@ typedef struct obj_s // Define these macros here since they must be updated if contents of // obj_t changes. 
-BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_init_full_shallow_copy_of( const obj_t* a, obj_t* b ) { b->root = a->root; @@ -1332,7 +1332,7 @@ BLIS_INLINE void bli_obj_init_full_shallow_copy_of( obj_t* a, obj_t* b ) b->ker_params = a->ker_params; } -BLIS_INLINE void bli_obj_init_subpart_from( obj_t* a, obj_t* b ) +BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b ) { b->root = a->root; diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 6dc4f9141c..8368ea2beb 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -35,9 +35,9 @@ #include "blis.h" -thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; -thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; -thrcomm_t BLIS_SINGLE_COMM = {}; +const thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; +const thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; +const thrcomm_t BLIS_SINGLE_COMM = {}; // The global rntm_t structure. (The definition resides in bli_rntm.c.) extern rntm_t global_rntm; @@ -50,9 +50,9 @@ extern bli_pthread_mutex_t global_rntm_mutex; void bli_thread_init( void ) { - bli_thrcomm_init( 1, &BLIS_SINGLE_COMM ); - bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); - bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); + bli_thrcomm_init( 1, ( thrcomm_t* )&BLIS_SINGLE_COMM ); + bli_packm_thrinfo_init_single( ( thrinfo_t* )&BLIS_PACKM_SINGLE_THREADED ); + bli_l3_thrinfo_init_single( (thrinfo_t* )&BLIS_GEMM_SINGLE_THREADED ); // Read the environment variables and use them to initialize the // global runtime object. 
@@ -67,7 +67,7 @@ void bli_thread_finalize( void ) void bli_thread_range_sub ( - thrinfo_t* thread, + const thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, @@ -211,9 +211,9 @@ void bli_thread_range_sub siz_t bli_thread_range_l2r ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, dim_t* start, dim_t* end ) @@ -231,9 +231,9 @@ siz_t bli_thread_range_l2r siz_t bli_thread_range_r2l ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, dim_t* start, dim_t* end ) @@ -251,9 +251,9 @@ siz_t bli_thread_range_r2l siz_t bli_thread_range_t2b ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, dim_t* start, dim_t* end ) @@ -271,9 +271,9 @@ siz_t bli_thread_range_t2b siz_t bli_thread_range_b2t ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, dim_t* start, dim_t* end ) @@ -504,15 +504,15 @@ siz_t bli_find_area_trap_l siz_t bli_thread_range_weighted_sub ( - thrinfo_t* restrict thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* restrict j_start_thr, - dim_t* restrict j_end_thr + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr ) { dim_t n_way = bli_thread_n_way( thread ); @@ -642,12 +642,12 @@ siz_t bli_thread_range_weighted_sub siz_t bli_thread_range_mdim ( dir_t direct, - thrinfo_t* thr, - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl, - cntx_t* cntx, + const thrinfo_t* thr, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl, + const cntx_t* cntx, dim_t* start, dim_t* end ) @@ -665,8 +665,8 @@ siz_t bli_thread_range_mdim else bszid = BLIS_NR; } - blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); - obj_t* x; + const blksz_t* 
bmult = bli_cntx_get_bmult( bszid, cntx ); + const obj_t* x; bool use_weighted; // Use the operation family to choose the one of the two matrices @@ -701,12 +701,12 @@ siz_t bli_thread_range_mdim siz_t bli_thread_range_ndim ( dir_t direct, - thrinfo_t* thr, - obj_t* a, - obj_t* b, - obj_t* c, - cntl_t* cntl, - cntx_t* cntx, + const thrinfo_t* thr, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntl_t* cntl, + const cntx_t* cntx, dim_t* start, dim_t* end ) @@ -724,8 +724,8 @@ siz_t bli_thread_range_ndim else bszid = BLIS_NR; } - blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); - obj_t* x; + const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + const obj_t* x; bool use_weighted; // Use the operation family to choose the one of the two matrices @@ -759,9 +759,9 @@ siz_t bli_thread_range_ndim siz_t bli_thread_range_weighted_l2r ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, dim_t* start, dim_t* end ) @@ -809,9 +809,9 @@ siz_t bli_thread_range_weighted_l2r siz_t bli_thread_range_weighted_r2l ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, dim_t* start, dim_t* end ) @@ -861,9 +861,9 @@ siz_t bli_thread_range_weighted_r2l siz_t bli_thread_range_weighted_t2b ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, dim_t* start, dim_t* end ) @@ -913,9 +913,9 @@ siz_t bli_thread_range_weighted_t2b siz_t bli_thread_range_weighted_b2t ( - thrinfo_t* thr, - obj_t* a, - blksz_t* bmult, + const thrinfo_t* thr, + const obj_t* a, + const blksz_t* bmult, dim_t* start, dim_t* end ) diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index d4880c4c85..60088ff832 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -66,7 +66,7 @@ void bli_thread_finalize( void ); BLIS_EXPORT_BLIS void bli_thread_range_sub ( - thrinfo_t* thread, + 
const thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, @@ -80,12 +80,12 @@ void bli_thread_range_sub siz_t PASTEMAC0( opname ) \ ( \ dir_t direct, \ - thrinfo_t* thr, \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntl_t* cntl, \ - cntx_t* cntx, \ + const thrinfo_t* thr, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntl_t* cntl, \ + const cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); @@ -98,9 +98,9 @@ GENPROT( thread_range_ndim ) \ siz_t PASTEMAC0( opname ) \ ( \ - thrinfo_t* thr, \ - obj_t* a, \ - blksz_t* bmult, \ + const thrinfo_t* thr, \ + const obj_t* a, \ + const blksz_t* bmult, \ dim_t* start, \ dim_t* end \ ); @@ -136,7 +136,7 @@ siz_t bli_find_area_trap_l ); siz_t bli_thread_range_weighted_sub ( - thrinfo_t* restrict thread, + const thrinfo_t* restrict thread, doff_t diagoff, uplo_t uplo, dim_t m, @@ -157,9 +157,9 @@ typedef struct dim_t f; } bli_prime_factors_t; -void bli_prime_factorization(dim_t n, bli_prime_factors_t* factors); +void bli_prime_factorization( dim_t n, bli_prime_factors_t* factors ); -dim_t bli_next_prime_factor(bli_prime_factors_t* factors); +dim_t bli_next_prime_factor( bli_prime_factors_t* factors ); bool bli_is_prime( dim_t n ); void bli_thread_partition_2x2 @@ -211,7 +211,7 @@ void bli_thread_init_rntm_from_env( rntm_t* rntm ); BLIS_INLINE void bli_thread_range_jrir_rr ( - thrinfo_t* thread, + const thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, @@ -228,7 +228,7 @@ BLIS_INLINE void bli_thread_range_jrir_rr BLIS_INLINE void bli_thread_range_jrir_sl ( - thrinfo_t* thread, + const thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, @@ -244,7 +244,7 @@ BLIS_INLINE void bli_thread_range_jrir_sl BLIS_INLINE void bli_thread_range_jrir ( - thrinfo_t* thread, + const thrinfo_t* thread, dim_t n, dim_t bf, bool handle_edge_low, diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index f9cd5ce74b..4dd447eec4 100644 --- a/frame/thread/bli_thrinfo.c +++ 
b/frame/thread/bli_thrinfo.c @@ -41,7 +41,7 @@ thrinfo_t* bli_thrinfo_create thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, - dim_t work_id, + dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node @@ -57,7 +57,7 @@ thrinfo_t* bli_thrinfo_create ( thread, ocomm, ocomm_id, - n_way, work_id, + n_way, work_id, free_comm, bszid, sub_node @@ -72,7 +72,7 @@ void bli_thrinfo_init thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, - dim_t work_id, + dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node @@ -97,7 +97,7 @@ void bli_thrinfo_init_single bli_thrinfo_init ( thread, - &BLIS_SINGLE_COMM, 0, + ( thrcomm_t* )&BLIS_SINGLE_COMM, 0, 1, 0, FALSE, diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 8e5a6da3b7..6b98096849 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -75,54 +75,54 @@ typedef struct thrinfo_s thrinfo_t; // thrinfo_t query (field only) -BLIS_INLINE dim_t bli_thread_num_threads( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_num_threads( const thrinfo_t* t ) { return (t->ocomm)->n_threads; } -BLIS_INLINE dim_t bli_thread_ocomm_id( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_ocomm_id( const thrinfo_t* t ) { return t->ocomm_id; } -BLIS_INLINE dim_t bli_thread_n_way( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_n_way( const thrinfo_t* t ) { return t->n_way; } -BLIS_INLINE dim_t bli_thread_work_id( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_work_id( const thrinfo_t* t ) { return t->work_id; } -BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( thrinfo_t* t ) +BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( const thrinfo_t* t ) { return t->ocomm; } -BLIS_INLINE bool bli_thrinfo_needs_free_comm( thrinfo_t* t ) +BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t ) { return t->free_comm; } -BLIS_INLINE dim_t bli_thread_bszid( thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_bszid( const thrinfo_t* t ) { return t->bszid; } -BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( thrinfo_t* t ) +BLIS_INLINE 
thrinfo_t* bli_thrinfo_sub_node( const thrinfo_t* t ) { return t->sub_node; } -BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( thrinfo_t* t ) +BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( const thrinfo_t* t ) { return t->sub_prenode; } // thrinfo_t query (complex) -BLIS_INLINE bool bli_thread_am_ochief( thrinfo_t* t ) +BLIS_INLINE bool bli_thread_am_ochief( const thrinfo_t* t ) { return t->ocomm_id == 0; } @@ -171,12 +171,12 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* // other thrinfo_t-related functions -BLIS_INLINE void* bli_thread_broadcast( thrinfo_t* t, void* p ) +BLIS_INLINE void* bli_thread_broadcast( const thrinfo_t* t, void* p ) { return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); } -BLIS_INLINE void bli_thread_barrier( thrinfo_t* t ) +BLIS_INLINE void bli_thread_barrier( const thrinfo_t* t ) { bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); } @@ -192,7 +192,7 @@ thrinfo_t* bli_thrinfo_create thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, - dim_t work_id, + dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node @@ -204,7 +204,7 @@ void bli_thrinfo_init thrcomm_t* ocomm, dim_t ocomm_id, dim_t n_way, - dim_t work_id, + dim_t work_id, bool free_comm, bszid_t bszid, thrinfo_t* sub_node diff --git a/frame/util/bli_util_check.c b/frame/util/bli_util_check.c index 3693ea39c1..070a21f913 100644 --- a/frame/util/bli_util_check.c +++ b/frame/util/bli_util_check.c @@ -43,8 +43,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* asum \ + const obj_t* x, \ + const obj_t* asum \ ) \ { \ bli_utilv_xa_check( x, asum ); \ @@ -58,7 +58,7 @@ GENFRONT( asumv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ) \ { \ bli_utilm_mkhst_check( x ); \ @@ -74,8 +74,8 @@ GENFRONT( mktrim ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ ) \ { \ bli_utilv_norm_check( x, norm ); \ @@ -91,8 +91,8 @@ GENFRONT( normiv ) \ void 
PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ ) \ { \ bli_utilm_norm_check( x, norm ); \ @@ -108,7 +108,7 @@ GENFRONT( normim ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ) \ { \ bli_utilm_rand_check( x ); \ @@ -125,9 +125,9 @@ GENFRONT( randnm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ + const obj_t* x, \ + const obj_t* scale, \ + const obj_t* sumsq \ ) \ { \ bli_utilv_sumsqv_check( x, scale, sumsq ); \ @@ -142,9 +142,9 @@ GENFRONT( sumsqv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* psi, \ - bool* is_eq \ + const obj_t* chi, \ + const obj_t* psi, \ + const bool* is_eq \ ) \ { \ bli_l0_xxbsc_check( chi, psi, is_eq ); \ @@ -158,9 +158,9 @@ GENFRONT( eqsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + const bool* is_eq \ ) \ { \ bli_l1v_xy_check( x, y ); \ @@ -174,9 +174,9 @@ GENFRONT( eqv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + const bool* is_eq \ ) \ { \ bli_l1m_xy_check( x, y ); \ @@ -190,11 +190,11 @@ GENFRONT( eqm ) \ void PASTEMAC(opname,_check) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + const FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_utilm_fprint_check( file, s1, x, format, s2 ); \ @@ -207,8 +207,8 @@ GENFRONT( fprintm ) void bli_utilv_xa_check ( - obj_t* x, - obj_t* asum + const obj_t* x, + const obj_t* asum ) { err_t e_val; @@ -240,7 +240,7 @@ void bli_utilv_xa_check void bli_utilm_mkhst_check ( - obj_t* a + const obj_t* a ) { err_t e_val; @@ -277,8 +277,8 @@ void bli_utilm_mkhst_check void bli_utilv_norm_check ( - obj_t* x, - obj_t* norm + const obj_t* x, + const obj_t* norm ) { err_t e_val; @@ -317,8 +317,8 @@ void bli_utilv_norm_check void 
bli_utilm_norm_check ( - obj_t* x, - obj_t* norm + const obj_t* x, + const obj_t* norm ) { err_t e_val; @@ -356,35 +356,35 @@ void bli_utilm_norm_check void bli_utilm_fprint_check ( - FILE* file, - char* s1, - obj_t* x, - char* format, - char* s2 + const FILE* file, + const char* s1, + const obj_t* x, + const char* format, + const char* s2 ) { err_t e_val; // Check argument pointers. - + e_val = bli_check_null_pointer( file ); bli_check_error_code( e_val ); e_val = bli_check_null_pointer( s1 ); bli_check_error_code( e_val ); - e_val = bli_check_null_pointer( s2 ); + e_val = bli_check_null_pointer( s2 ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). - e_val = bli_check_object_buffer( x ); + e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); } void bli_utilm_rand_check ( - obj_t* x + const obj_t* x ) { err_t e_val; @@ -405,9 +405,9 @@ void bli_utilm_rand_check void bli_utilv_sumsqv_check ( - obj_t* x, - obj_t* scale, - obj_t* sumsq + const obj_t* x, + const obj_t* scale, + const obj_t* sumsq ) { err_t e_val; @@ -430,15 +430,15 @@ void bli_utilv_sumsqv_check e_val = bli_check_scalar_object( scale ); bli_check_error_code( e_val ); - + e_val = bli_check_scalar_object( sumsq ); bli_check_error_code( e_val ); // Check object buffers (for non-NULLness). 
- + e_val = bli_check_object_buffer( x ); bli_check_error_code( e_val ); - + e_val = bli_check_object_buffer( scale ); bli_check_error_code( e_val ); diff --git a/frame/util/bli_util_check.h b/frame/util/bli_util_check.h index 866a2cd895..41d3b17420 100644 --- a/frame/util/bli_util_check.h +++ b/frame/util/bli_util_check.h @@ -42,8 +42,8 @@ \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* asum \ + const obj_t* x, \ + const obj_t* asum \ ); GENPROT( asumv ) @@ -54,7 +54,7 @@ GENPROT( asumv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ); GENPROT( mkherm ) @@ -67,8 +67,8 @@ GENPROT( mktrim ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ ); GENPROT( norm1v ) @@ -81,8 +81,8 @@ GENPROT( normiv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ ); GENPROT( norm1m ) @@ -95,7 +95,7 @@ GENPROT( normim ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x \ + const obj_t* x \ ); GENPROT( randv ) @@ -109,9 +109,9 @@ GENPROT( randnm ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ + const obj_t* x, \ + const obj_t* scale, \ + const obj_t* sumsq \ ); GENPROT( sumsqv ) @@ -123,9 +123,9 @@ GENPROT( sumsqv ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* chi, \ - obj_t* psi, \ - bool* is_eq \ + const obj_t* chi, \ + const obj_t* psi, \ + const bool* is_eq \ ); GENTPROT( eqsc ) @@ -136,9 +136,9 @@ GENTPROT( eqsc ) \ void PASTEMAC(opname,_check) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + const bool* is_eq \ ); GENPROT( eqv ) @@ -150,11 +150,11 @@ GENPROT( eqm ) \ void PASTEMAC(opname,_check) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + const FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ); GENPROT( fprintv ) @@ -164,51 +164,51 @@ GENPROT( fprintm ) void 
bli_utilv_xi_check ( - obj_t* x, - obj_t* index + const obj_t* x, + const obj_t* index ); void bli_utilv_xa_check ( - obj_t* x, - obj_t* asum + const obj_t* x, + const obj_t* asum ); void bli_utilm_mkhst_check ( - obj_t* a + const obj_t* a ); void bli_utilv_norm_check ( - obj_t* x, - obj_t* norm + const obj_t* x, + const obj_t* norm ); void bli_utilm_norm_check ( - obj_t* x, - obj_t* norm + const obj_t* x, + const obj_t* norm ); void bli_utilm_fprint_check ( - FILE* file, - char* s1, - obj_t* x, - char* format, - char* s2 + const FILE* file, + const char* s1, + const obj_t* x, + const char* format, + const char* s2 ); void bli_utilm_rand_check ( - obj_t* x + const obj_t* x ); void bli_utilv_sumsqv_check ( - obj_t* x, - obj_t* scale, - obj_t* sumsq + const obj_t* x, + const obj_t* scale, + const obj_t* sumsq ); diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h index 673f4782aa..703b0bfe5b 100644 --- a/frame/util/bli_util_ft.h +++ b/frame/util/bli_util_ft.h @@ -44,9 +44,9 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); @@ -76,9 +76,9 @@ INSERT_GENTDEF( mktrim ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -93,13 +93,13 @@ INSERT_GENTDEFR( normiv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -114,12 +114,12 @@ INSERT_GENTDEFR( normim ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - FILE* file, \ - char* s1, \ - 
dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTDEF( fprintv ) @@ -131,13 +131,13 @@ INSERT_GENTDEF( fprintv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTDEF( fprintm ) @@ -182,10 +182,10 @@ INSERT_GENTDEF( randnm ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); @@ -204,10 +204,10 @@ INSERT_GENTDEFR( sumsqv ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi, \ - bool* is_eq \ + conj_t conjchi, \ + const ctype* chi, \ + const ctype* psi, \ + bool* is_eq \ ); INSERT_GENTDEF( eqsc ) @@ -219,11 +219,11 @@ INSERT_GENTDEF( eqsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - bool* is_eq \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + bool* is_eq \ ); INSERT_GENTDEF( eqv ) @@ -235,15 +235,15 @@ INSERT_GENTDEF( eqv ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + 
bool* is_eq \ ); INSERT_GENTDEF( eqm ) diff --git a/frame/util/bli_util_oapi.c b/frame/util/bli_util_oapi.c index afd221a587..6ccc7ed2e2 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* asum \ + const obj_t* x, \ + const obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -88,7 +88,7 @@ GENFRONT( asumv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* a \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -132,8 +132,8 @@ GENFRONT( mktrim ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -176,8 +176,8 @@ GENFRONT( normiv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -229,7 +229,7 @@ GENFRONT( normim ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -269,7 +269,7 @@ GENFRONT( randnv ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -316,9 +316,9 @@ GENFRONT( randnm ) \ void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* scale, \ - obj_t* sumsq \ + const obj_t* x, \ + const obj_t* scale, \ + const obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ) \ { \ @@ -366,9 +366,9 @@ GENFRONT( sumsqv ) \ void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi, \ - bool* is_eq \ + const obj_t* chi, \ + const obj_t* psi, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -427,9 +427,9 @@ GENFRONT( eqsc ) \ void PASTEMAC0(opname) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -474,9 +474,9 @@ GENFRONT( eqv ) \ void PASTEMAC0(opname) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -531,11 +531,11 @@ GENFRONT( eqm ) \ void 
PASTEMAC0(opname) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ @@ -579,11 +579,11 @@ GENFRONT( fprintv ) \ void PASTEMAC0(opname) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ @@ -645,10 +645,10 @@ GENFRONT( fprintm ) \ void PASTEMAC0(opname) \ ( \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index 92ce6c95f7..2f21c09b99 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* asum \ + const obj_t* x, \ + const obj_t* asum \ BLIS_OAPI_EX_PARAMS \ ); @@ -55,7 +55,7 @@ GENPROT( asumv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* a \ + const obj_t* a \ BLIS_OAPI_EX_PARAMS \ ); @@ -69,8 +69,8 @@ GENPROT( mktrim ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); @@ -84,8 +84,8 @@ GENPROT( normiv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x, \ - obj_t* norm \ + const obj_t* x, \ + const obj_t* norm \ BLIS_OAPI_EX_PARAMS \ ); @@ -99,7 +99,7 @@ GENPROT( normim ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); @@ -112,7 +112,7 @@ GENPROT( randnv ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* x \ + const obj_t* x \ BLIS_OAPI_EX_PARAMS \ ); @@ -125,9 +125,9 @@ GENPROT( randnm ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,EX_SUF) \ ( \ - obj_t* 
x, \ - obj_t* scale, \ - obj_t* sumsq \ + const obj_t* x, \ + const obj_t* scale, \ + const obj_t* sumsq \ BLIS_OAPI_EX_PARAMS \ ); @@ -145,9 +145,9 @@ GENPROT( sumsqv ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* chi, \ - obj_t* psi, \ - bool* is_eq \ + const obj_t* chi, \ + const obj_t* psi, \ + bool* is_eq \ ); GENPROT( eqsc ) @@ -158,9 +158,9 @@ GENPROT( eqsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + bool* is_eq \ ); GENPROT( eqv ) @@ -172,9 +172,9 @@ GENPROT( eqv ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - obj_t* x, \ - obj_t* y, \ - bool* is_eq \ + const obj_t* x, \ + const obj_t* y, \ + bool* is_eq \ ); GENPROT( eqsc ) @@ -187,11 +187,11 @@ GENPROT( eqm ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - FILE* file, \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ); GENPROT( fprintv ) @@ -203,10 +203,10 @@ GENPROT( fprintm ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - char* s1, \ - obj_t* x, \ - char* format, \ - char* s2 \ + const char* s1, \ + const obj_t* x, \ + const char* format, \ + const char* s2 \ ); GENPROT( printv ) diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index ca0b3c279d..8862f4ff6d 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -45,9 +45,9 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -124,9 +124,9 @@ INSERT_GENTFUNC_BASIC0( mktrim ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -167,13 +167,13 @@ INSERT_GENTFUNCR_BASIC0( normiv ) \ void PASTEMAC2(ch,opname,EX_SUF) \ 
( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -340,10 +340,10 @@ INSERT_GENTFUNCR_BASIC0( randnm ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -383,10 +383,10 @@ INSERT_GENTFUNCR_BASIC0( sumsqv ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi, \ - bool* is_eq \ + conj_t conjchi, \ + const ctype* chi, \ + const ctype* psi, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -406,11 +406,11 @@ INSERT_GENTFUNC_BASIC0( eqsc ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - bool* is_eq \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -438,15 +438,15 @@ INSERT_GENTFUNC_BASIC0( eqv ) \ void PASTEMAC(ch,opname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -480,11 +480,11 @@ INSERT_GENTFUNC_BASIC0( eqm ) \ void PASTEMAC(ch,opname) \ ( \ - char* s1, \ - dim_t n, \ - void* x, inc_t incx, \ - char* format, \ - char* s2 \ + const char* s1, \ + dim_t n, \ + const void* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ) \ { \ 
bli_init_once(); \ @@ -508,12 +508,12 @@ INSERT_GENTFUNC_BASIC_I( printv, fprintv ) \ void PASTEMAC(ch,opname) \ ( \ - char* s1, \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const void* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ) \ { \ bli_init_once(); \ diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index 43fbbdb063..652e3735b0 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -42,9 +42,9 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); @@ -72,9 +72,9 @@ INSERT_GENTPROT_BASIC0( mktrim ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -88,13 +88,13 @@ INSERT_GENTPROTR_BASIC0( normiv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -139,10 +139,10 @@ INSERT_GENTPROT_BASIC0( randnm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); @@ -159,10 +159,10 @@ INSERT_GENTPROTR_BASIC0( sumsqv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ - ctype* chi, \ - ctype* psi, \ - bool* is_eq \ + conj_t conjchi, \ + const ctype* chi, \ + const ctype* psi, \ + bool* 
is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) @@ -173,11 +173,11 @@ INSERT_GENTPROT_BASIC0( eqsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy, \ - bool* is_eq \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy, \ + bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) @@ -188,15 +188,15 @@ INSERT_GENTPROT_BASIC0( eqv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) @@ -207,11 +207,11 @@ INSERT_GENTPROT_BASIC0( eqm ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - char* s1, \ - dim_t n, \ - void* x, inc_t incx, \ - char* format, \ - char* s2 \ + const char* s1, \ + dim_t n, \ + const void* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printv ) @@ -222,12 +222,12 @@ INSERT_GENTPROT_BASIC0_I( printv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - char* s1, \ - dim_t m, \ - dim_t n, \ - void* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const void* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTPROT_BASIC0_I( printm ) diff --git a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index af550681aa..18506d4cc6 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -45,18 +45,18 @@ \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const 
ctype* x, inc_t incx, \ + ctype_r* asum, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ - ctype* chi1; \ - ctype_r chi1_r; \ - ctype_r chi1_i; \ - ctype_r absum; \ - dim_t i; \ + const ctype* chi1; \ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r absum; \ + dim_t i; \ \ /* Initialize the absolute sum accumulator to zero. */ \ PASTEMAC(chr,set0s)( absum ); \ @@ -89,11 +89,11 @@ INSERT_GENTFUNCR_BASIC0( asumv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ ctype_r* zeror = PASTEMAC(chr,0); \ @@ -145,11 +145,11 @@ INSERT_GENTFUNCR_BASIC0( mkherm_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ doff_t diagoffa; \ @@ -187,11 +187,11 @@ INSERT_GENTFUNC_BASIC0( mksymm_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ ctype* zero = PASTEMAC(ch,0); \ @@ -232,17 +232,17 @@ INSERT_GENTFUNC_BASIC0( mktrim_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ - ctype* chi1; \ - ctype_r abs_chi1; \ - ctype_r absum; \ - dim_t i; \ + const ctype* chi1; \ + ctype_r abs_chi1; \ + ctype_r absum; \ + dim_t i; \ \ /* Initialize the absolute sum accumulator to zero. 
*/ \ PASTEMAC(chr,set0s)( absum ); \ @@ -270,11 +270,11 @@ INSERT_GENTFUNCR_BASIC0( norm1v_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ ctype_r* zero = PASTEMAC(chr,0); \ @@ -402,11 +402,11 @@ void PASTEMAC(ch,varname) \ \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ ctype_r* zero = PASTEMAC(chr,0); \ @@ -448,17 +448,17 @@ GENTFUNCR( double, double, d, d, normfv_unb_var1, sumsqv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ - ctype* chi1; \ - ctype_r abs_chi1; \ - ctype_r abs_chi1_max; \ - dim_t i; \ + const ctype* chi1; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ \ /* Initialize the maximum absolute value to zero. 
*/ \ PASTEMAC(chr,set0s)( abs_chi1_max ); \ @@ -494,30 +494,30 @@ INSERT_GENTFUNCR_BASIC0( normiv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ - ctype* one = PASTEMAC(ch,1); \ - ctype* x0; \ - ctype* chi1; \ - ctype* x2; \ - ctype_r absum_max; \ - ctype_r absum_j; \ - ctype_r abval_chi1; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + const ctype* one = PASTEMAC(ch,1); \ + const ctype* x0; \ + const ctype* chi1; \ + const ctype* x2; \ + ctype_r absum_max; \ + ctype_r absum_j; \ + ctype_r abval_chi1; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ \ /* Initialize the maximum absolute column sum to zero. 
*/ \ PASTEMAC(chr,set0s)( absum_max ); \ @@ -658,32 +658,32 @@ INSERT_GENTFUNCR_BASIC( norm1m_unb_var1, norm1v_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ - ctype* one = PASTEMAC(ch,1); \ - ctype_r* one_r = PASTEMAC(chr,1); \ - ctype_r* zero_r = PASTEMAC(chr,0); \ - ctype* x0; \ - ctype* chi1; \ - ctype* x2; \ - ctype_r scale; \ - ctype_r sumsq; \ - ctype_r sqrt_sumsq; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + const ctype* one = PASTEMAC(ch,1); \ + const ctype_r* one_r = PASTEMAC(chr,1); \ + const ctype_r* zero_r = PASTEMAC(chr,0); \ + const ctype* x0; \ + const ctype* chi1; \ + const ctype* x2; \ + ctype_r scale; \ + ctype_r sumsq; \ + ctype_r sqrt_sumsq; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ \ /* Return a norm of zero if either dimension is zero. */ \ if ( bli_zero_dim2( m, n ) ) \ @@ -825,15 +825,15 @@ INSERT_GENTFUNCR_BASIC( normfm_unb_var1, sumsqv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ /* Induce a transposition so that rows become columns. 
*/ \ @@ -867,10 +867,10 @@ INSERT_GENTFUNCR_BASIC( normim_unb_var1, norm1m_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ ctype* chi1; \ @@ -895,29 +895,29 @@ INSERT_GENTFUNC_BASIC( randnv_unb_var1, randnp2s ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - cntx_t* cntx, \ - rntm_t* rntm \ + doff_t diagoffx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ - ctype* one = PASTEMAC(ch,1); \ - ctype* x0; \ - ctype* x1; \ - ctype* x2; \ - ctype* chi1; \ - ctype beta; \ - ctype omega; \ - double max_m_n; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + const ctype* one = PASTEMAC(ch,1); \ + ctype* x0; \ + ctype* x1; \ + ctype* x2; \ + ctype* chi1; \ + ctype beta; \ + ctype omega; \ + double max_m_n; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ \ /* Set various loop parameters. Here, we pretend that diagx is equal to BLIS_NONUNIT_DIAG because we handle the unit diagonal case manually. 
*/ \ @@ -1051,18 +1051,18 @@ INSERT_GENTFUNC_BASIC( randnm_unb_var1, randnv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ) \ { \ const ctype_r zero_r = *PASTEMAC(chr,0); \ const ctype_r one_r = *PASTEMAC(chr,1); \ \ - ctype* chi1; \ + const ctype* chi1; \ ctype_r chi1_r; \ ctype_r chi1_i; \ ctype_r scale_r; \ @@ -1143,16 +1143,16 @@ INSERT_GENTFUNCR_BASIC0( sumsqv_unb_var1 ) \ bool PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy \ ) \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ - ctype* chi1 = x + (i )*incx; \ - ctype* psi1 = y + (i )*incy; \ + const ctype* chi1 = x + (i )*incx; \ + const ctype* psi1 = y + (i )*incy; \ \ ctype chi1c; \ \ @@ -1174,14 +1174,14 @@ INSERT_GENTFUNC_BASIC0( eqv_unb_var1 ) \ bool PASTEMAC(ch,opname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y \ ) \ { \ uplo_t uplox_eff; \ @@ -1219,14 +1219,14 @@ bool PASTEMAC(ch,opname) \ { \ const dim_t n_elem = n_elem_max; \ \ - ctype* x1 = x + (j )*ldx + (0 )*incx; \ - ctype* y1 = y + (j )*ldy + (0 )*incy; \ + const ctype* x1 = x + (j )*ldx + (0 )*incx; \ + const ctype* y1 = y + (j )*ldy + (0 )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ - ctype* x11 = x1 + (i )*incx; \ - ctype* y11 = y1 + (i )*incy; \ - ctype x11c; \ + const ctype* x11 = x1 + (i )*incx; \ + const ctype* y11 = y1 + 
(i )*incy; \ + ctype x11c; \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ @@ -1244,14 +1244,14 @@ bool PASTEMAC(ch,opname) \ { \ const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ \ - ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ - ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ + const ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ + const ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ - ctype* x11 = x1 + (i )*incx; \ - ctype* y11 = y1 + (i )*incy; \ - ctype x11c; \ + const ctype* x11 = x1 + (i )*incx; \ + const ctype* y11 = y1 + (i )*incy; \ + ctype x11c; \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ @@ -1268,14 +1268,14 @@ bool PASTEMAC(ch,opname) \ const dim_t offi = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ const dim_t n_elem = n_elem_max - offi; \ \ - ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ - ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ + const ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ + const ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ - ctype* x11 = x1 + (i )*incx; \ - ctype* y11 = y1 + (i )*incy; \ - ctype x11c; \ + const ctype* x11 = x1 + (i )*incx; \ + const ctype* y11 = y1 + (i )*incy; \ + ctype x11c; \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ @@ -1298,16 +1298,16 @@ INSERT_GENTFUNC_BASIC0( eqm_unb_var1 ) \ void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ) \ { \ dim_t i; \ - ctype* chi1; \ + const ctype* chi1; \ char default_spec[32] = PASTEMAC(ch,formatspec)(); \ \ if ( format == NULL ) format = default_spec; \ @@ -1335,17 
+1335,17 @@ INSERT_GENTFUNC_BASIC0_I( fprintv ) \ void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ) \ { \ dim_t i, j; \ - ctype* chi1; \ + const ctype* chi1; \ char default_spec[32] = PASTEMAC(ch,formatspec)(); \ \ if ( format == NULL ) format = default_spec; \ diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h index f878488568..12a5b7de8a 100644 --- a/frame/util/bli_util_unb_var1.h +++ b/frame/util/bli_util_unb_var1.h @@ -42,11 +42,11 @@ \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* asum, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) @@ -57,11 +57,11 @@ INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( mkherm_unb_var1 ) @@ -74,11 +74,11 @@ INSERT_GENTPROT_BASIC0( mktrim_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 ) @@ -91,15 +91,15 @@ INSERT_GENTPROTR_BASIC0( normiv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - cntx_t* cntx, \ - rntm_t* rntm \ + doff_t 
diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 ) @@ -112,10 +112,10 @@ INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randv_unb_var1 ) @@ -127,13 +127,13 @@ INSERT_GENTPROT_BASIC0( randnv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - cntx_t* cntx, \ - rntm_t* rntm \ + doff_t diagoffx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randm_unb_var1 ) @@ -145,12 +145,12 @@ INSERT_GENTPROT_BASIC0( randnm_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq, \ - cntx_t* cntx, \ - rntm_t* rntm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq, \ + const cntx_t* cntx, \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) @@ -162,10 +162,10 @@ INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) \ bool PASTEMAC(ch,varname) \ ( \ - conj_t conjx, \ - dim_t n, \ - ctype* x, inc_t incx, \ - ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const ctype* y, inc_t incy \ ); INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) @@ -176,14 +176,14 @@ INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) \ bool PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ 
+ trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y \ ); INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) @@ -194,12 +194,12 @@ INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ - char* s1, \ - dim_t n, \ - ctype* x, inc_t incx, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t n, \ + const ctype* x, inc_t incx, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintv ) @@ -210,13 +210,13 @@ INSERT_GENTPROT_BASIC0_I( fprintv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ - char* s1, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - char* format, \ - char* s2 \ + FILE* file, \ + const char* s1, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const char* format, \ + const char* s2 \ ); INSERT_GENTPROT_BASIC0_I( fprintm ) From 80c132ab3cda9cbe9d293ba552da4d0b9c73f8b5 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 15 Mar 2022 09:51:55 -0500 Subject: [PATCH 12/32] Finished adding const, now debugging. 
--- addon/gemmd/bao_gemmd.c | 28 +- addon/gemmd/bao_gemmd.h | 68 +- addon/gemmd/bao_gemmd_bp_var1.c | 172 +-- addon/gemmd/bao_gemmd_check.c | 14 +- addon/gemmd/bao_gemmd_var.h | 70 +- .../kernels/1/bli_axpyv_template_noopt_var1.c | 2 +- .../kernels/1/bli_dotv_template_noopt_var1.c | 4 +- .../1f/bli_axpy2v_template_noopt_var1.c | 2 +- .../1f/bli_axpyf_template_noopt_var1.c | 2 +- .../1f/bli_dotaxpyv_template_noopt_var1.c | 2 +- .../1f/bli_dotxaxpyf_template_noopt_var1.c | 2 +- .../1f/bli_dotxf_template_noopt_var1.c | 4 +- .../kernels/3/bli_gemm_template_noopt_mxn.c | 4 +- .../3/bli_gemmtrsm_l_template_noopt_mxn.c | 4 +- .../3/bli_gemmtrsm_u_template_noopt_mxn.c | 4 +- .../kernels/3/bli_trsm_l_template_noopt_mxn.c | 4 +- .../kernels/3/bli_trsm_u_template_noopt_mxn.c | 4 +- frame/0/bli_l0_ft.h | 26 +- frame/0/bli_l0_oapi.c | 10 +- frame/0/bli_l0_oapi.h | 10 +- frame/0/bli_l0_tapi.c | 32 +- frame/0/bli_l0_tapi.h | 26 +- frame/0/copysc/bli_copysc.c | 12 +- frame/0/copysc/bli_copysc.h | 6 +- frame/1/bli_l1v_ft.h | 36 +- frame/1/bli_l1v_ker_prot.h | 28 +- frame/1/bli_l1v_tapi.c | 104 +- frame/1/bli_l1v_tapi.h | 36 +- frame/1d/bli_l1d_ft.h | 50 +- frame/1d/bli_l1d_tapi.c | 94 +- frame/1d/bli_l1d_tapi.h | 52 +- frame/1f/bli_l1f_ft.h | 42 +- frame/1f/bli_l1f_ker_prot.h | 10 +- frame/1f/bli_l1f_tapi.c | 104 +- frame/1f/bli_l1f_tapi.h | 42 +- frame/1m/bli_l1m_ft.h | 60 +- frame/1m/bli_l1m_ft_ker.h | 6 +- frame/1m/bli_l1m_ker_prot.h | 6 +- frame/1m/bli_l1m_oft_var.h | 6 +- frame/1m/bli_l1m_tapi.c | 140 +- frame/1m/bli_l1m_tapi.h | 60 +- frame/1m/packm/bli_packm_alloc.c | 14 +- frame/1m/packm/bli_packm_alloc.h | 14 +- frame/1m/packm/bli_packm_blk_var1.c | 8 +- frame/1m/packm/bli_packm_blk_var1.h | 4 +- frame/1m/packm/bli_packm_init.c | 4 +- frame/1m/packm/bli_packm_init.h | 4 +- frame/1m/packm/bli_packm_int.c | 4 +- frame/1m/packm/bli_packm_int.h | 4 +- frame/1m/packm/bli_packm_struc_cxk.c | 42 +- frame/1m/packm/bli_packm_struc_cxk.h | 36 +- 
frame/1m/unpackm/bli_unpackm_blk_var1.c | 44 +- frame/1m/unpackm/bli_unpackm_check.c | 2 +- frame/2/bli_l2_ft.h | 52 +- frame/2/bli_l2_tapi.c | 120 +- frame/2/bli_l2_tapi.h | 52 +- .../gemv/{ => other}/bli_gemv_var_oapi.c.prev | 0 frame/3/bli_l3_blocksize.c | 119 +- frame/3/bli_l3_blocksize.h | 22 +- frame/3/bli_l3_check.c | 12 +- frame/3/bli_l3_check.h | 6 +- frame/3/bli_l3_cntl.c | 12 +- frame/3/bli_l3_cntl.h | 12 +- frame/3/bli_l3_ft_ukr.h | 12 +- frame/3/bli_l3_ind_ukr.h | 12 +- frame/3/bli_l3_int.c | 6 +- frame/3/bli_l3_int.h | 6 +- frame/3/bli_l3_oapi.c | 2 +- frame/3/bli_l3_oapi.h | 4 +- frame/3/bli_l3_oapi_ex.c | 54 +- frame/3/bli_l3_oapi_ex.h | 20 +- frame/3/bli_l3_oft.h | 12 +- frame/3/bli_l3_oft_var.h | 6 +- frame/3/bli_l3_packab.c | 12 +- frame/3/bli_l3_packab.h | 12 +- frame/3/bli_l3_prune.c | 244 ++-- frame/3/bli_l3_prune.h | 57 +- frame/3/bli_l3_schema.c | 6 +- frame/3/bli_l3_schema.h | 6 +- frame/3/bli_l3_sup.c | 28 +- frame/3/bli_l3_sup.h | 28 +- frame/3/bli_l3_sup_ft_ker.h | 4 +- frame/3/bli_l3_sup_int.c | 8 +- frame/3/bli_l3_sup_int.h | 8 +- frame/3/bli_l3_sup_ker_prot.h | 4 +- frame/3/bli_l3_sup_oft.h | 2 +- frame/3/bli_l3_sup_packm_a.c | 32 +- frame/3/bli_l3_sup_packm_a.h | 32 +- frame/3/bli_l3_sup_packm_b.c | 46 +- frame/3/bli_l3_sup_packm_b.h | 46 +- frame/3/bli_l3_sup_packm_var.c | 48 +- frame/3/bli_l3_sup_packm_var.h | 20 +- frame/3/bli_l3_sup_ref.c | 28 +- frame/3/bli_l3_sup_ref.h | 28 +- frame/3/bli_l3_sup_var12.c | 166 +-- frame/3/bli_l3_sup_var1n2m.c | 432 +++--- frame/3/bli_l3_sup_vars.h | 92 +- frame/3/bli_l3_tapi.c | 176 +-- frame/3/bli_l3_tapi.h | 156 +- frame/3/bli_l3_tapi_ex.c | 294 ++-- frame/3/bli_l3_tapi_ex.h | 188 +-- frame/3/bli_l3_ukr_prot.h | 12 +- frame/3/bli_l3_ukr_tapi.c | 12 +- frame/3/gemm/bli_gemm_blk_var1.c | 35 +- frame/3/gemm/bli_gemm_blk_var2.c | 35 +- frame/3/gemm/bli_gemm_blk_var3.c | 38 +- frame/3/gemm/bli_gemm_front.c | 20 +- frame/3/gemm/bli_gemm_front.h | 30 +- frame/3/gemm/bli_gemm_ker_var2.c | 110 +- 
frame/3/gemm/bli_gemm_md.c | 282 ++-- frame/3/gemm/bli_gemm_md.h | 88 +- frame/3/gemm/bli_gemm_md_c2r_ref.c | 4 +- frame/3/gemm/bli_gemm_var.h | 14 +- frame/3/gemm/ind/bli_gemm_ind_opt.h | 20 +- frame/3/gemmt/bli_gemmt_front.c | 16 +- frame/3/gemmt/bli_gemmt_front.h | 16 +- frame/3/gemmt/bli_gemmt_l_ker_var2.c | 105 +- frame/3/gemmt/bli_gemmt_u_ker_var2.c | 105 +- frame/3/gemmt/bli_gemmt_var.h | 14 +- frame/3/gemmt/bli_gemmt_x_ker_var2.c | 14 +- frame/3/hemm/bli_hemm_front.c | 18 +- frame/3/hemm/bli_hemm_front.h | 18 +- frame/3/symm/bli_symm_front.c | 18 +- frame/3/symm/bli_symm_front.h | 18 +- frame/3/trmm/bli_trmm_front.c | 14 +- frame/3/trmm/bli_trmm_front.h | 14 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 97 +- frame/3/trmm/bli_trmm_lu_ker_var2.c | 97 +- frame/3/trmm/bli_trmm_rl_ker_var2.c | 97 +- frame/3/trmm/bli_trmm_ru_ker_var2.c | 97 +- frame/3/trmm/bli_trmm_var.h | 14 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 14 +- frame/3/trmm3/bli_trmm3_front.c | 18 +- frame/3/trmm3/bli_trmm3_front.h | 18 +- frame/3/trsm/bli_trsm_blk_var1.c | 42 +- frame/3/trsm/bli_trsm_blk_var2.c | 35 +- frame/3/trsm/bli_trsm_blk_var3.c | 38 +- frame/3/trsm/bli_trsm_front.c | 14 +- frame/3/trsm/bli_trsm_front.h | 14 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 97 +- frame/3/trsm/bli_trsm_lu_ker_var2.c | 97 +- frame/3/trsm/bli_trsm_rl_ker_var2.c | 93 +- frame/3/trsm/bli_trsm_ru_ker_var2.c | 93 +- frame/3/trsm/bli_trsm_var.h | 14 +- frame/3/trsm/bli_trsm_xx_ker_var2.c | 14 +- frame/base/bli_apool.c | 70 +- frame/base/bli_apool.h | 26 +- frame/base/bli_array.c | 2 +- frame/base/bli_array.h | 2 +- frame/base/bli_auxinfo.h | 14 +- frame/base/bli_blksz.c | 20 +- frame/base/bli_blksz.h | 24 +- frame/base/bli_cntx.h | 6 +- frame/base/bli_func.c | 2 +- frame/base/bli_func.h | 4 +- frame/base/bli_getopt.c | 8 +- frame/base/bli_getopt.h | 10 +- frame/base/bli_gks.c | 8 +- frame/base/bli_mem.h | 16 +- frame/base/bli_obj.c | 4 +- frame/base/bli_obj.h | 4 +- frame/base/bli_obj_scalar.c | 10 +- 
frame/base/bli_obj_scalar.h | 10 +- frame/base/bli_part.c | 130 +- frame/base/bli_part.h | 46 +- frame/base/bli_pba.c | 22 +- frame/base/bli_pba.h | 20 +- frame/base/bli_rntm.c | 4 +- frame/base/bli_rntm.h | 6 +- frame/base/bli_sba.c | 22 +- frame/base/bli_sba.h | 18 +- frame/base/bli_setgetijm.c | 40 +- frame/base/bli_setgetijm.h | 30 +- frame/base/bli_setgetijv.c | 32 +- frame/base/bli_setgetijv.h | 24 +- frame/base/bli_setri.c | 8 +- frame/base/bli_setri.h | 8 +- frame/base/cast/bli_castm.c | 71 +- frame/base/cast/bli_castm.h | 10 +- frame/base/cast/bli_castnzm.c | 71 +- frame/base/cast/bli_castnzm.h | 10 +- frame/base/cast/bli_castv.c | 43 +- frame/base/cast/bli_castv.h | 8 +- frame/base/check/bli_obj_check.c | 30 +- frame/base/check/bli_obj_check.h | 30 +- frame/base/check/bli_part_check.c | 30 +- frame/base/check/bli_part_check.h | 34 +- frame/include/bli_extern_defs.h | 6 +- frame/include/bli_oapi_ba.h | 4 +- frame/include/bli_oapi_ex.h | 2 +- frame/include/bli_obj_macro_defs.h | 2 +- frame/include/bli_tapi_ba.h | 4 +- frame/include/bli_tapi_ex.h | 2 +- frame/include/bli_type_defs.h | 24 +- frame/thread/bli_l3_decor.h | 38 +- frame/thread/bli_l3_decor_openmp.c | 51 +- frame/thread/bli_l3_decor_pthreads.c | 118 +- frame/thread/bli_l3_decor_single.c | 44 +- frame/thread/bli_l3_sup_decor.h | 34 +- frame/thread/bli_l3_sup_decor_openmp.c | 26 +- frame/thread/bli_l3_sup_decor_pthreads.c | 74 +- frame/thread/bli_l3_sup_decor_single.c | 26 +- frame/thread/bli_thread.c | 82 +- frame/thread/bli_thread.h | 74 +- frame/thread/bli_thrinfo.c | 2 +- frame/thread/bli_thrinfo_sup.c | 39 +- frame/thread/bli_thrinfo_sup.h | 22 +- frame/util/bli_util_ft.h | 72 +- frame/util/bli_util_oapi.c | 20 +- frame/util/bli_util_oapi.h | 31 +- frame/util/bli_util_tapi.c | 108 +- frame/util/bli_util_tapi.h | 68 +- frame/util/bli_util_unb_var1.c | 382 +++-- frame/util/bli_util_unb_var1.h | 116 +- .../armsve/1m/bli_dpackm_armsve256_int_8xk.c | 2 +- 
.../armsve/1m/bli_dpackm_armsve512_asm_10xk.c | 2 +- .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c | 2 +- .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 4 +- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 4 +- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 4 +- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 4 +- kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c | 24 +- kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c | 8 +- kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c | 6 +- kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c | 6 +- .../armv8a/1m/bli_packm_armv8a_int_s12xk.c | 6 +- kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c | 6 +- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 16 +- kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c | 8 +- .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c | 4 +- .../3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c | 4 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c | 6 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c | 6 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c | 4 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c | 4 +- .../3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c | 4 +- .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c | 4 +- .../sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c | 4 +- .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c | 12 +- .../sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c | 4 +- .../d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c | 4 +- .../d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c | 6 +- kernels/bgq/1/bli_axpyv_bgq_int.c | 8 +- kernels/bgq/1/bli_dotv_bgq_int.c | 4 +- kernels/bgq/1f/bli_axpyf_bgq_int.c | 6 +- kernels/bgq/3/bli_gemm_bgq_int_8x8.c | 8 +- .../3/bli_gemm_bulldozer_asm_d4x6_fma4.c | 16 +- .../haswell/1m/bli_packm_haswell_asm_c3xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_c8xk.c | 2 +- .../haswell/1m/bli_packm_haswell_asm_d6xk.c | 26 +- .../haswell/1m/bli_packm_haswell_asm_d8xk.c | 26 +- .../haswell/1m/bli_packm_haswell_asm_s16xk.c | 28 +- .../haswell/1m/bli_packm_haswell_asm_s6xk.c | 28 +- .../haswell/1m/bli_packm_haswell_asm_z3xk.c | 2 +- 
.../haswell/1m/bli_packm_haswell_asm_z4xk.c | 2 +- kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c | 16 +- kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c | 16 +- .../3/bli_gemmtrsm_l_haswell_asm_d6x8.c | 8 +- .../3/bli_gemmtrsm_u_haswell_asm_d6x8.c | 8 +- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c | 476 +++---- .../3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c | 612 ++++---- .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c | 790 +++++------ .../3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c | 614 ++++---- .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c | 828 +++++------ .../3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c | 1050 +++++++------- .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c | 1252 ++++++++--------- .../3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c | 1048 +++++++------- .../sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c | 40 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c | 16 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c | 600 ++++---- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c | 12 +- .../d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c | 460 +++--- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c | 1044 +++++++------- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c | 1036 +++++++------- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c | 1018 +++++++------- .../d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c | 1018 +++++++------- .../s6x16/bli_gemmsup_r_haswell_ref_sMx1.c | 44 +- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c | 608 ++++---- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c | 462 +++--- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c | 462 +++--- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c | 608 ++++---- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c | 462 +++--- .../s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c | 462 +++--- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c | 1008 ++++++------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c | 1010 ++++++------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c | 1082 +++++++------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c | 1082 +++++++------- 
.../s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c | 1202 ++++++++-------- .../s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c | 1008 ++++++------- kernels/knc/3/bli_dgemm_knc_asm_30x8.c | 4 +- kernels/knc/3/bli_sgemm_knc_asm_30x16.c | 4 +- kernels/knl/1m/bli_dpackm_knl_asm_24x8.c | 4 +- kernels/knl/1m/bli_spackm_knl_asm_24x16.c | 4 +- kernels/knl/3/bli_dgemm_knl_asm_24x8.c | 4 +- kernels/knl/3/bli_sgemm_knl_asm_24x16.c | 4 +- kernels/penryn/1/bli_axpyv_penryn_int.c | 2 +- kernels/penryn/1/bli_dotv_penryn_int.c | 2 +- kernels/penryn/1f/bli_axpy2v_penryn_int.c | 2 +- kernels/penryn/1f/bli_axpyf_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotaxpyv_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c | 2 +- kernels/penryn/1f/bli_dotxf_penryn_int.c | 2 +- kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c | 8 +- .../penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c | 214 +-- .../penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c | 206 +-- kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c | 76 +- kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c | 76 +- .../3/bli_gemm_piledriver_asm_d8x3.c | 16 +- kernels/power10/3/bli_dgemm_power10_mma.c | 4 +- kernels/power10/3/bli_i16gemm_power10_mma.c | 4 +- kernels/power10/3/bli_i16sgemm_power10_mma.c | 4 +- kernels/power10/3/bli_i4gemm_power10_mma.c | 4 +- kernels/power10/3/bli_i8gemm_power10_mma.c | 4 +- kernels/power10/3/bli_sbgemm_power10_mma.c | 4 +- kernels/power10/3/bli_sgemm_power10_mma.c | 4 +- kernels/power10/3/bli_shgemm_power10_mma.c | 4 +- kernels/power7/3/bli_gemm_power7_int_8x4.c | 16 +- .../power7/3/test/bli_gemm_power7_int_8x4.h | 16 +- kernels/power9/3/bli_gemm_power9_asm_d12x6.c | 4 +- .../3/bli_gemm_sandybridge_asm_d8x4.c | 16 +- .../3/bli_gemm_sandybridge_int_d8x4.c | 16 +- kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c | 2 +- kernels/skx/3/bli_dgemm_skx_asm_16x14.c | 2 +- kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c | 2 +- kernels/zen/1/bli_amaxv_zen_int.c | 16 +- kernels/zen/1/bli_axpyv_zen_int.c | 4 +- kernels/zen/1/bli_axpyv_zen_int10.c | 4 +- 
kernels/zen/1/bli_copyv_zen_int.c | 4 +- kernels/zen/1/bli_dotv_zen_int.c | 4 +- kernels/zen/1/bli_dotv_zen_int10.c | 4 +- kernels/zen/1/bli_dotxv_zen_int.c | 6 +- kernels/zen/1/bli_scalv_zen_int.c | 4 +- kernels/zen/1/bli_scalv_zen_int10.c | 4 +- kernels/zen/1/bli_setv_zen_int.c | 4 +- kernels/zen/1/bli_swapv_zen_int8.c | 4 +- kernels/zen/1f/bli_axpyf_zen_int_8.c | 4 +- kernels/zen/1f/bli_dotxf_zen_int_8.c | 4 +- .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c | 45 +- .../sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c | 34 +- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c | 56 +- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c | 36 +- .../sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c | 30 +- kernels/zen2/1f/bli_axpyf_zen_int_5.c | 4 +- ref_kernels/1/bli_addv_ref.c | 2 +- ref_kernels/1/bli_amaxv_ref.c | 2 +- ref_kernels/1/bli_axpbyv_ref.c | 2 +- ref_kernels/1/bli_axpyv_ref.c | 4 +- ref_kernels/1/bli_copyv_ref.c | 2 +- ref_kernels/1/bli_dotv_ref.c | 2 +- ref_kernels/1/bli_dotxv_ref.c | 2 +- ref_kernels/1/bli_invertv_ref.c | 2 +- ref_kernels/1/bli_scal2v_ref.c | 2 +- ref_kernels/1/bli_scalv_ref.c | 2 +- ref_kernels/1/bli_setv_ref.c | 2 +- ref_kernels/1/bli_subv_ref.c | 2 +- ref_kernels/1/bli_swapv_ref.c | 2 +- ref_kernels/1/bli_xpbyv_ref.c | 2 +- ref_kernels/1f/bli_axpy2v_ref.c | 2 +- ref_kernels/1f/bli_axpyf_ref.c | 2 +- ref_kernels/1f/bli_dotaxpyv_ref.c | 2 +- ref_kernels/1f/bli_dotxaxpyf_ref.c | 2 +- ref_kernels/1f/bli_dotxf_ref.c | 2 +- ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c | 2 +- ref_kernels/1m/bli_packm_cxc_diag_ref.c | 2 +- ref_kernels/1m/bli_packm_cxk_1er_ref.c | 2 +- ref_kernels/1m/bli_packm_cxk_ref.c | 2 +- ref_kernels/1m/bli_unpackm_cxk_ref.c | 2 +- ref_kernels/3/bli_gemm_ref.c | 8 +- ref_kernels/3/bli_gemmsup_ref.c | 20 +- ref_kernels/3/bli_gemmtrsm_ref.c | 4 +- ref_kernels/3/bli_trsm_ref.c | 8 +- ref_kernels/bli_cntx_ref.c | 12 +- ref_kernels/ind/bli_gemm1m_ref.c | 4 +- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 4 +- ref_kernels/ind/bli_trsm1m_ref.c | 8 +- 
testsuite/src/test_axpy2v.c | 4 +- testsuite/src/test_axpyf.c | 4 +- testsuite/src/test_dotaxpyv.c | 6 +- testsuite/src/test_dotxaxpyf.c | 4 +- testsuite/src/test_dotxf.c | 4 +- testsuite/src/test_gemm_ukr.c | 2 +- testsuite/src/test_gemmtrsm_ukr.c | 2 +- testsuite/src/test_libblis.c | 10 +- testsuite/src/test_trsm_ukr.c | 2 +- 384 files changed, 15796 insertions(+), 16002 deletions(-) rename frame/2/gemv/{ => other}/bli_gemv_var_oapi.c.prev (100%) diff --git a/addon/gemmd/bao_gemmd.c b/addon/gemmd/bao_gemmd.c index fe38505a79..01185a9d75 100644 --- a/addon/gemmd/bao_gemmd.c +++ b/addon/gemmd/bao_gemmd.c @@ -40,12 +40,12 @@ void bao_gemmd ( - const obj_t* alpha, - const obj_t* a, - const obj_t* d, - const obj_t* b, - const obj_t* beta, - const obj_t* c + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c ) { bao_gemmd_ex @@ -63,14 +63,14 @@ void bao_gemmd void bao_gemmd_ex ( - const obj_t* alpha, - const obj_t* a, - const obj_t* d, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); diff --git a/addon/gemmd/bao_gemmd.h b/addon/gemmd/bao_gemmd.h index e3ea11e4e1..7c7466494d 100644 --- a/addon/gemmd/bao_gemmd.h +++ b/addon/gemmd/bao_gemmd.h @@ -38,24 +38,24 @@ BLIS_EXPORT_ADDON void bao_gemmd ( - const obj_t* alpha, - const obj_t* a, - const obj_t* d, - const obj_t* b, - const obj_t* beta, - const obj_t* c + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c ); BLIS_EXPORT_ADDON void bao_gemmd_ex ( - const obj_t* alpha, - const obj_t* a, - const obj_t* d, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm ); // @@ -64,15 +64,15 @@ BLIS_EXPORT_ADDON void bao_gemmd_ex void bao_gemmd_int ( - const 
obj_t* alpha, - const obj_t* a, - const obj_t* d, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - const rntm_t* rntm, - const thrinfo_t* thread + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ); // @@ -84,17 +84,17 @@ void bao_gemmd_int \ BLIS_EXPORT_ADDON void PASTECH2(bao_,ch,opname) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - const ctype* alpha, \ - const ctype* a, inc_t rs_a, inc_t cs_a, \ - const ctype* d, inc_t incd, \ - const ctype* b, inc_t rs_b, inc_t cs_b, \ - const ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + ctype* alpha, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* d, inc_t incd, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); //INSERT_GENTPROT_BASIC0( gemmd ) diff --git a/addon/gemmd/bao_gemmd_bp_var1.c b/addon/gemmd/bao_gemmd_bp_var1.c index e4393dcb23..689471367f 100644 --- a/addon/gemmd/bao_gemmd_bp_var1.c +++ b/addon/gemmd/bao_gemmd_bp_var1.c @@ -43,15 +43,15 @@ typedef void (*FUNCPTR_T) dim_t m, dim_t n, dim_t k, - const void* alpha, - const void* a, inc_t rs_a, inc_t cs_a, - const void* d, inc_t incd, - const void* b, inc_t rs_b, inc_t cs_b, - const void* beta, - void* c, inc_t rs_c, inc_t cs_c, - const cntx_t* cntx, - const rntm_t* rntm, - const thrinfo_t* thread + void* restrict alpha, + void* restrict a, inc_t rs_a, inc_t cs_a, + void* restrict d, inc_t incd, + void* restrict b, inc_t rs_b, inc_t cs_b, + void* restrict beta, + void* restrict c, inc_t rs_c, inc_t cs_c, + cntx_t* restrict cntx, + rntm_t* restrict rntm, + thrinfo_t* restrict thread ); // @@ -64,43 +64,43 @@ static FUNCPTR_T GENARRAY_PREF(ftypes,bao_,gemmd_bp_var1); void bao_gemmd_bp_var1 ( - const obj_t* alpha, - const obj_t* a, - const obj_t* d, - const obj_t* b, - const obj_t* beta, 
- const obj_t* c, - const cntx_t* cntx, - const rntm_t* rntm, - const thrinfo_t* thread + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ) { - const num_t dt = bli_obj_dt( c ); + const num_t dt = bli_obj_dt( c ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - const void* buf_a = bli_obj_buffer_at_off( a ); - const inc_t rs_a = bli_obj_row_stride( a ); - const inc_t cs_a = bli_obj_col_stride( a ); + void* restrict buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); - const void* buf_d = bli_obj_buffer_at_off( d ); - const inc_t incd = bli_obj_vector_inc( d ); + void* restrict buf_d = bli_obj_buffer_at_off( d ); + const inc_t incd = bli_obj_vector_inc( d ); - const void* buf_b = bli_obj_buffer_at_off( b ); - const inc_t rs_b = bli_obj_row_stride( b ); - const inc_t cs_b = bli_obj_col_stride( b ); + void* restrict buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + void* restrict buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); - const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* restrict buf_beta = 
bli_obj_buffer_for_1x1( dt, beta ); // Index into the function pointer array to extract the correct // typed function pointer based on the chosen datatype. @@ -140,15 +140,15 @@ void PASTECH2(bao_,ch,varname) \ dim_t m, \ dim_t n, \ dim_t k, \ - const void* alpha, \ - const void* a, inc_t rs_a, inc_t cs_a, \ - const void* d, inc_t incd, \ - const void* b, inc_t rs_b, inc_t cs_b, \ - const void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - const cntx_t* cntx, \ - const rntm_t* rntm, \ - const thrinfo_t* thread \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -180,12 +180,12 @@ void PASTECH2(bao_,ch,varname) \ \ const inc_t irstep_c = rs_c * MR; \ \ - const ctype* a_00 = a; \ - const ctype* d_00 = d; \ - const ctype* b_00 = b; \ - ctype* c_00 = c; \ - const ctype* alpha_cast = alpha; \ - const ctype* beta_cast = beta; \ + ctype* restrict a_00 = a; \ + ctype* restrict d_00 = d; \ + ctype* restrict b_00 = b; \ + ctype* restrict c_00 = c; \ + ctype* restrict alpha_cast = alpha; \ + ctype* restrict beta_cast = beta; \ \ /* Make local copies of the scalars to prevent any unnecessary sharing of cache lines between the cores' caches. 
*/ \ @@ -212,21 +212,21 @@ void PASTECH2(bao_,ch,varname) \ BLIS_MR, /* 1st loop */ \ BLIS_KR }; /* microkernel loop */ \ \ - const bszid_t* bszids_jc = &bszids[0]; \ - const bszid_t* bszids_pc = &bszids[1]; \ - /*const bszid_t* bszids_pb = &bszids[2];*/ \ - const bszid_t* bszids_ic = &bszids[3]; \ - /*const bszid_t* bszids_pa = &bszids[4];*/ \ - const bszid_t* bszids_jr = &bszids[5]; \ - /*const bszid_t* bszids_ir = &bszids[6];*/ \ -\ - thrinfo_t* thread_jc = NULL; \ - thrinfo_t* thread_pc = NULL; \ - thrinfo_t* thread_pb = NULL; \ - thrinfo_t* thread_ic = NULL; \ - thrinfo_t* thread_pa = NULL; \ - thrinfo_t* thread_jr = NULL; \ - thrinfo_t* thread_ir = NULL; \ + bszid_t* restrict bszids_jc = &bszids[0]; \ + bszid_t* restrict bszids_pc = &bszids[1]; \ + /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ + bszid_t* restrict bszids_ic = &bszids[3]; \ + /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ + bszid_t* restrict bszids_jr = &bszids[5]; \ + /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ +\ + thrinfo_t* restrict thread_jc = NULL; \ + thrinfo_t* restrict thread_pc = NULL; \ + thrinfo_t* restrict thread_pb = NULL; \ + thrinfo_t* restrict thread_ic = NULL; \ + thrinfo_t* restrict thread_pa = NULL; \ + thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* restrict thread_ir = NULL; \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_jc = thread; \ @@ -239,7 +239,7 @@ void PASTECH2(bao_,ch,varname) \ \ /* Compute number of primary and leftover components of the JC loop. */ \ /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = n_local % NC; \ + const dim_t jc_left = n_local % NC; \ \ /* Loop over the n dimension (NC rows/columns at a time). */ \ for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ @@ -247,8 +247,8 @@ void PASTECH2(bao_,ch,varname) \ /* Calculate the thread's current JC block dimension. */ \ const dim_t nc_cur = ( NC <= jc_end - jj ? 
NC : jc_left ); \ \ - const ctype* b_jc = b_00 + jj * jcstep_b; \ - ctype* c_jc = c_00 + jj * jcstep_c; \ + ctype* restrict b_jc = b_00 + jj * jcstep_b; \ + ctype* restrict c_jc = c_00 + jj * jcstep_c; \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_pc = bli_thrinfo_sub_node( thread_jc ); \ @@ -268,14 +268,14 @@ void PASTECH2(bao_,ch,varname) \ /* Calculate the thread's current PC block dimension. */ \ const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ \ - const ctype* a_pc = a_00 + pp * pcstep_a; \ - const ctype* d_pc = d_00 + pp * pcstep_d; \ - const ctype* b_pc = b_jc + pp * pcstep_b; \ + ctype* restrict a_pc = a_00 + pp * pcstep_a; \ + ctype* restrict d_pc = d_00 + pp * pcstep_d; \ + ctype* restrict b_pc = b_jc + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - const ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ + ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ \ - const ctype* b_use; \ + ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ \ /* Identify the current thrinfo_t node. Note that the thrinfo_t @@ -306,7 +306,7 @@ void PASTECH2(bao_,ch,varname) \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ - const ctype* b_pc_use = b_use; \ + ctype* restrict b_pc_use = b_use; \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_ic = bli_thrinfo_sub_node( thread_pb ); \ @@ -327,10 +327,10 @@ void PASTECH2(bao_,ch,varname) \ /* Calculate the thread's current IC block dimension. */ \ const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ \ - const ctype* a_ic = a_pc + ii * icstep_a; \ - ctype* c_ic = c_jc + ii * icstep_c; \ + ctype* restrict a_ic = a_pc + ii * icstep_a; \ + ctype* restrict c_ic = c_jc + ii * icstep_c; \ \ - const ctype* a_use; \ + ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ \ /* Identify the current thrinfo_t node. 
Note that the thrinfo_t @@ -361,7 +361,7 @@ void PASTECH2(bao_,ch,varname) \ \ /* Alias a_use so that it's clear this is our current block of matrix A. */ \ - const ctype* a_ic_use = a_use; \ + ctype* restrict a_ic_use = a_use; \ \ /* Identify the current thrinfo_t node and then grow the tree. */ \ thread_jr = bli_thrinfo_sub_node( thread_pa ); \ @@ -387,12 +387,12 @@ void PASTECH2(bao_,ch,varname) \ const dim_t nr_cur \ = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ - const ctype* b_jr = b_pc_use + j * ps_b_use; \ - ctype* c_jr = c_ic + j * jrstep_c; \ + ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ + ctype* restrict c_jr = c_ic + j * jrstep_c; \ \ /* Assume for now that our next panel of B to be the current panel of B. */ \ - const ctype* b2 = b_jr; \ + ctype* restrict b2 = b_jr; \ \ /* Identify the current thrinfo_t node. */ \ thread_ir = bli_thrinfo_sub_node( thread_jr ); \ @@ -417,10 +417,10 @@ void PASTECH2(bao_,ch,varname) \ const dim_t mr_cur \ = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ \ - const ctype* a_ir = a_ic_use + i * ps_a_use; \ - ctype* c_ir = c_jr + i * irstep_c; \ + ctype* restrict a_ir = a_ic_use + i * ps_a_use; \ + ctype* restrict c_ir = c_jr + i * irstep_c; \ \ - const ctype* a2; \ + ctype* restrict a2; \ \ /* Compute the addresses of the next micropanels of A and B. 
*/ \ a2 = bli_gemm_get_next_a_upanel( a_ir, ps_a_use, 1 ); \ diff --git a/addon/gemmd/bao_gemmd_check.c b/addon/gemmd/bao_gemmd_check.c index c900ac188d..864e9a1acb 100644 --- a/addon/gemmd/bao_gemmd_check.c +++ b/addon/gemmd/bao_gemmd_check.c @@ -36,13 +36,13 @@ void bao_gemmd_check ( - const obj_t* alpha, - const obj_t* a, - const obj_t* d, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx + obj_t* alpha, + obj_t* a, + obj_t* d, + obj_t* b, + obj_t* beta, + obj_t* c, + cntx_t* cntx ) { err_t e_val; diff --git a/addon/gemmd/bao_gemmd_var.h b/addon/gemmd/bao_gemmd_var.h index 98d6c7d479..05ec45e07e 100644 --- a/addon/gemmd/bao_gemmd_var.h +++ b/addon/gemmd/bao_gemmd_var.h @@ -42,15 +42,15 @@ \ void PASTECH(bao_,opname) \ ( \ - const obj_t* alpha, \ - const obj_t* a, \ - const obj_t* d, \ - const obj_t* b, \ - const obj_t* beta, \ - const obj_t* c, \ - const cntx_t* cntx, \ - const rntm_t* rntm, \ - const thrinfo_t* thread \ + obj_t* alpha, \ + obj_t* a, \ + obj_t* d, \ + obj_t* b, \ + obj_t* beta, \ + obj_t* c, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ); GENPROT( gemmd_bp_var1 ) @@ -65,20 +65,20 @@ GENPROT( gemmd_bp_var1 ) \ void PASTECH2(bao_,ch,varname) \ ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - const void* alpha, \ - const void* a, inc_t rs_a, inc_t cs_a, \ - const void* d, inc_t incd, \ - const void* b, inc_t rs_b, inc_t cs_b, \ - const void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - const cntx_t* cntx, \ - const rntm_t* rntm, \ - const thrinfo_t* thread \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* restrict alpha, \ + void* restrict a, inc_t rs_a, inc_t cs_a, \ + void* restrict d, inc_t incd, \ + void* restrict b, inc_t rs_b, inc_t cs_b, \ + void* restrict beta, \ + void* restrict c, inc_t rs_c, inc_t cs_c, \ + cntx_t* restrict cntx, \ + rntm_t* restrict rntm, \ + thrinfo_t* restrict thread \ ); //INSERT_GENTPROT_BASIC0( 
gemmd_bp_var1 ) @@ -97,18 +97,18 @@ GENTPROT( dcomplex, z, gemmd_bp_var1 ) \ void PASTECH2(bao_,ch,varname) \ ( \ - const dim_t MR, \ - const dim_t NR, \ - dim_t mr_cur, \ - dim_t nr_cur, \ - dim_t k, \ - const ctype* alpha, \ - const ctype* a, inc_t rs_a, inc_t cs_a, \ - const ctype* b, inc_t rs_b, inc_t cs_b, \ - const ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - const auxinfo_t* aux, \ - const cntx_t* cntx \ + const dim_t MR, \ + const dim_t NR, \ + dim_t mr_cur, \ + dim_t nr_cur, \ + dim_t k, \ + ctype* restrict alpha, \ + ctype* restrict a, inc_t rs_a, inc_t cs_a, \ + ctype* restrict b, inc_t rs_b, inc_t cs_b, \ + ctype* restrict beta, \ + ctype* restrict c, inc_t rs_c, inc_t cs_c, \ + auxinfo_t* restrict aux, \ + cntx_t* restrict cntx \ ); //INSERT_GENTPROT_BASIC0( gemm_kernel ) diff --git a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c index d1918466f7..8796bab267 100644 --- a/config/template/kernels/1/bli_axpyv_template_noopt_var1.c +++ b/config/template/kernels/1/bli_axpyv_template_noopt_var1.c @@ -42,7 +42,7 @@ void bli_zaxpyv_template_noopt dcomplex* restrict alpha, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* diff --git a/config/template/kernels/1/bli_dotv_template_noopt_var1.c b/config/template/kernels/1/bli_dotv_template_noopt_var1.c index 3761d2e764..90f93b8177 100644 --- a/config/template/kernels/1/bli_dotv_template_noopt_var1.c +++ b/config/template/kernels/1/bli_dotv_template_noopt_var1.c @@ -43,7 +43,7 @@ void bli_zdotv_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -187,7 +187,7 @@ void bli_zdotv_template_noopt // Initialize accumulator to zero. 
bli_zset0s( dotxy ); - + conjx_use = conjx; // If y must be conjugated, we compute the result indirectly by first diff --git a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c index 7080abce06..5a12bf761f 100644 --- a/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_axpy2v_template_noopt_var1.c @@ -45,7 +45,7 @@ void bli_zaxpy2v_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { /* diff --git a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c index a0afedfcaf..f7b4922864 100644 --- a/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_axpyf_template_noopt_var1.c @@ -45,7 +45,7 @@ void bli_zaxpyf_template_noopt dcomplex* restrict a, inc_t inca, inc_t lda, dcomplex* restrict x, inc_t incx, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* diff --git a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c index 275c399982..31a3097c0e 100644 --- a/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotaxpyv_template_noopt_var1.c @@ -46,7 +46,7 @@ void bli_zdotaxpyv_template_noopt dcomplex* restrict y, inc_t incy, dcomplex* restrict rho, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { /* diff --git a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c index 6754d86ce8..aeb502f354 100644 --- a/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotxaxpyf_template_noopt_var1.c @@ -50,7 +50,7 @@ void bli_zdotxaxpyf_template_noopt 
dcomplex* restrict beta, dcomplex* restrict y, inc_t incy, dcomplex* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { diff --git a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c index 430fb277db..650303afe1 100644 --- a/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c +++ b/config/template/kernels/1f/bli_dotxf_template_noopt_var1.c @@ -46,7 +46,7 @@ void bli_zdotxf_template_noopt dcomplex* restrict x, inc_t incx, dcomplex* restrict beta, dcomplex* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { /* @@ -239,7 +239,7 @@ void bli_zdotxf_template_noopt if ( bli_is_conj( conjx ) ) bli_toggle_conj( &conjat_use ); - + // Iterate over columns of A and rows of x to compute: // Atx = conjat_use( A^T ) * x; if ( bli_is_noconj( conjat_use ) ) diff --git a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c index 06f25a0e9e..190519fa0a 100644 --- a/config/template/kernels/3/bli_gemm_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemm_template_noopt_mxn.c @@ -45,8 +45,8 @@ void bli_zgemm_template_noopt dcomplex* restrict b1, dcomplex* restrict beta, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c index 87c21f7edf..d44fa4c1ef 100644 --- a/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_l_template_noopt_mxn.c @@ -44,8 +44,8 @@ void bli_zgemmtrsm_l_template_noopt dcomplex* restrict b01, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git 
a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c index 0b4544ae1d..0a3d596227 100644 --- a/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_gemmtrsm_u_template_noopt_mxn.c @@ -44,8 +44,8 @@ void bli_zgemmtrsm_u_template_noopt dcomplex* restrict b01, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c index ce15798b0e..4e6634dea3 100644 --- a/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_trsm_l_template_noopt_mxn.c @@ -40,8 +40,8 @@ void bli_ztrsm_l_template_noopt dcomplex* restrict a11, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c index 661167c9ca..42982459ad 100644 --- a/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c +++ b/config/template/kernels/3/bli_trsm_u_template_noopt_mxn.c @@ -40,8 +40,8 @@ void bli_ztrsm_u_template_noopt dcomplex* restrict a11, dcomplex* restrict b11, dcomplex* restrict c11, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { /* diff --git a/frame/0/bli_l0_ft.h b/frame/0/bli_l0_ft.h index 96e3573c65..01d90cc3bd 100644 --- a/frame/0/bli_l0_ft.h +++ b/frame/0/bli_l0_ft.h @@ -44,7 +44,7 @@ \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjchi, \ + conj_t conjchi, \ const ctype* chi, \ ctype* psi \ ); @@ -73,7 +73,7 @@ INSERT_GENTDEF( invertsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t 
conjchi, \ + conj_t conjchi, \ const ctype* chi, \ ctype* psi \ ); @@ -87,8 +87,8 @@ INSERT_GENTDEF( mulsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - const ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ); INSERT_GENTDEFR( absqsc ) @@ -100,8 +100,8 @@ INSERT_GENTDEFR( absqsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - const ctype* chi, \ - ctype_r* norm \ + const ctype* chi, \ + ctype_r* norm \ ); INSERT_GENTDEFR( normfsc ) @@ -126,9 +126,9 @@ INSERT_GENTDEF( sqrtsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - const ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ); INSERT_GENTDEF( getsc ) @@ -154,9 +154,9 @@ INSERT_GENTDEF( setsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - const ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ); INSERT_GENTDEFR( unzipsc ) @@ -170,7 +170,7 @@ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ const ctype_r* zeta_r, \ const ctype_r* zeta_i, \ - ctype* chi \ + ctype* chi \ ); INSERT_GENTDEFR( zipsc ) diff --git a/frame/0/bli_l0_oapi.c b/frame/0/bli_l0_oapi.c index 3fb903e987..e7b1274025 100644 --- a/frame/0/bli_l0_oapi.c +++ b/frame/0/bli_l0_oapi.c @@ -188,9 +188,9 @@ GENFRONT( sqrtsc ) \ void PASTEMAC0(opname) \ ( \ - const obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ + const obj_t* chi, \ + double* zeta_r, \ + double* zeta_i \ ) \ { \ bli_init_once(); \ @@ -232,8 +232,8 @@ GENFRONT( getsc ) \ void PASTEMAC0(opname) \ ( \ - double zeta_r, \ - double zeta_i, \ + double zeta_r, \ + double zeta_i, \ const obj_t* chi \ ) \ { \ diff --git a/frame/0/bli_l0_oapi.h b/frame/0/bli_l0_oapi.h index ef111ebe40..ad9094e553 100644 --- a/frame/0/bli_l0_oapi.h +++ b/frame/0/bli_l0_oapi.h @@ -82,9 +82,9 @@ GENPROT( invertsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - const obj_t* chi, \ - double* zeta_r, \ - double* zeta_i \ + const obj_t* chi, 
\ + double* zeta_r, \ + double* zeta_i \ ); GENPROT( getsc ) @@ -95,8 +95,8 @@ GENPROT( getsc ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - double zeta_r, \ - double zeta_i, \ + double zeta_r, \ + double zeta_i, \ const obj_t* chi \ ); diff --git a/frame/0/bli_l0_tapi.c b/frame/0/bli_l0_tapi.c index 55dbcab722..cd2876a143 100644 --- a/frame/0/bli_l0_tapi.c +++ b/frame/0/bli_l0_tapi.c @@ -43,7 +43,7 @@ \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ + conj_t conjchi, \ const ctype* chi, \ ctype* psi \ ) \ @@ -87,7 +87,7 @@ INSERT_GENTFUNC_BASIC( invertsc, inverts ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ + conj_t conjchi, \ const ctype* chi, \ ctype* psi \ ) \ @@ -116,8 +116,8 @@ INSERT_GENTFUNC_BASIC( mulsc, scals ) \ void PASTEMAC(ch,opname) \ ( \ - const ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ) \ { \ bli_init_once(); \ @@ -145,8 +145,8 @@ INSERT_GENTFUNCR_BASIC0( absqsc ) \ void PASTEMAC(ch,opname) \ ( \ - const ctype* chi, \ - ctype_r* norm \ + const ctype* chi, \ + ctype_r* norm \ ) \ { \ bli_init_once(); \ @@ -181,9 +181,9 @@ INSERT_GENTFUNC_BASIC0( sqrtsc ) \ void PASTEMAC(ch,opname) \ ( \ - const ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ) \ { \ bli_init_once(); \ @@ -217,9 +217,9 @@ INSERT_GENTFUNC_BASIC0( setsc ) \ void PASTEMAC(ch,opname) \ ( \ - const ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ) \ { \ bli_init_once(); \ @@ -237,7 +237,7 @@ void PASTEMAC(ch,opname) \ ( \ const ctype_r* zeta_r, \ const ctype_r* zeta_i, \ - ctype* chi \ + ctype* chi \ ) \ { \ bli_init_once(); \ @@ -251,9 +251,9 @@ INSERT_GENTFUNCR_BASIC0( zipsc ) void bli_igetsc ( - const dim_t* chi, - double* zeta_r, - double* zeta_i + const dim_t* chi, + double* zeta_r, + double* zeta_i ) { bli_init_once(); diff --git a/frame/0/bli_l0_tapi.h b/frame/0/bli_l0_tapi.h index 
3ff7667869..66a31c4328 100644 --- a/frame/0/bli_l0_tapi.h +++ b/frame/0/bli_l0_tapi.h @@ -42,7 +42,7 @@ \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ + conj_t conjchi, \ const ctype* chi, \ ctype* psi \ ); @@ -70,8 +70,8 @@ INSERT_GENTPROT_BASIC0( invertsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - const ctype* chi, \ - ctype_r* absq \ + const ctype* chi, \ + ctype_r* absq \ ); INSERT_GENTPROTR_BASIC0( absqsc ) @@ -95,9 +95,9 @@ INSERT_GENTPROT_BASIC0( sqrtsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - const ctype* chi, \ - double* zeta_r, \ - double* zeta_i \ + const ctype* chi, \ + double* zeta_r, \ + double* zeta_i \ ); INSERT_GENTPROT_BASIC0( getsc ) @@ -121,9 +121,9 @@ INSERT_GENTPROT_BASIC0( setsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - const ctype* chi, \ - ctype_r* zeta_r, \ - ctype_r* zeta_i \ + const ctype* chi, \ + ctype_r* zeta_r, \ + ctype_r* zeta_i \ ); INSERT_GENTPROTR_BASIC0( unzipsc ) @@ -136,7 +136,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ const ctype_r* zeta_r, \ const ctype_r* zeta_i, \ - ctype* chi \ + ctype* chi \ ); INSERT_GENTPROTR_BASIC0( zipsc ) @@ -145,9 +145,9 @@ INSERT_GENTPROTR_BASIC0( zipsc ) BLIS_EXPORT_BLIS void bli_igetsc ( - const dim_t* chi, - double* zeta_r, - double* zeta_i + const dim_t* chi, + double* zeta_r, + double* zeta_i ); BLIS_EXPORT_BLIS void bli_isetsc diff --git a/frame/0/copysc/bli_copysc.c b/frame/0/copysc/bli_copysc.c index 04f4efbbe8..28de4d6c89 100644 --- a/frame/0/copysc/bli_copysc.c +++ b/frame/0/copysc/bli_copysc.c @@ -41,9 +41,9 @@ typedef void (*FUNCPTR_T) ( - conj_t conjchi, - const void* chi, - void* psi + conj_t conjchi, + const void* chi, + void* psi ); static FUNCPTR_T GENARRAY2_ALL(ftypes,copysc); @@ -105,9 +105,9 @@ GENFRONT( copysc ) \ void PASTEMAC2(chx,chy,varname) \ ( \ - conj_t conjchi, \ - const void* chi, \ - void* psi \ + conj_t conjchi, \ + const void* chi, \ + void* psi \ ) \ { \ bli_init_once(); \ diff --git 
a/frame/0/copysc/bli_copysc.h b/frame/0/copysc/bli_copysc.h index 1c2ee4e862..cd5481e576 100644 --- a/frame/0/copysc/bli_copysc.h +++ b/frame/0/copysc/bli_copysc.h @@ -57,9 +57,9 @@ GENFRONT( copysc ) \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,varname) \ ( \ - conj_t conjchi, \ - const void* chi, \ - void* psi \ + conj_t conjchi, \ + const void* chi, \ + void* psi \ ); INSERT_GENTPROT2_BASIC0( copysc ) diff --git a/frame/1/bli_l1v_ft.h b/frame/1/bli_l1v_ft.h index 7f1d93ad34..57f9d223a8 100644 --- a/frame/1/bli_l1v_ft.h +++ b/frame/1/bli_l1v_ft.h @@ -44,8 +44,8 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ @@ -62,9 +62,9 @@ INSERT_GENTDEF( subv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ + dim_t n, \ const ctype* x, inc_t incx, \ - dim_t* index \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); @@ -77,8 +77,8 @@ INSERT_GENTDEF( amaxv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* beta, \ @@ -95,8 +95,8 @@ INSERT_GENTDEF( axpbyv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ ctype* y, inc_t incy \ @@ -113,9 +113,9 @@ INSERT_GENTDEF( scal2v ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ ctype* rho \ @@ -131,9 +131,9 @@ INSERT_GENTDEF( dotv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ 
-165,8 +165,8 @@ INSERT_GENTDEF( invertv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - dim_t n, \ + conj_t conjalpha, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ @@ -197,8 +197,8 @@ INSERT_GENTDEF( swapv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* beta, \ ctype* y, inc_t incy \ diff --git a/frame/1/bli_l1v_ker_prot.h b/frame/1/bli_l1v_ker_prot.h index 1a1eec3f38..b912ba7e00 100644 --- a/frame/1/bli_l1v_ker_prot.h +++ b/frame/1/bli_l1v_ker_prot.h @@ -45,7 +45,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -56,7 +56,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict index, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -70,7 +70,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -83,7 +83,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -95,7 +95,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -109,7 +109,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -125,7 +125,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -135,7 +135,7 @@ void PASTEMAC(ch,opname) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* 
cntx \ ); \ @@ -147,7 +147,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -160,7 +160,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -172,7 +172,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -184,7 +184,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -195,7 +195,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ @@ -208,6 +208,6 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); \ diff --git a/frame/1/bli_l1v_tapi.c b/frame/1/bli_l1v_tapi.c index d9e247978f..9a638b1d73 100644 --- a/frame/1/bli_l1v_tapi.c +++ b/frame/1/bli_l1v_tapi.c @@ -45,8 +45,8 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ @@ -67,9 +67,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n, \ - x, incx, \ - y, incy, \ - cntx \ + ( ctype* )x, incx, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -83,9 +83,9 @@ INSERT_GENTFUNC_BASIC( subv, BLIS_SUBV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - dim_t* index \ + dim_t n, \ + const ctype* x, inc_t incx, \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -103,9 +103,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ f \ ( \ n, \ - x, incx, \ + ( ctype* )x, incx, \ index, \ - cntx \ + ( cntx_t* )cntx \ ); \ } @@ -117,8 +117,8 @@ 
INSERT_GENTFUNC_BASIC( amaxv, BLIS_AMAXV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* beta, \ @@ -141,11 +141,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n, \ - alpha, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -157,8 +157,8 @@ INSERT_GENTFUNC_BASIC( axpbyv, BLIS_AXPBYV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ ctype* y, inc_t incy \ @@ -181,10 +181,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n, \ - alpha, \ - x, incx, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -197,9 +197,9 @@ INSERT_GENTFUNC_BASIC( scal2v, BLIS_SCAL2V_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ ctype* rho \ @@ -222,10 +222,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjy, \ n, \ - x, incx, \ - y, incy, \ - rho, \ - cntx \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + rho, \ + ( cntx_t* )cntx \ ); \ } @@ -237,9 +237,9 @@ INSERT_GENTFUNC_BASIC( dotv, BLIS_DOTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -264,12 +264,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjy, \ n, \ - alpha, \ - x, incx, \ - y, incy, \ - beta, \ - rho, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + ( ctype* )beta, \ + rho, \ + ( cntx_t* )cntx \ ); \ } @@ -301,7 +301,7 @@ void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ n, \ x, incx, \ - cntx \ + ( cntx_t* )cntx \ ); \ } @@ -313,8 +313,8 @@ INSERT_GENTFUNC_BASIC( invertv, BLIS_INVERTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - dim_t n, \ + conj_t conjalpha, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ @@ -335,9 +335,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjalpha, \ n, \ - alpha, \ - x, incx, \ - cntx \ + ( ctype* )alpha, \ + x, incx, \ + ( cntx_t* )cntx \ ); \ } @@ -372,7 +372,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ n, \ x, incx, \ y, incy, \ - cntx \ + ( cntx_t* )cntx \ ); \ } @@ -383,8 +383,8 @@ INSERT_GENTFUNC_BASIC( swapv, BLIS_SWAPV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* beta, \ ctype* y, inc_t incy \ @@ -406,10 +406,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/1/bli_l1v_tapi.h b/frame/1/bli_l1v_tapi.h index 7514f617e0..ed6d0508e8 100644 --- a/frame/1/bli_l1v_tapi.h +++ b/frame/1/bli_l1v_tapi.h @@ -42,8 +42,8 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ ctype* y, inc_t incy \ BLIS_TAPI_EX_PARAMS \ @@ -59,9 +59,9 @@ INSERT_GENTPROT_BASIC0( subv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ + dim_t n, \ const ctype* x, inc_t incx, \ - dim_t* index \ + dim_t* index \ BLIS_TAPI_EX_PARAMS \ ); \ @@ -73,8 +73,8 @@ INSERT_GENTPROT_BASIC0( amaxv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* beta, \ @@ -90,8 +90,8 @@ INSERT_GENTPROT_BASIC0( axpbyv ) \ BLIS_EXPORT_BLIS void 
PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ ctype* y, inc_t incy \ @@ -107,9 +107,9 @@ INSERT_GENTPROT_BASIC0( scal2v ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ ctype* rho \ @@ -124,9 +124,9 @@ INSERT_GENTPROT_BASIC0( dotv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -156,8 +156,8 @@ INSERT_GENTPROT_BASIC0( invertv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - dim_t n, \ + conj_t conjalpha, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t incx \ BLIS_TAPI_EX_PARAMS \ @@ -186,8 +186,8 @@ INSERT_GENTPROT_BASIC0( swapv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* beta, \ ctype* y, inc_t incy \ diff --git a/frame/1d/bli_l1d_ft.h b/frame/1d/bli_l1d_ft.h index 5eecdafe1a..8bbbfd2ee8 100644 --- a/frame/1d/bli_l1d_ft.h +++ b/frame/1d/bli_l1d_ft.h @@ -44,11 +44,11 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ @@ -65,11 +65,11 @@ INSERT_GENTDEF( subd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, 
\ const ctype* alpha, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -102,10 +102,10 @@ INSERT_GENTDEF( invertd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -121,9 +121,9 @@ INSERT_GENTDEF( setd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const ctype_r* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -138,9 +138,9 @@ INSERT_GENTDEFR( setid ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -155,11 +155,11 @@ INSERT_GENTDEF( shiftd ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ diff --git a/frame/1d/bli_l1d_tapi.c b/frame/1d/bli_l1d_tapi.c index 560cb40efc..46499a55a4 100644 --- a/frame/1d/bli_l1d_tapi.c +++ b/frame/1d/bli_l1d_tapi.c @@ -45,11 +45,11 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ @@ -108,9 +108,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n_elem, \ - x1, incx, \ - y1, incy, \ - cntx \ + ( ctype* )x1, incx, \ + y1, incy, \ + ( cntx_t* 
)cntx \ ); \ } @@ -124,11 +124,11 @@ INSERT_GENTFUNC_BASIC2( subd, subv, BLIS_SUBV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -188,10 +188,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n_elem, \ - alpha, \ - x1, incx, \ - y1, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x1, incx, \ + y1, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -248,7 +248,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ n_elem, \ x1, incx, \ - cntx \ + ( cntx_t* )cntx \ ); \ } @@ -260,10 +260,10 @@ INSERT_GENTFUNC_BASIC2( invertd, invertv, BLIS_INVERTV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -306,9 +306,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjalpha, \ n_elem, \ - alpha, \ - x1, incx, \ - cntx \ + ( ctype* )alpha, \ + x1, incx, \ + ( cntx_t* )cntx \ ); \ } @@ -321,11 +321,11 @@ INSERT_GENTFUNC_BASIC2( setd, setv, BLIS_SETV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -383,9 +383,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ - alpha, \ - x1, incx, \ - cntx \ + ( ctype_r* )alpha, \ + x1, incx, \ + ( cntx_t* )cntx \ ); \ } @@ -397,9 +397,9 @@ INSERT_GENTFUNCR_BASIC2( setid, setv, BLIS_SETV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const ctype* 
alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -442,9 +442,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ n_elem, \ - alpha, 0, \ - x1, incx, \ - cntx \ + ( ctype* )alpha, 0, \ + x1, incx, \ + ( cntx_t* )cntx \ ); \ } @@ -456,11 +456,11 @@ INSERT_GENTFUNC_BASIC2( shiftd, addv, BLIS_ADDV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -520,10 +520,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ conjx, \ n_elem, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ + ( ctype* )x1, incx, \ + ( ctype* )beta, \ + y1, incy, \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/1d/bli_l1d_tapi.h b/frame/1d/bli_l1d_tapi.h index d063db3617..50fa0b1503 100644 --- a/frame/1d/bli_l1d_tapi.h +++ b/frame/1d/bli_l1d_tapi.h @@ -42,11 +42,11 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ @@ -62,11 +62,11 @@ INSERT_GENTPROT_BASIC0( subd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -97,10 +97,10 @@ INSERT_GENTPROT_BASIC0( invertd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + conj_t conjalpha, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const 
ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -115,11 +115,11 @@ INSERT_GENTPROT_BASIC0( setd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const ctype_r* alpha, \ - ctype* x, inc_t rs_x, inc_t cs_x \ + ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ ); @@ -131,9 +131,9 @@ INSERT_GENTPROTR_BASIC0( setid ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -147,11 +147,11 @@ INSERT_GENTPROT_BASIC0( shiftd ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ diff --git a/frame/1f/bli_l1f_ft.h b/frame/1f/bli_l1f_ft.h index 76f036dbfa..e5e2e86667 100644 --- a/frame/1f/bli_l1f_ft.h +++ b/frame/1f/bli_l1f_ft.h @@ -44,9 +44,9 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* alpha1, \ const ctype* alpha2, \ const ctype* x, inc_t incx, \ @@ -64,10 +64,10 @@ INSERT_GENTDEF( axpy2v ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* x, inc_t incx, \ @@ -84,10 +84,10 @@ INSERT_GENTDEF( axpyf ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ + conj_t conjxt, \ + conj_t conjx, \ + 
conj_t conjy, \ + dim_t m, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -105,10 +105,10 @@ INSERT_GENTDEF( dotaxpyv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* x, inc_t incx, \ @@ -126,12 +126,12 @@ INSERT_GENTDEF( dotxf ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* w, inc_t incw, \ diff --git a/frame/1f/bli_l1f_ker_prot.h b/frame/1f/bli_l1f_ker_prot.h index 18eea45687..4393faf10d 100644 --- a/frame/1f/bli_l1f_ker_prot.h +++ b/frame/1f/bli_l1f_ker_prot.h @@ -49,7 +49,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -65,7 +65,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -82,7 +82,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -103,7 +103,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -120,6 +120,6 @@ void PASTEMAC(ch,opname) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); diff --git a/frame/1f/bli_l1f_tapi.c 
b/frame/1f/bli_l1f_tapi.c index 31d2553ba9..96bf96cdea 100644 --- a/frame/1f/bli_l1f_tapi.c +++ b/frame/1f/bli_l1f_tapi.c @@ -45,9 +45,9 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* alphax, \ const ctype* alphay, \ const ctype* x, inc_t incx, \ @@ -72,12 +72,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjy, \ n, \ - alphax, \ - alphay, \ - x, incx, \ - y, incy, \ - z, incz, \ - cntx \ + ( ctype* )alphax, \ + ( ctype* )alphay, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + z, incz, \ + ( cntx_t* )cntx \ ); \ } @@ -89,10 +89,10 @@ INSERT_GENTFUNC_BASIC( axpy2v, BLIS_AXPY2V_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* x, inc_t incx, \ @@ -117,11 +117,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ m, \ b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, inca, lda, \ + ( ctype* )x, incx, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -133,10 +133,10 @@ INSERT_GENTFUNC_BASIC( axpyf, BLIS_AXPYF_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -162,12 +162,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjy, \ n, \ - alpha, \ - x, incx, \ - y, incy, \ - rho, \ - z, incz, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + rho, \ + z, incz, \ + ( cntx_t* )cntx \ ); \ } @@ -179,12 +179,12 @@ INSERT_GENTFUNC_BASIC( dotaxpyv, BLIS_DOTAXPYV_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, 
\ - dim_t m, \ - dim_t b_n, \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* w, inc_t incw, \ @@ -214,14 +214,14 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ m, \ b_n, \ - alpha, \ - a, inca, lda, \ - w, incw, \ - x, incx, \ - beta, \ - y, incy, \ - z, incz, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, inca, lda, \ + ( ctype* )w, incw, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + z, incz, \ + ( cntx_t* )cntx \ ); \ } @@ -233,10 +233,10 @@ INSERT_GENTFUNC_BASIC( dotxaxpyf, BLIS_DOTXAXPYF_KER ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* x, inc_t incx, \ @@ -262,12 +262,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ m, \ b_n, \ - alpha, \ - a, inca, lda, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, inca, lda, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/1f/bli_l1f_tapi.h b/frame/1f/bli_l1f_tapi.h index 93ef982933..d62ce0d636 100644 --- a/frame/1f/bli_l1f_tapi.h +++ b/frame/1f/bli_l1f_tapi.h @@ -42,9 +42,9 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* alphax, \ const ctype* alphay, \ const ctype* x, inc_t incx, \ @@ -61,10 +61,10 @@ INSERT_GENTPROT_BASIC0( axpy2v ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* x, inc_t incx, \ @@ -80,10 +80,10 @@ INSERT_GENTPROT_BASIC0( axpyf ) \ 
BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjxt, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t n, \ + conj_t conjxt, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -100,12 +100,12 @@ INSERT_GENTPROT_BASIC0( dotaxpyv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conja, \ - conj_t conjw, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ + conj_t conjat, \ + conj_t conja, \ + conj_t conjw, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* w, inc_t incw, \ @@ -124,10 +124,10 @@ INSERT_GENTPROT_BASIC0( dotxaxpyf ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjat, \ - conj_t conjx, \ - dim_t m, \ - dim_t b_n, \ + conj_t conjat, \ + conj_t conjx, \ + dim_t m, \ + dim_t b_n, \ const ctype* alpha, \ const ctype* a, inc_t inca, inc_t lda, \ const ctype* x, inc_t incx, \ diff --git a/frame/1m/bli_l1m_ft.h b/frame/1m/bli_l1m_ft.h index fd5c023cb1..55a8fd29bf 100644 --- a/frame/1m/bli_l1m_ft.h +++ b/frame/1m/bli_l1m_ft.h @@ -44,12 +44,12 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ @@ -66,12 +66,12 @@ INSERT_GENTDEF( copym ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -87,12 +87,12 @@ INSERT_GENTDEF( 
axpym ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -108,12 +108,12 @@ INSERT_GENTDEF( scal2m ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -129,12 +129,12 @@ INSERT_GENTDEF( setm ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ diff --git a/frame/1m/bli_l1m_ft_ker.h b/frame/1m/bli_l1m_ft_ker.h index 41d80e217d..f25c3c9438 100644 --- a/frame/1m/bli_l1m_ft_ker.h +++ b/frame/1m/bli_l1m_ft_ker.h @@ -90,7 +90,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); INSERT_GENTDEF( packm_cxk ) @@ -109,7 +109,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); INSERT_GENTDEF( unpackm_cxk ) @@ -132,7 +132,7 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* 
restrict cntx \ + cntx_t* cntx \ ); INSERT_GENTDEF( packm_cxc_diag ) diff --git a/frame/1m/bli_l1m_ker_prot.h b/frame/1m/bli_l1m_ker_prot.h index 80284ea223..8430614d28 100644 --- a/frame/1m/bli_l1m_ker_prot.h +++ b/frame/1m/bli_l1m_ker_prot.h @@ -51,7 +51,7 @@ void PASTEMAC(ch,varname) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -68,7 +68,7 @@ void PASTEMAC(ch,varname) \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); @@ -89,6 +89,6 @@ void PASTEMAC(ch,varname) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ); diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 496f79e0b2..325ed0ecff 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -46,10 +46,10 @@ typedef void (*PASTECH(opname,_var_oft)) \ ( \ const obj_t* a, \ - obj_t* p, \ + obj_t* p, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + rntm_t* rntm, \ + cntl_t* cntl, \ const thrinfo_t* thread \ ); diff --git a/frame/1m/bli_l1m_tapi.c b/frame/1m/bli_l1m_tapi.c index ff84f03827..1b88afe898 100644 --- a/frame/1m/bli_l1m_tapi.c +++ b/frame/1m/bli_l1m_tapi.c @@ -45,12 +45,12 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ @@ -75,9 +75,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -110,12 +110,12 @@ 
INSERT_GENTFUNC_BASIC( subm, subd ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ @@ -140,9 +140,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -179,12 +179,12 @@ INSERT_GENTFUNC_BASIC0( copym ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -213,10 +213,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - alpha, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )alpha, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -249,12 +249,12 @@ INSERT_GENTFUNC_BASIC0( axpym ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -302,10 +302,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - alpha, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )alpha, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -341,12 +341,12 @@ INSERT_GENTFUNC_BASIC0( scal2m ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t 
diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -371,9 +371,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ uplox, \ m, \ n, \ - alpha, \ - x, rs_x, cs_x, \ - cntx, \ + ( ctype* )alpha, \ + x, rs_x, cs_x, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -387,12 +387,12 @@ INSERT_GENTFUNC_BASIC0( setm ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -419,9 +419,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -438,10 +438,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - beta, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + ( ctype* )beta, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -474,12 +474,12 @@ INSERT_GENTFUNC_BASIC0( xpbym ) \ void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype_x* x, inc_t rs_x, inc_t cs_x, \ const ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y \ @@ -520,10 +520,10 @@ void PASTEMAC3(chx,chy,opname,EX_SUF) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - beta, \ - y, rs_y, cs_y, \ - cntx, \ + ( ctype_x* )x, rs_x, cs_x, \ + ( ctype_y* )beta, \ + y, rs_y, cs_y, \ + ( cntx_t* )cntx, \ rntm \ ); \ } diff --git a/frame/1m/bli_l1m_tapi.h 
b/frame/1m/bli_l1m_tapi.h index ff99047ebc..fe82be3bf4 100644 --- a/frame/1m/bli_l1m_tapi.h +++ b/frame/1m/bli_l1m_tapi.h @@ -42,12 +42,12 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ BLIS_TAPI_EX_PARAMS \ @@ -63,12 +63,12 @@ INSERT_GENTPROT_BASIC0( subm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -84,12 +84,12 @@ INSERT_GENTPROT_BASIC0( scal2m ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjalpha, \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ + conj_t conjalpha, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x \ BLIS_TAPI_EX_PARAMS \ @@ -104,12 +104,12 @@ INSERT_GENTPROT_BASIC0( setm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y \ @@ -124,12 +124,12 @@ INSERT_GENTPROT_BASIC0( xpbym ) \ BLIS_EXPORT_BLIS void PASTEMAC3(chx,chy,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ + doff_t diagoffx, \ + diag_t diagx, \ + 
uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ const ctype_x* x, inc_t rs_x, inc_t cs_x, \ const ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y \ diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index bd292c9249..22ed31ecc5 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -37,9 +37,9 @@ void* bli_packm_alloc ( - siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ) { @@ -58,10 +58,10 @@ void* bli_packm_alloc void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, + siz_t size_needed, + packbuf_t pack_buf_type, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ) { diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h index c7d0325aed..aec2e1af53 100644 --- a/frame/1m/packm/bli_packm_alloc.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -34,18 +34,18 @@ BLIS_EXPORT_BLIS void* bli_packm_alloc ( - siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, + siz_t size_needed, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, + siz_t size_needed, + packbuf_t pack_buf_type, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 7296423c8e..601f2c05c5 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -57,8 +57,8 @@ void bli_packm_blk_var1 const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ) { @@ -271,7 +271,7 @@ void bli_packm_blk_var1 c_use, incc, ldc, p_use, ldp, is_p_use, - cntx, + ( cntx_t* )cntx, params ); } @@ -303,7 +303,7 @@ void bli_packm_blk_var1 kappa_cast, c_begin, incc, ldc, 
p_begin, ldp, is_p, - cntx, + ( cntx_t* )cntx, params ); } } diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index f35bf3cf99..2fec23902e 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -52,8 +52,8 @@ BLIS_EXPORT_BLIS void bli_packm_blk_var1 const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* t ); diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index e4cbcc49cf..67e02ac0e5 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -40,8 +40,8 @@ bool bli_packm_init const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ) { diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 6f0997bc3b..6f9b472736 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -37,8 +37,8 @@ BLIS_EXPORT_BLIS bool bli_packm_init const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index 45872ebb07..f76607508c 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -39,8 +39,8 @@ void bli_packm_int const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ) { diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index 389c49ad59..a4cf17d592 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -37,7 +37,7 @@ void bli_packm_int const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + rntm_t* rntm, + cntl_t* cntl, const thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_struc_cxk.c 
b/frame/1m/packm/bli_packm_struc_cxk.c index 0cf4ac9304..2b8f92eb5f 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -39,23 +39,23 @@ \ void PASTEMAC(ch,varname) \ ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ + ctype* kappa, \ + ctype* c, inc_t incc, inc_t ldc, \ + ctype* p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx \ ) \ { \ num_t dt = PASTEMAC(ch,type); \ @@ -132,7 +132,7 @@ void PASTEMAC(ch,varname) \ { \ if ( bli_is_1m_packed( schema ) ) \ { \ - ctype_r* restrict zero = PASTEMAC(chr,0); \ + ctype_r* zero = PASTEMAC(chr,0); \ PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ @@ -149,7 +149,7 @@ void PASTEMAC(ch,varname) \ } \ else \ { \ - ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* zero = PASTEMAC(ch,0); \ PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ @@ -239,7 +239,7 @@ void PASTEMAC(ch,varname) \ { \ if ( bli_is_1m_packed( schema ) ) \ { \ - ctype_r* restrict zero = PASTEMAC(chr,0); \ + ctype_r* zero = PASTEMAC(chr,0); \ PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ @@ -256,7 +256,7 @@ void PASTEMAC(ch,varname) \ } \ else \ { \ - ctype* restrict zero = PASTEMAC(ch,0); \ + ctype* zero = PASTEMAC(ch,0); \ PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ ( \ BLIS_NO_CONJUGATE, \ diff --git a/frame/1m/packm/bli_packm_struc_cxk.h b/frame/1m/packm/bli_packm_struc_cxk.h index 
973a02612b..f0293330b5 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.h +++ b/frame/1m/packm/bli_packm_struc_cxk.h @@ -37,26 +37,24 @@ \ void PASTEMAC(ch,varname) \ ( \ - struc_t strucc, \ - diag_t diagc, \ - uplo_t uploc, \ - conj_t conjc, \ - pack_t schema, \ - bool invdiag, \ - dim_t panel_dim, \ - dim_t panel_len, \ - dim_t panel_dim_max, \ - dim_t panel_len_max, \ - dim_t panel_dim_off, \ - dim_t panel_len_off, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t incc, inc_t ldc, \ - ctype* restrict p, inc_t ldp, \ - inc_t is_p, \ - cntx_t* cntx \ + struc_t strucc, \ + diag_t diagc, \ + uplo_t uploc, \ + conj_t conjc, \ + pack_t schema, \ + bool invdiag, \ + dim_t panel_dim, \ + dim_t panel_len, \ + dim_t panel_dim_max, \ + dim_t panel_len_max, \ + dim_t panel_dim_off, \ + dim_t panel_len_off, \ + ctype* kappa, \ + ctype* c, inc_t incc, inc_t ldc, \ + ctype* p, inc_t ldp, \ + inc_t is_p, \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( packm_struc_cxk ) -INSERT_GENTPROT_BASIC0( packm_herm_cxk ) -INSERT_GENTPROT_BASIC0( packm_tri_cxk ) diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index d8193fbc1a..fb5afa61f2 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -143,28 +143,28 @@ void PASTEMAC(ch,varname) \ const cntx_t* cntx \ ) \ { \ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ - ctype* restrict c_begin; \ - ctype* restrict p_begin; \ + ctype* one = PASTEMAC(ch,1); \ + ctype* c_cast = c; \ + ctype* p_cast = p; \ + ctype* c_begin; \ + ctype* p_begin; \ \ - dim_t iter_dim; \ - dim_t num_iter; \ - dim_t it, ic, ip; \ - dim_t ic0, ip0; \ - doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ - dim_t panel_len; \ - dim_t panel_dim_i; \ - dim_t panel_dim_max; \ - inc_t vs_c; \ - inc_t incc, ldc; \ - inc_t ldp; \ - dim_t* m_panel_full; \ - dim_t* n_panel_full; \ - pack_t schema; \ + dim_t 
iter_dim; \ + dim_t num_iter; \ + dim_t it, ic, ip; \ + dim_t ic0, ip0; \ + doff_t ic_inc, ip_inc; \ + doff_t diagoffc_i; \ + doff_t diagoffc_inc; \ + dim_t panel_len; \ + dim_t panel_dim_i; \ + dim_t panel_dim_max; \ + inc_t vs_c; \ + inc_t incc, ldc; \ + inc_t ldp; \ + dim_t* m_panel_full; \ + dim_t* n_panel_full; \ + pack_t schema; \ \ \ /* If c needs a transposition, induce it so that we can more simply @@ -272,7 +272,7 @@ void PASTEMAC(ch,varname) \ one, \ p_begin, ldp, \ c_begin, incc, ldc, \ - cntx \ + ( cntx_t* )cntx \ ); \ } \ \ diff --git a/frame/1m/unpackm/bli_unpackm_check.c b/frame/1m/unpackm/bli_unpackm_check.c index e397c311f9..786edd4c83 100644 --- a/frame/1m/unpackm/bli_unpackm_check.c +++ b/frame/1m/unpackm/bli_unpackm_check.c @@ -1,4 +1,4 @@ -const /* +/* BLIS An object-based framework for developing high-performance BLAS-like diff --git a/frame/2/bli_l2_ft.h b/frame/2/bli_l2_ft.h index 410b796658..cb1799966c 100644 --- a/frame/2/bli_l2_ft.h +++ b/frame/2/bli_l2_ft.h @@ -44,10 +44,10 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t cs_a, \ const ctype* x, inc_t incx, \ @@ -65,10 +65,10 @@ INSERT_GENTDEF( gemv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -85,10 +85,10 @@ INSERT_GENTDEF( ger ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t cs_a, \ const ctype* x, inc_t incx, \ @@ -107,9 +107,9 @@ INSERT_GENTDEF( symv ) \ typedef void 
(*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ const ctype_r* alpha, \ const ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ @@ -125,9 +125,9 @@ INSERT_GENTDEFR( her ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ @@ -143,10 +143,10 @@ INSERT_GENTDEF( syr ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -164,10 +164,10 @@ INSERT_GENTDEF( syr2 ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ diff --git a/frame/2/bli_l2_tapi.c b/frame/2/bli_l2_tapi.c index 17f50ac7bc..f99fb8f8c4 100644 --- a/frame/2/bli_l2_tapi.c +++ b/frame/2/bli_l2_tapi.c @@ -45,10 +45,10 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t cs_a, \ const ctype* x, inc_t incx, \ @@ -111,12 +111,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ m, \ n, \ - alpha, \ - a, rs_a, cs_a, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, rs_a, cs_a, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -128,10 +128,10 @@ INSERT_GENTFUNC_BASIC3( gemv, gemv, gemv_unf_var1, gemv_unf_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( 
\ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -164,11 +164,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjy, \ m, \ n, \ - alpha, \ - x, incx, \ - y, incy, \ - a, rs_a, cs_a, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + a, rs_a, cs_a, \ + ( cntx_t* )cntx \ ); \ } @@ -180,10 +180,10 @@ INSERT_GENTFUNC_BASIC3( ger, ger, ger_unb_var1, ger_unb_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t cs_a, \ const ctype* x, inc_t incx, \ @@ -239,12 +239,12 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjh, /* used by variants to distinguish hemv from symv */ \ m, \ - alpha, \ - a, rs_a, cs_a, \ - x, incx, \ - beta, \ - y, incy, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, rs_a, cs_a, \ + ( ctype* )x, incx, \ + ( ctype* )beta, \ + y, incy, \ + ( cntx_t* )cntx \ ); \ } @@ -257,9 +257,9 @@ INSERT_GENTFUNC_BASIC4( symv, hemv, BLIS_NO_CONJUGATE, hemv_unf_var1, hemv_unf_v \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ const ctype_r* alpha, \ const ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ @@ -306,10 +306,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjh, /* used by variants to distinguish her from syr */ \ m, \ - &alpha_local, \ - x, incx, \ - a, rs_a, cs_a, \ - cntx \ + ( ctype* )&alpha_local, \ + ( ctype* )x, incx, \ + a, rs_a, cs_a, \ + ( cntx_t* )cntx \ ); \ } @@ -321,9 +321,9 @@ INSERT_GENTFUNCR_BASIC4( her, her, BLIS_CONJUGATE, her_unb_var1, her_unb_var2 ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ 
+ dim_t m, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ @@ -363,10 +363,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjx, \ conjh, /* used by variants to distinguish her2 from syr2 */ \ m, \ - alpha, \ - x, incx, \ - a, rs_a, cs_a, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + a, rs_a, cs_a, \ + ( cntx_t* )cntx \ ); \ } @@ -378,10 +378,10 @@ INSERT_GENTFUNC_BASIC4( syr, her, BLIS_NO_CONJUGATE, her_unb_var1, her_unb_var2 \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -423,11 +423,11 @@ void PASTEMAC2(ch,opname,EX_SUF) \ conjy, \ conjh, \ m, \ - alpha, \ - x, incx, \ - y, incy, \ - a, rs_a, cs_a, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )x, incx, \ + ( ctype* )y, incy, \ + a, rs_a, cs_a, \ + ( cntx_t* )cntx \ ); \ } @@ -440,10 +440,10 @@ INSERT_GENTFUNC_BASIC4( syr2, her2, BLIS_NO_CONJUGATE, her2_unf_var1, her2_unf_v \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ @@ -498,10 +498,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ transa, \ diaga, \ m, \ - alpha, \ - a, rs_a, cs_a, \ - x, incx, \ - cntx \ + ( ctype* )alpha, \ + ( ctype* )a, rs_a, cs_a, \ + x, incx, \ + ( cntx_t* )cntx \ ); \ } diff --git a/frame/2/bli_l2_tapi.h b/frame/2/bli_l2_tapi.h index 072c87a2f1..a95604d3c0 100644 --- a/frame/2/bli_l2_tapi.h +++ b/frame/2/bli_l2_tapi.h @@ -42,10 +42,10 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - trans_t transa, \ - conj_t conjx, \ - dim_t m, \ - dim_t n, \ + trans_t transa, \ + conj_t conjx, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t 
cs_a, \ const ctype* x, inc_t incx, \ @@ -62,10 +62,10 @@ INSERT_GENTPROT_BASIC0( gemv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ - dim_t n, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ + dim_t n, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -81,10 +81,10 @@ INSERT_GENTPROT_BASIC0( ger ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conja, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conja, \ + conj_t conjx, \ + dim_t m, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t cs_a, \ const ctype* x, inc_t incx, \ @@ -102,9 +102,9 @@ INSERT_GENTPROT_BASIC0( symv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ const ctype_r* alpha, \ const ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ @@ -119,9 +119,9 @@ INSERT_GENTPROTR_BASIC0( her ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ + dim_t m, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ ctype* a, inc_t rs_a, inc_t cs_a \ @@ -136,10 +136,10 @@ INSERT_GENTPROT_BASIC0( syr ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - conj_t conjx, \ - conj_t conjy, \ - dim_t m, \ + uplo_t uploa, \ + conj_t conjx, \ + conj_t conjy, \ + dim_t m, \ const ctype* alpha, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ @@ -156,10 +156,10 @@ INSERT_GENTPROT_BASIC0( syr2 ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ const ctype* alpha, \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* x, inc_t incx \ diff --git a/frame/2/gemv/bli_gemv_var_oapi.c.prev 
b/frame/2/gemv/other/bli_gemv_var_oapi.c.prev similarity index 100% rename from frame/2/gemv/bli_gemv_var_oapi.c.prev rename to frame/2/gemv/other/bli_gemv_var_oapi.c.prev diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 284c92b011..78482b5f63 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -37,12 +37,12 @@ dim_t bli_l3_determine_kc ( - dir_t direct, - dim_t i, - dim_t dim, + dir_t direct, + dim_t i, + dim_t dim, const obj_t* a, const obj_t* b, - bszid_t bszid, + bszid_t bszid, const cntx_t* cntx, const cntl_t* cntl ) @@ -75,12 +75,12 @@ dim_t bli_l3_determine_kc \ dim_t PASTEMAC0(opname) \ ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ const obj_t* a, \ const obj_t* b, \ - bszid_t bszid, \ + bszid_t bszid, \ const cntx_t* cntx \ ) \ { \ @@ -102,20 +102,14 @@ GENFRONT( trsm_determine_kc, trsm ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ + dim_t i, \ + dim_t dim, \ const obj_t* a, \ const obj_t* b, \ - bszid_t bszid, \ + bszid_t bszid, \ const cntx_t* cntx \ ) \ { \ - num_t dt; \ - const blksz_t* bsize; \ - dim_t mnr; \ - dim_t b_alg, b_max; \ - dim_t b_use; \ - \ /* bli_*_determine_kc_f(): We assume that this function is being called from an algorithm that @@ -130,15 +124,16 @@ dim_t PASTEMAC0(opname) \ \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. 
*/ \ - dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ - b_alg = bli_blksz_get_def( dt, bsize ); \ - b_max = bli_blksz_get_max( dt, bsize ); \ + const num_t dt = bli_obj_exec_dt( a ); \ + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ + dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ + dim_t b_max = bli_blksz_get_max( dt, bsize ); \ \ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR if A is Hermitian or symmetric, or NR if B is Hermitian or symmetric. If neither case applies, then we leave the blocksizes unchanged. */ \ + dim_t mnr; \ if ( bli_obj_root_is_herm_or_symm( a ) ) \ { \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ @@ -154,9 +149,7 @@ dim_t PASTEMAC0(opname) \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined in bli_blksz.c */ \ - b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -\ - return b_use; \ + return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ } GENFRONT( gemm_determine_kc_f, f ) @@ -169,19 +162,14 @@ GENFRONT( gemm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ + dim_t i, \ + dim_t dim, \ const obj_t* a, \ const obj_t* b, \ - bszid_t bszid, \ + bszid_t bszid, \ const cntx_t* cntx \ ) \ { \ - num_t dt; \ - const blksz_t* bsize; \ - dim_t b_alg, b_max; \ - dim_t b_use; \ - \ /* bli_*_determine_kc_f(): We assume that this function is being called from an algorithm that @@ -196,19 +184,17 @@ dim_t PASTEMAC0(opname) \ \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. 
*/ \ - dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ - b_alg = bli_blksz_get_def( dt, bsize ); \ - b_max = bli_blksz_get_max( dt, bsize ); \ + const num_t dt = bli_obj_exec_dt( a ); \ + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ + const dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ + const dim_t b_max = bli_blksz_get_max( dt, bsize ); \ \ /* Notice that for gemmt, we do not need to perform any special handling for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined in bli_blksz.c */ \ - b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -\ - return b_use; \ + return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ } GENFRONT( gemmt_determine_kc_f, f ) @@ -221,20 +207,14 @@ GENFRONT( gemmt_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ + dim_t i, \ + dim_t dim, \ const obj_t* a, \ const obj_t* b, \ - bszid_t bszid, \ + bszid_t bszid, \ const cntx_t* cntx \ ) \ { \ - num_t dt; \ - const blksz_t* bsize; \ - dim_t mnr; \ - dim_t b_alg, b_max; \ - dim_t b_use; \ - \ /* bli_*_determine_kc_f(): We assume that this function is being called from an algorithm that @@ -249,14 +229,15 @@ dim_t PASTEMAC0(opname) \ \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. 
*/ \ - dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ - b_alg = bli_blksz_get_def( dt, bsize ); \ - b_max = bli_blksz_get_max( dt, bsize ); \ + const num_t dt = bli_obj_exec_dt( a ); \ + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ + dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ + dim_t b_max = bli_blksz_get_max( dt, bsize ); \ \ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR if the triangular matrix is on the left, or NR if the triangular matrix is one the right. */ \ + dim_t mnr; \ if ( bli_obj_root_is_triangular( a ) ) \ mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ else \ @@ -267,9 +248,7 @@ dim_t PASTEMAC0(opname) \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined in bli_blksz.c */ \ - b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -\ - return b_use; \ + return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ } GENFRONT( trmm_determine_kc_f, f ) @@ -282,20 +261,14 @@ GENFRONT( trmm_determine_kc_b, b ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ + dim_t i, \ + dim_t dim, \ const obj_t* a, \ const obj_t* b, \ - bszid_t bszid, \ + bszid_t bszid, \ const cntx_t* cntx \ ) \ { \ - num_t dt; \ - const blksz_t* bsize; \ - dim_t mnr; \ - dim_t b_alg, b_max; \ - dim_t b_use; \ - \ /* bli_*_determine_kc_f(): We assume that this function is being called from an algorithm that @@ -310,25 +283,23 @@ dim_t PASTEMAC0(opname) \ \ /* Extract the execution datatype and use it to query the corresponding blocksize and blocksize maximum values from the blksz_t object. 
*/ \ - dt = bli_obj_exec_dt( a ); \ - bsize = bli_cntx_get_blksz( bszid, cntx ); \ - b_alg = bli_blksz_get_def( dt, bsize ); \ - b_max = bli_blksz_get_max( dt, bsize ); \ + const num_t dt = bli_obj_exec_dt( a ); \ + const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ + dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ + dim_t b_max = bli_blksz_get_max( dt, bsize ); \ \ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR. We always use MR (rather than sometimes using NR) because even when the triangle is on the right, packing of that matrix uses MR, since only left-side trsm micro-kernels are supported. */ \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ + const dim_t mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ + b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ + b_max = bli_align_dim_to_mult( b_max, mnr ); \ \ /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined in bli_blksz.c */ \ - b_use = PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ -\ - return b_use; \ + return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ } GENFRONT( trsm_determine_kc_f, f ) diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index cae6c85199..1ec889e030 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -34,12 +34,12 @@ dim_t bli_l3_determine_kc ( - dir_t direct, - dim_t i, - dim_t dim, + dir_t direct, + dim_t i, + dim_t dim, const obj_t* a, const obj_t* b, - bszid_t bszid, + bszid_t bszid, const cntx_t* cntx, const cntl_t* cntl ); @@ -50,12 +50,12 @@ dim_t bli_l3_determine_kc \ dim_t PASTEMAC0(opname) \ ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ + dir_t direct, \ + dim_t i, \ + dim_t dim, \ const obj_t* a, \ const obj_t* b, \ - bszid_t bszid, \ + bszid_t bszid, \ const cntx_t* cntx \ ); @@ -70,11 +70,11 @@ GENPROT( 
trsm_determine_kc ) \ dim_t PASTEMAC0(opname) \ ( \ - dim_t i, \ - dim_t dim, \ + dim_t i, \ + dim_t dim, \ const obj_t* a, \ const obj_t* b, \ - bszid_t bszid, \ + bszid_t bszid, \ const cntx_t* cntx \ ); diff --git a/frame/3/bli_l3_check.c b/frame/3/bli_l3_check.c index 9c25922aae..29f57fba7f 100644 --- a/frame/3/bli_l3_check.c +++ b/frame/3/bli_l3_check.c @@ -87,7 +87,7 @@ void bli_gemmt_check void bli_hemm_check ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, @@ -175,7 +175,7 @@ void bli_her2k_check void bli_symm_check ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, @@ -250,7 +250,7 @@ void bli_syr2k_check void bli_trmm3_check ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, @@ -273,7 +273,7 @@ void bli_trmm3_check void bli_trmm_check ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, @@ -294,7 +294,7 @@ void bli_trmm_check void bli_trsm_check ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, @@ -389,7 +389,7 @@ void bli_gemmt_basic_check void bli_hemm_basic_check ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, diff --git a/frame/3/bli_l3_check.h b/frame/3/bli_l3_check.h index b8ea6661d9..8551b6b612 100644 --- a/frame/3/bli_l3_check.h +++ b/frame/3/bli_l3_check.h @@ -61,7 +61,7 @@ GENPROT( syr2k ) \ void PASTEMAC(opname,_check) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b, \ @@ -96,7 +96,7 @@ GENPROT( syrk ) \ void PASTEMAC(opname,_check) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b, \ @@ -131,7 +131,7 @@ void bli_gemmt_basic_check void bli_hemm_basic_check ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c index 025e5a7248..d7fd9649e8 100644 --- 
a/frame/3/bli_l3_cntl.c +++ b/frame/3/bli_l3_cntl.c @@ -38,15 +38,15 @@ void bli_l3_cntl_create_if ( - opid_t family, - pack_t schema_a, - pack_t schema_b, + opid_t family, + pack_t schema_a, + pack_t schema_b, const obj_t* a, const obj_t* b, const obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, - cntl_t** cntl_use + rntm_t* rntm, + cntl_t* cntl_orig, + cntl_t** cntl_use ) { // If the control tree pointer is NULL, we construct a default diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index 2ba68feca2..eb4321ecd7 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -40,15 +40,15 @@ void bli_l3_cntl_create_if ( - opid_t family, - pack_t schema_a, - pack_t schema_b, + opid_t family, + pack_t schema_a, + pack_t schema_b, const obj_t* a, const obj_t* b, const obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, - cntl_t** cntl_use + rntm_t* rntm, + cntl_t* cntl_orig, + cntl_t** cntl_use ); void bli_l3_cntl_free diff --git a/frame/3/bli_l3_ft_ukr.h b/frame/3/bli_l3_ft_ukr.h index 28065c208b..e7952409f4 100644 --- a/frame/3/bli_l3_ft_ukr.h +++ b/frame/3/bli_l3_ft_ukr.h @@ -55,8 +55,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTDEF( gemm ) @@ -78,8 +78,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTDEF( gemmtrsm ) @@ -95,8 +95,8 @@ typedef void (*PASTECH3(ch,opname,_ukr,tsuf)) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTDEF( trsm ) diff --git a/frame/3/bli_l3_ind_ukr.h b/frame/3/bli_l3_ind_ukr.h index 
6f24e71fcf..243ff818df 100644 --- a/frame/3/bli_l3_ind_ukr.h +++ b/frame/3/bli_l3_ind_ukr.h @@ -51,8 +51,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemm1m_ukr_name ) @@ -72,8 +72,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( gemmtrsm1m_l_ukr_name ) @@ -88,8 +88,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( trsm1m_l_ukr_name ) diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index a88bd5249a..b786236ab9 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -42,9 +42,9 @@ void bli_l3_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { obj_t a_local; diff --git a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h index 9648670feb..65485206de 100644 --- a/frame/3/bli_l3_int.h +++ b/frame/3/bli_l3_int.h @@ -40,8 +40,8 @@ void bli_l3_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/3/bli_l3_oapi.c b/frame/3/bli_l3_oapi.c index 52e8234028..d48a96ec82 100644 --- a/frame/3/bli_l3_oapi.c +++ b/frame/3/bli_l3_oapi.c @@ -66,7 +66,7 @@ GENFRONT( syr2k ) \ void PASTEMAC0(opname) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b, \ diff --git a/frame/3/bli_l3_oapi.h b/frame/3/bli_l3_oapi.h index 
bdef0217ab..1f184c1219 100644 --- a/frame/3/bli_l3_oapi.h +++ b/frame/3/bli_l3_oapi.h @@ -61,7 +61,7 @@ GENPROT( syr2k ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b, \ @@ -94,7 +94,7 @@ GENPROT( syrk ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b \ diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index 342131ecde..e4c815fe3a 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -49,8 +49,8 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) const obj_t* b, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -122,8 +122,8 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) const obj_t* b, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -171,8 +171,8 @@ void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) const obj_t* b, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -217,8 +217,8 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) const obj_t* b, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -244,14 +244,14 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -294,14 +294,14 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) void PASTEMAC(symm,BLIS_OAPI_EX_SUF) ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ 
-344,14 +344,14 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -398,8 +398,8 @@ void PASTEMAC(herk,BLIS_OAPI_EX_SUF) const obj_t* a, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -432,8 +432,8 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) const obj_t* a, const obj_t* beta, const obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -453,12 +453,12 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); @@ -500,12 +500,12 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) ( - side_t side, + side_t side, const obj_t* alpha, const obj_t* a, const obj_t* b, - cntx_t* cntx, - rntm_t* rntm + const cntx_t* cntx, + rntm_t* rntm ) { bli_init_once(); diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h index 68b98aa064..58091704b4 100644 --- a/frame/3/bli_l3_oapi_ex.h +++ b/frame/3/bli_l3_oapi_ex.h @@ -48,8 +48,8 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* b, \ const obj_t* beta, \ const obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENPROT( gemm ) @@ -63,14 +63,14 @@ GENPROT( syr2k ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b, \ const obj_t* beta, \ const obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENPROT( hemm ) @@ -87,8 +87,8 @@ BLIS_EXPORT_BLIS void 
PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* a, \ const obj_t* beta, \ const obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENPROT( herk ) @@ -100,12 +100,12 @@ GENPROT( syrk ) \ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b, \ - cntx_t* cntx, \ - rntm_t* rntm \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); GENPROT( trmm ) diff --git a/frame/3/bli_l3_oft.h b/frame/3/bli_l3_oft.h index 22496faefa..997ade58e0 100644 --- a/frame/3/bli_l3_oft.h +++ b/frame/3/bli_l3_oft.h @@ -54,7 +54,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + rntm_t* rntm \ ); GENTDEF( gemm ) @@ -70,14 +70,14 @@ GENTDEF( syr2k ) \ typedef void (*PASTECH(opname,_oft)) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b, \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + rntm_t* rntm \ ); GENTDEF( hemm ) @@ -97,7 +97,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + rntm_t* rntm \ ); GENTDEF( herk ) @@ -111,12 +111,12 @@ GENTDEF( syrk ) \ typedef void (*PASTECH(opname,_oft)) \ ( \ - side_t side, \ + side_t side, \ const obj_t* alpha, \ const obj_t* a, \ const obj_t* b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + rntm_t* rntm \ ); GENTDEF( trmm ) diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index 016fe79418..ee529b115a 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -49,9 +49,9 @@ typedef void (*PASTECH(opname,_var_oft)) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); GENTDEF( l3 ) diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c index 
1ea97beade..6f18169b28 100644 --- a/frame/3/bli_l3_packab.c +++ b/frame/3/bli_l3_packab.c @@ -40,9 +40,9 @@ void bli_l3_packa const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { obj_t a_local, a_pack; @@ -88,9 +88,9 @@ void bli_l3_packb const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { obj_t bt_local, bt_pack; diff --git a/frame/3/bli_l3_packab.h b/frame/3/bli_l3_packab.h index 1901eea434..f03b7f62ce 100644 --- a/frame/3/bli_l3_packab.h +++ b/frame/3/bli_l3_packab.h @@ -38,9 +38,9 @@ void bli_l3_packa const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); void bli_l3_packb @@ -49,8 +49,8 @@ void bli_l3_packb const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/3/bli_l3_prune.c b/frame/3/bli_l3_prune.c index d227a3e7d8..6531b74a82 100644 --- a/frame/3/bli_l3_prune.c +++ b/frame/3/bli_l3_prune.c @@ -34,174 +34,106 @@ #include "blis.h" -/* + void bli_l3_prune_unref_mparts_m ( - obj_t* a, - obj_t* b, - obj_t* c, + obj_t* a, + const obj_t* b, + obj_t* c, const cntl_t* cntl ) { - // Query the operation family. + /* Query the operation family. */ opid_t family = bli_cntl_family( cntl ); - if ( family == BLIS_GEMM ) return; // No pruning is necessary for gemm. 
- else if ( family == BLIS_GEMMT ) bli_gemmt_prune_unref_mparts_m( a, b, c ); - else if ( family == BLIS_TRMM ) bli_trmm_prune_unref_mparts_m( a, b, c ); - else if ( family == BLIS_TRSM ) bli_trsm_prune_unref_mparts_m( a, b, c ); -} -*/ - -#undef GENFRONT -#define GENFRONT( dim ) \ -\ -void PASTEMAC(l3_prune_unref_mparts_,dim) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - const cntl_t* cntl \ - ) \ -{ \ - /* Query the operation family. */ \ - opid_t family = bli_cntl_family( cntl ); \ -\ - if ( family == BLIS_GEMM ) return; /* No pruning is necessary for gemm. */ \ - else if ( family == BLIS_GEMMT ) PASTEMAC(gemmt_prune_unref_mparts_,dim)( a, b, c ); \ - else if ( family == BLIS_TRMM ) PASTEMAC(trmm_prune_unref_mparts_,dim)( a, b, c ); \ - else if ( family == BLIS_TRSM ) PASTEMAC(trsm_prune_unref_mparts_,dim)( a, b, c ); \ -} - -GENFRONT( m ) -GENFRONT( n ) -GENFRONT( k ) - -// ----------------------------------------------------------------------------- - -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_prune_unref_mparts_m) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* No pruning is necessary for gemm. */ \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_n) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* No pruning is necessary for gemm. */ \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_k) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* No pruning is necessary for gemm. */ \ + if ( family == BLIS_GEMM ) + { + /* No pruning is necessary for gemm. */ + return; + } + else if ( family == BLIS_GEMMT ) + { + /* Prune any unreferenced part from the subpartition of C (that would + be encountered from partitioning in the m dimension) and adjust the + subpartition of A accordingly. 
*/ + bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M ); + } + else if ( family == BLIS_TRMM || + family == BLIS_TRSM ) + { + /* Prune any unreferenced part from the subpartition of A (that would + be encountered from partitioning in the m dimension) and adjust the + subpartition of C accordingly. */ + bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M ); + } } -GENFRONT( gemm ) - -// ----------------------------------------------------------------------------- +void bli_l3_prune_unref_mparts_n + ( + const obj_t* a, + obj_t* b, + obj_t* c, + const cntl_t* cntl + ) +{ + /* Query the operation family. */ + opid_t family = bli_cntl_family( cntl ); -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_prune_unref_mparts_m) \ - ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of C (that would - be encountered from partitioning in the m dimension) and adjust the - subpartition of A accordingly. */ \ - bli_prune_unref_mparts( c, BLIS_M, a, BLIS_M ); \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_n) \ - ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of C (that would - be encountered from partitioning in the n dimension) and adjust the - subpartition of Ah accordingly. */ \ - bli_prune_unref_mparts( c, BLIS_N, ah, BLIS_N ); \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_k) \ - ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c \ - ) \ -{ \ - /* As long as A and Ah are general in structure, no pruning should be - for the k dimension. */ \ + if ( family == BLIS_GEMM ) + { + /* No pruning is necessary for gemm. */ + return; + } + else if ( family == BLIS_GEMMT ) + { + /* Prune any unreferenced part from the subpartition of C (that would + be encountered from partitioning in the m dimension) and adjust the + subpartition of B accordingly. 
*/ + bli_prune_unref_mparts( c, BLIS_N, b, BLIS_N ); + } + else if ( family == BLIS_TRMM || + family == BLIS_TRSM ) + { + /* Prune any unreferenced part from the subpartition of B (that would + be encountered from partitioning in the m dimension) and adjust the + subpartition of C accordingly. */ + bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N ); + } } -GENFRONT( gemmt ) - -// ----------------------------------------------------------------------------- +void bli_l3_prune_unref_mparts_k + ( + obj_t* a, + obj_t* b, + const obj_t* c, + const cntl_t* cntl + ) +{ + /* Query the operation family. */ + opid_t family = bli_cntl_family( cntl ); -#undef GENFRONT -#define GENFRONT( opname ) \ -\ -void PASTEMAC(opname,_prune_unref_mparts_m) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of A (that would - be encountered from partitioning in the m dimension) and adjust the - subpartition of C accordingly. */ \ - bli_prune_unref_mparts( a, BLIS_M, c, BLIS_M ); \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_n) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of B (that would - be encountered from partitioning in the n dimension) and adjust the - subpartition of C accordingly. */ \ - bli_prune_unref_mparts( b, BLIS_N, c, BLIS_N ); \ -} \ -void PASTEMAC(opname,_prune_unref_mparts_k) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ - ) \ -{ \ - /* Prune any unreferenced part from the subpartition of A (that would - be encountered from partitioning in the k dimension) and adjust the - subpartition of B accordingly. */ \ - bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M ); \ -\ - /* Prune any unreferenced part from the subpartition of B (that would - be encountered from partitioning in the k dimension) and adjust the - subpartition of A accordingly. 
*/ \ - bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N ); \ + if ( family == BLIS_GEMM ) + { + /* No pruning is necessary for gemm. */ + return; + } + else if ( family == BLIS_GEMMT ) + { + /* No pruning is necessary for gemmt. */ + return; + } + else if ( family == BLIS_TRMM || + family == BLIS_TRSM ) + { + /* Prune any unreferenced part from the subpartition of A (that would + be encountered from partitioning in the k dimension) and adjust the + subpartition of B accordingly. */ + bli_prune_unref_mparts( a, BLIS_N, b, BLIS_M ); + + /* Prune any unreferenced part from the subpartition of B (that would + be encountered from partitioning in the k dimension) and adjust the + subpartition of A accordingly. */ + bli_prune_unref_mparts( b, BLIS_M, a, BLIS_N ); + } } -GENFRONT( trmm ) -GENFRONT( trsm ) - - diff --git a/frame/3/bli_l3_prune.h b/frame/3/bli_l3_prune.h index 887f2b5e6d..84c0cbbcd7 100644 --- a/frame/3/bli_l3_prune.h +++ b/frame/3/bli_l3_prune.h @@ -33,46 +33,27 @@ */ -#undef GENPROT -#define GENPROT( dim ) \ -\ -void PASTEMAC(l3_prune_unref_mparts_,dim) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - const cntl_t* cntl \ +void bli_l3_prune_unref_mparts_m + ( + obj_t* a, + const obj_t* b, + obj_t* c, + const cntl_t* cntl ); -GENPROT( m ) -GENPROT( n ) -GENPROT( k ) - -// ----------------------------------------------------------------------------- - -#undef GENPROT -#define GENPROT( opname, dim ) \ -\ -void PASTEMAC2(opname,_prune_unref_mparts_,dim) \ - ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c \ +void bli_l3_prune_unref_mparts_n + ( + const obj_t* a, + obj_t* b, + obj_t* c, + const cntl_t* cntl ); -GENPROT( gemm, m ) -GENPROT( gemm, n ) -GENPROT( gemm, k ) - -GENPROT( gemmt, m ) -GENPROT( gemmt, n ) -GENPROT( gemmt, k ) - -GENPROT( trmm, m ) -GENPROT( trmm, n ) -GENPROT( trmm, k ) - -GENPROT( trsm, m ) -GENPROT( trsm, n ) -GENPROT( trsm, k ) +void bli_l3_prune_unref_mparts_k + ( + obj_t* a, + obj_t* b, + const obj_t* c, + const cntl_t* cntl + ); diff 
--git a/frame/3/bli_l3_schema.c b/frame/3/bli_l3_schema.c index 1b03468776..1de381f372 100644 --- a/frame/3/bli_l3_schema.c +++ b/frame/3/bli_l3_schema.c @@ -36,9 +36,9 @@ void bli_l3_set_schemas ( - obj_t* a, - obj_t* b, - obj_t* c, + obj_t* a, + obj_t* b, + const obj_t* c, const cntx_t* cntx ) { diff --git a/frame/3/bli_l3_schema.h b/frame/3/bli_l3_schema.h index 5ec5be3ccc..a909bf598c 100644 --- a/frame/3/bli_l3_schema.h +++ b/frame/3/bli_l3_schema.h @@ -34,8 +34,8 @@ void bli_l3_set_schemas ( - obj_t* a, - obj_t* b, - obj_t* c, + obj_t* a, + obj_t* b, + const obj_t* c, const cntx_t* cntx ); diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index 7e37e1f22e..eedbd9ec51 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -36,13 +36,13 @@ err_t bli_gemmsup ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. @@ -134,13 +134,13 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n", err_t bli_gemmtsup ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. 
diff --git a/frame/3/bli_l3_sup.h b/frame/3/bli_l3_sup.h index fe6d0483e7..33b3f8ca74 100644 --- a/frame/3/bli_l3_sup.h +++ b/frame/3/bli_l3_sup.h @@ -34,23 +34,23 @@ err_t bli_gemmsup ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); err_t bli_gemmtsup ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); diff --git a/frame/3/bli_l3_sup_ft_ker.h b/frame/3/bli_l3_sup_ft_ker.h index 5bb2218f3b..dbeafb404e 100644 --- a/frame/3/bli_l3_sup_ft_ker.h +++ b/frame/3/bli_l3_sup_ft_ker.h @@ -57,8 +57,8 @@ typedef void (*PASTECH3(ch,opname,_ker,tsuf)) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); INSERT_GENTDEF( gemmsup ) diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index b95fa1368b..3ff13bdb59 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -42,8 +42,8 @@ err_t bli_gemmsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + rntm_t* rntm, + thrinfo_t* thread ) { #if 0 @@ -246,8 +246,8 @@ err_t bli_gemmtsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + rntm_t* rntm, + thrinfo_t* thread ) { const stor3_t stor_id = bli_obj_stor3_from_strides( c, a, b ); diff --git a/frame/3/bli_l3_sup_int.h b/frame/3/bli_l3_sup_int.h index 6d3abdf5c5..195e3ca405 100644 --- a/frame/3/bli_l3_sup_int.h +++ b/frame/3/bli_l3_sup_int.h @@ -40,8 +40,8 @@ err_t bli_gemmsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - 
rntm_t* rntm, - thrinfo_t* thread + rntm_t* rntm, + thrinfo_t* thread ); err_t bli_gemmtsup_int @@ -52,6 +52,6 @@ err_t bli_gemmtsup_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + rntm_t* rntm, + thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup_ker_prot.h b/frame/3/bli_l3_sup_ker_prot.h index 899a47d3fa..30cad5257b 100644 --- a/frame/3/bli_l3_sup_ker_prot.h +++ b/frame/3/bli_l3_sup_ker_prot.h @@ -50,7 +50,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); diff --git a/frame/3/bli_l3_sup_oft.h b/frame/3/bli_l3_sup_oft.h index 6d9cb09e49..ba60035b78 100644 --- a/frame/3/bli_l3_sup_oft.h +++ b/frame/3/bli_l3_sup_oft.h @@ -53,7 +53,7 @@ typedef err_t (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + rntm_t* rntm \ ); GENTDEF( gemmsup ) diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c index e89cc15601..26faefc463 100644 --- a/frame/3/bli_l3_sup_packm_a.c +++ b/frame/3/bli_l3_sup_packm_a.c @@ -214,18 +214,18 @@ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ - pack_t* restrict schema, \ + pack_t* schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ - dim_t* restrict m_max, \ - dim_t* restrict k_max, \ + dim_t* m_max, \ + dim_t* k_max, \ ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + dim_t* pd_p, inc_t* ps_p, \ + cntx_t* cntx, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ /* Inspect whether we are going to be packing matrix A. 
*/ \ @@ -320,14 +320,14 @@ void PASTEMAC(ch,opname) \ dim_t m, \ dim_t k, \ dim_t mr, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + ctype* kappa, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + inc_t* ps_p, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ pack_t schema; \ diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h index 6b40f950a6..2bddeb07b4 100644 --- a/frame/3/bli_l3_sup_packm_a.h +++ b/frame/3/bli_l3_sup_packm_a.h @@ -73,18 +73,18 @@ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ - pack_t* restrict schema, \ + pack_t* schema, \ dim_t m, \ dim_t k, \ dim_t mr, \ - dim_t* restrict m_max, \ - dim_t* restrict k_max, \ + dim_t* m_max, \ + dim_t* k_max, \ ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + dim_t* pd_p, inc_t* ps_p, \ + cntx_t* cntx, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_a ) @@ -104,14 +104,14 @@ void PASTEMAC(ch,opname) \ dim_t m, \ dim_t k, \ dim_t mr, \ - ctype* restrict kappa, \ - ctype* restrict a, inc_t rs_a, inc_t cs_a, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + ctype* kappa, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + inc_t* ps_p, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( 
packm_sup_a ) diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c index 32c14afe3d..6165567759 100644 --- a/frame/3/bli_l3_sup_packm_b.c +++ b/frame/3/bli_l3_sup_packm_b.c @@ -45,10 +45,10 @@ void PASTEMAC(ch,opname) \ dim_t k, \ dim_t n, \ dim_t nr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ /* Inspect whether we are going to be packing matrix B. */ \ @@ -175,9 +175,9 @@ INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b ) void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ /* Inspect whether we previously packed matrix A. */ \ @@ -214,18 +214,18 @@ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ - pack_t* restrict schema, \ + pack_t* schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ - dim_t* restrict k_max, \ - dim_t* restrict n_max, \ + dim_t* k_max, \ + dim_t* n_max, \ ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + dim_t* pd_p, inc_t* ps_p, \ + cntx_t* cntx, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ /* Inspect whether we are going to be packing matrix B. 
*/ \ @@ -320,14 +320,14 @@ void PASTEMAC(ch,opname) \ dim_t k, \ dim_t n, \ dim_t nr, \ - ctype* restrict kappa, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + ctype* kappa, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + inc_t* ps_p, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ) \ { \ pack_t schema; \ diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h index 2965727d54..da20ea71d3 100644 --- a/frame/3/bli_l3_sup_packm_b.h +++ b/frame/3/bli_l3_sup_packm_b.h @@ -43,10 +43,10 @@ void PASTEMAC(ch,opname) \ dim_t k, \ dim_t n, \ dim_t nr, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) @@ -58,9 +58,9 @@ INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) void PASTEMAC(ch,opname) \ ( \ bool did_pack, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) @@ -73,18 +73,18 @@ void PASTEMAC(ch,opname) \ ( \ bool will_pack, \ stor3_t stor_id, \ - pack_t* restrict schema, \ + pack_t* schema, \ dim_t k, \ dim_t n, \ dim_t nr, \ - dim_t* restrict k_max, \ - dim_t* restrict n_max, \ + dim_t* k_max, \ + dim_t* n_max, \ ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - dim_t* restrict pd_p, inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + dim_t* pd_p, inc_t* ps_p, \ + cntx_t* cntx, \ + mem_t* mem, \ + thrinfo_t* thread \ ); 
\ INSERT_GENTPROT_BASIC0( packm_sup_init_b ) @@ -104,14 +104,14 @@ void PASTEMAC(ch,opname) \ dim_t k, \ dim_t n, \ dim_t nr, \ - ctype* restrict kappa, \ - ctype* restrict b, inc_t rs_b, inc_t cs_b, \ - ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ - inc_t* restrict ps_p, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ + ctype* kappa, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + ctype** p, inc_t* rs_p, inc_t* cs_p, \ + inc_t* ps_p, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + mem_t* mem, \ + thrinfo_t* thread \ ); \ INSERT_GENTPROT_BASIC0( packm_sup_b ) diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index f54e5f1256..dd55cda54f 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -50,17 +50,17 @@ void PASTEMAC(ch,varname) \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ + cntx_t* cntx, \ + thrinfo_t* thread \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ + ctype* kappa_cast = kappa; \ + ctype* c_cast = c; \ + ctype* p_cast = p; \ \ dim_t iter_dim; \ dim_t n_iter; \ @@ -140,7 +140,7 @@ void PASTEMAC(ch,varname) \ ic_inc = panel_dim_max; \ } \ \ - ctype* restrict p_begin = p_cast; \ + ctype* p_begin = p_cast; \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. 
*/ \ @@ -165,10 +165,10 @@ void PASTEMAC(ch,varname) \ { \ panel_dim_i = bli_min( panel_dim_max, iter_dim - ic ); \ \ - ctype* restrict c_begin = c_cast + (ic )*vs_c; \ + ctype* c_begin = c_cast + (ic )*vs_c; \ \ - ctype* restrict c_use = c_begin; \ - ctype* restrict p_use = p_begin; \ + ctype* c_use = c_begin; \ + ctype* p_use = p_begin; \ \ { \ panel_len_i = panel_len_full; \ @@ -320,16 +320,16 @@ void PASTEMAC(ch,varname) \ pack_t schema, \ dim_t m, \ dim_t n, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx, \ + thrinfo_t* thread \ ) \ { \ - ctype* restrict kappa_cast = kappa; \ - ctype* restrict c_cast = c; \ - ctype* restrict p_cast = p; \ + ctype* kappa_cast = kappa; \ + ctype* c_cast = c; \ + ctype* p_cast = p; \ \ dim_t iter_dim; \ dim_t n_iter; \ @@ -383,7 +383,7 @@ void PASTEMAC(ch,varname) \ n_iter = iter_dim; \ \ \ - ctype* restrict p_begin = p_cast; \ + ctype* p_begin = p_cast; \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ @@ -405,10 +405,10 @@ void PASTEMAC(ch,varname) \ /* Iterate over every logical micropanel in the source matrix. 
*/ \ for ( it = 0; it < n_iter; it += 1 ) \ { \ - ctype* restrict c_begin = c_cast + (it )*ldc; \ + ctype* c_begin = c_cast + (it )*ldc; \ \ - ctype* restrict c_use = c_begin; \ - ctype* restrict p_use = p_begin; \ + ctype* c_use = c_begin; \ + ctype* p_use = p_begin; \ \ { \ /* The definition of bli_packm_my_iter() will depend on whether slab diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h index 5ccdd3b762..257974e466 100644 --- a/frame/3/bli_l3_sup_packm_var.h +++ b/frame/3/bli_l3_sup_packm_var.h @@ -48,12 +48,12 @@ void PASTEMAC(ch,varname) \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ + cntx_t* cntx, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var1 ) @@ -67,11 +67,11 @@ void PASTEMAC(ch,varname) \ pack_t schema, \ dim_t m, \ dim_t n, \ - ctype* restrict kappa, \ - ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - ctype* restrict p, inc_t rs_p, inc_t cs_p, \ - cntx_t* restrict cntx, \ - thrinfo_t* restrict thread \ + ctype* kappa, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + ctype* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( packm_sup_var2 ) diff --git a/frame/3/bli_l3_sup_ref.c b/frame/3/bli_l3_sup_ref.c index f03ec1b18f..8eb7a6d4b0 100644 --- a/frame/3/bli_l3_sup_ref.c +++ b/frame/3/bli_l3_sup_ref.c @@ -36,13 +36,13 @@ err_t bli_gemmsup_ref ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // This function implements the default gemmsup handler. 
If you are a @@ -124,13 +124,13 @@ err_t bli_gemmsup_ref err_t bli_gemmtsup_ref ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // This function implements the default gemmtsup handler. If you are a diff --git a/frame/3/bli_l3_sup_ref.h b/frame/3/bli_l3_sup_ref.h index bce4e1729e..4d4811db34 100644 --- a/frame/3/bli_l3_sup_ref.h +++ b/frame/3/bli_l3_sup_ref.h @@ -34,23 +34,23 @@ err_t bli_gemmsup_ref ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); err_t bli_gemmtsup_ref ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); diff --git a/frame/3/bli_l3_sup_var12.c b/frame/3/bli_l3_sup_var12.c index 106ad86e4d..d65482243b 100644 --- a/frame/3/bli_l3_sup_var12.c +++ b/frame/3/bli_l3_sup_var12.c @@ -38,19 +38,19 @@ typedef void (*FUNCPTR_T) ( - conj_t conja, - conj_t conjb, - dim_t m, - dim_t n, - dim_t k, - void* restrict alpha, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b, - void* restrict beta, - void* restrict c, inc_t rs_c, inc_t cs_c, - stor3_t eff_id, - cntx_t* restrict cntx, - rntm_t* restrict rntm + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t rs_a, inc_t cs_a, + void* b, inc_t rs_b, inc_t cs_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + stor3_t eff_id, + cntx_t* cntx, + rntm_t* rntm ); #if 0 @@ -95,20 +95,20 @@ void bli_gemmsup_ref_var2 const dim_t k = bli_obj_width( &at ); - void* restrict buf_a = 
bli_obj_buffer_at_off( &at ); + void* buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); - void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + void* buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); #else @@ -121,11 +121,11 @@ void bli_gemmsup_ref_var2 const dim_t n = bli_obj_width( c ); dim_t k; - void* restrict buf_a = bli_obj_buffer_at_off( a ); + void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - void* restrict buf_b = bli_obj_buffer_at_off( b ); + void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -157,12 +157,12 @@ void bli_gemmsup_ref_var2 cs_b = bli_obj_row_stride( b ); } - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); #endif @@ -200,14 +200,14 @@ void PASTEMAC(ch,varname) \ dim_t m, \ dim_t n, \ dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ + void* alpha, \ + void* a, inc_t rs_a, 
inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ /* If any dimension is zero, return immediately. */ \ @@ -266,13 +266,13 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,gemmsup_ker_ft) \ gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ - ctype* restrict a_00 = a; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + ctype* a_00 = a; \ + ctype* b_00 = b; \ + ctype* c_00 = c; \ + ctype* alpha_cast = alpha; \ + ctype* beta_cast = beta; \ \ - ctype* restrict one = PASTEMAC(ch,1); \ + ctype* one = PASTEMAC(ch,1); \ \ auxinfo_t aux; \ \ @@ -305,8 +305,8 @@ void PASTEMAC(ch,varname) \ { \ const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \ \ - ctype* restrict b_jc = b_00 + jj * jcstep_b; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + ctype* b_jc = b_00 + jj * jcstep_b; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ const dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ const dim_t jr_left = nc_cur % NR; \ @@ -316,19 +316,19 @@ void PASTEMAC(ch,varname) \ { \ const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_00 + pp * pcstep_a; \ - ctype* restrict b_pc = b_jc + pp * pcstep_b; \ + ctype* a_pc = a_00 + pp * pcstep_a; \ + ctype* b_pc = b_jc + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? beta_cast : one ); \ + ctype* beta_use = ( pp == 0 ? beta_cast : one ); \ \ /* Loop over the m dimension (MC rows at a time). */ \ for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \ { \ const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? 
MC : ic_left ); \ \ - ctype* restrict a_ic = a_pc + ii * icstep_a; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* a_ic = a_pc + ii * icstep_a; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ const dim_t ir_left = mc_cur % MR; \ @@ -338,11 +338,11 @@ void PASTEMAC(ch,varname) \ { \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ - ctype* restrict b_jr = b_pc + j * jrstep_b; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + ctype* b_jr = b_pc + j * jrstep_b; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* - ctype* restrict b2 = b_jr; \ + ctype* b2 = b_jr; \ */ \ \ /* Loop over the m dimension (MR rows at a time). */ \ @@ -350,13 +350,13 @@ void PASTEMAC(ch,varname) \ { \ const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ \ - ctype* restrict a_ir = a_ic + i * irstep_a; \ - ctype* restrict c_ir = c_jr + i * irstep_c; \ + ctype* a_ir = a_ic + i * irstep_a; \ + ctype* c_ir = c_jr + i * irstep_c; \ \ /* Save addresses of next panels of A and B to the auxinfo_t object. 
*/ \ /* - ctype* restrict a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \ + ctype* a2 = bli_gemm_get_next_a_upanel( a_ir, irstep_a, ir_inc ); \ if ( bli_is_last_iter( i, ir_iter, 0, 1 ) ) \ { \ a2 = a_00; \ @@ -442,20 +442,20 @@ void bli_gemmsup_ref_var1 const dim_t k = bli_obj_width( &at ); - void* restrict buf_a = bli_obj_buffer_at_off( &at ); + void* buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); - void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + void* buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); #else @@ -468,11 +468,11 @@ void bli_gemmsup_ref_var1 const dim_t n = bli_obj_width( c ); dim_t k; - void* restrict buf_a = bli_obj_buffer_at_off( a ); + void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - void* restrict buf_b = bli_obj_buffer_at_off( b ); + void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -504,12 +504,12 @@ void bli_gemmsup_ref_var1 cs_b = bli_obj_row_stride( b ); } - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt_exec, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt_exec, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( 
dt_exec, beta ); #endif @@ -547,14 +547,14 @@ void PASTEMAC(ch,varname) \ dim_t m, \ dim_t n, \ dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ /* If any dimension is zero, return immediately. */ \ @@ -617,13 +617,13 @@ void PASTEMAC(ch,varname) \ PASTECH(ch,gemmsup_ker_ft) \ gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ - ctype* restrict a_00 = a; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + ctype* a_00 = a; \ + ctype* b_00 = b; \ + ctype* c_00 = c; \ + ctype* alpha_cast = alpha; \ + ctype* beta_cast = beta; \ \ - ctype* restrict one = PASTEMAC(ch,1); \ + ctype* one = PASTEMAC(ch,1); \ \ auxinfo_t aux; \ \ @@ -656,8 +656,8 @@ void PASTEMAC(ch,varname) \ { \ const dim_t nc_cur = ( bli_is_not_edge_f( jj, jc_iter, jc_left ) ? NC : jc_left ); \ \ - ctype* restrict a_jc = a_00 + jj * jcstep_a; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + ctype* a_jc = a_00 + jj * jcstep_a; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ const dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \ const dim_t jr_left = nc_cur % MR; \ @@ -667,19 +667,19 @@ void PASTEMAC(ch,varname) \ { \ const dim_t kc_cur = ( bli_is_not_edge_f( pp, pc_iter, pc_left ) ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_jc + pp * pcstep_a; \ - ctype* restrict b_pc = b_00 + pp * pcstep_b; \ + ctype* a_pc = a_jc + pp * pcstep_a; \ + ctype* b_pc = b_00 + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? 
beta_cast : one ); \ + ctype* beta_use = ( pp == 0 ? beta_cast : one ); \ \ /* Loop over the n dimension (MC rows at a time). */ \ for ( dim_t ii = 0; ii < ic_iter; ii += ic_inc ) \ { \ const dim_t mc_cur = ( bli_is_not_edge_f( ii, ic_iter, ic_left ) ? MC : ic_left ); \ \ - ctype* restrict b_ic = b_pc + ii * icstep_b; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* b_ic = b_pc + ii * icstep_b; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ const dim_t ir_left = mc_cur % NR; \ @@ -689,16 +689,16 @@ void PASTEMAC(ch,varname) \ { \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ - ctype* restrict a_jr = a_pc + j * jrstep_a; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + ctype* a_jr = a_pc + j * jrstep_a; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* Loop over the n dimension (MR rows at a time). */ \ for ( dim_t i = 0; i < ir_iter; i += ir_inc ) \ { \ const dim_t mr_cur = ( bli_is_not_edge_f( i, ir_iter, ir_left ) ? MR : ir_left ); \ \ - ctype* restrict b_ir = b_ic + i * irstep_b; \ - ctype* restrict c_ir = c_jr + i * irstep_c; \ + ctype* b_ir = b_ic + i * irstep_b; \ + ctype* c_ir = c_jr + i * irstep_c; \ \ /* Invoke the gemmsup micro-kernel. 
*/ \ gemmsup_ker \ diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index acc4c30712..a5d66783fc 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -38,22 +38,22 @@ typedef void (*FUNCPTR_T) ( - bool packa, - bool packb, - conj_t conja, - conj_t conjb, - dim_t m, - dim_t n, - dim_t k, - void* restrict alpha, - void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b, - void* restrict beta, - void* restrict c, inc_t rs_c, inc_t cs_c, - stor3_t eff_id, - cntx_t* restrict cntx, - rntm_t* restrict rntm, - thrinfo_t* restrict thread + bool packa, + bool packb, + conj_t conja, + conj_t conjb, + dim_t m, + dim_t n, + dim_t k, + void* alpha, + void* a, inc_t rs_a, inc_t cs_a, + void* b, inc_t rs_b, inc_t cs_b, + void* beta, + void* c, inc_t rs_c, inc_t cs_c, + stor3_t eff_id, + cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ); // @@ -64,16 +64,16 @@ static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n); void bli_gemmsup_ref_var1n ( - trans_t trans, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - stor3_t eff_id, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + trans_t trans, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + stor3_t eff_id, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ) { #if 0 @@ -98,41 +98,41 @@ void bli_gemmsup_ref_var1n const dim_t k = bli_obj_width( &at ); - void* restrict buf_a = bli_obj_buffer_at_off( &at ); + void* buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); - void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + void* buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const 
inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #else - const num_t dt = bli_obj_dt( c ); + const num_t dt = bli_obj_dt( c ); - const bool packa = bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); + const bool packa = bli_rntm_pack_a( rntm ); + const bool packb = bli_rntm_pack_b( rntm ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - dim_t k; + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + dim_t k; - void* restrict buf_a = bli_obj_buffer_at_off( a ); - inc_t rs_a; - inc_t cs_a; + const void* buf_a = bli_obj_buffer_at_off( a ); + inc_t rs_a; + inc_t cs_a; - void* restrict buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b; - inc_t cs_b; + const void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b; + inc_t cs_b; if ( bli_obj_has_notrans( a ) ) { @@ -162,12 +162,12 @@ void bli_gemmsup_ref_var1n cs_b = bli_obj_row_stride( b ); } - void* restrict buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #endif @@ -193,13 +193,13 @@ void bli_gemmsup_ref_var1n m, n, k, - buf_alpha, - buf_a, rs_a, cs_a, - buf_b, rs_b, 
cs_b, - buf_beta, - buf_c, rs_c, cs_c, + ( void* )buf_alpha, + ( void* )buf_a, rs_a, cs_a, + ( void* )buf_b, rs_b, cs_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, eff_id, - cntx, + ( cntx_t* )cntx, rntm, thread ); @@ -216,13 +216,13 @@ void bli_gemmsup_ref_var1n n, // swap the m and n dimensions. m, k, - buf_alpha, - buf_b, cs_b, rs_b, // swap the positions of A and B. - buf_a, cs_a, rs_a, // swap the strides of A and B. - buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. + ( void* )buf_alpha, + ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. + ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. + ( void* )buf_beta, + buf_c, cs_c, rs_c, // swap the strides of C. bli_stor3_trans( eff_id ), // transpose the stor3_t id. - cntx, + ( cntx_t* )cntx, rntm, thread ); @@ -235,22 +235,22 @@ void bli_gemmsup_ref_var1n \ void PASTEMAC(ch,varname) \ ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -365,20 +365,20 @@ void PASTEMAC(ch,varname) \ /* Query the context for the sup microkernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ - ctype* restrict a_00 = a; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + ctype* a_00 = a; \ + ctype* b_00 = b; \ + ctype* c_00 = c; \ + ctype* alpha_cast = alpha; \ + ctype* beta_cast = beta; \ \ /* Make local copies of beta and one scalars to prevent any unnecessary sharing of cache lines between the cores' caches. */ \ ctype beta_local = *beta_cast; \ ctype one_local = *PASTEMAC(ch,1); \ \ - auxinfo_t aux; \ + auxinfo_t aux; \ \ /* Parse and interpret the contents of the rntm_t object to properly set the ways of parallelism for each loop. */ \ @@ -408,12 +408,12 @@ void PASTEMAC(ch,varname) \ That is, this panel-block algorithm partitions an NC x KC submatrix of A to be packed in the 4th loop, and a KC x MC submatrix of B to be packed in the 3rd loop. */ \ - /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ + /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* restrict bszids; \ + bszid_t* bszids; \ \ /* Set the bszids pointer to the correct bszids array above based on which matrices (if any) are being packed. */ \ @@ -425,16 +425,16 @@ void PASTEMAC(ch,varname) \ /* Determine whether we are using more than one thread. 
*/ \ const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ \ - thrinfo_t* restrict thread_jc = NULL; \ - thrinfo_t* restrict thread_pc = NULL; \ - thrinfo_t* restrict thread_pa = NULL; \ - thrinfo_t* restrict thread_ic = NULL; \ - thrinfo_t* restrict thread_pb = NULL; \ - thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* thread_jc = NULL; \ + thrinfo_t* thread_pc = NULL; \ + thrinfo_t* thread_pa = NULL; \ + thrinfo_t* thread_ic = NULL; \ + thrinfo_t* thread_pb = NULL; \ + thrinfo_t* thread_jr = NULL; \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_jc = bszids; \ - thread_jc = thread; \ + bszid_t* bszids_jc = bszids; \ + thread_jc = thread; \ bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ \ /* Compute the JC loop thread range for the current thread. */ \ @@ -453,12 +453,12 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current JC block dimension. */ \ const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ \ - ctype* restrict a_jc = a_00 + jj * jcstep_a; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + ctype* a_jc = a_00 + jj * jcstep_a; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bszid_t* bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ \ /* Compute the PC loop thread range for the current thread. */ \ @@ -476,14 +476,14 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current PC block dimension. */ \ const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_jc + pp * pcstep_a; \ - ctype* restrict b_pc = b_00 + pp * pcstep_b; \ + ctype* a_pc = a_jc + pp * pcstep_a; \ + ctype* b_pc = b_00 + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? 
&beta_local : &one_local ); \ + ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ \ ctype* a_use; \ - inc_t rs_a_use, cs_a_use, ps_a_use; \ + inc_t rs_a_use, cs_a_use, ps_a_use; \ \ /* Set the bszid_t array and thrinfo_t pointer based on whether we will be packing A. If we won't be packing A, we alias to @@ -493,7 +493,7 @@ void PASTEMAC(ch,varname) \ previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). */ \ - bszid_t* restrict bszids_pa; \ + bszid_t* bszids_pa; \ if ( packa ) { bszids_pa = &bszids_pc[1]; \ thread_pa = bli_thrinfo_sub_node( thread_pc ); } \ else { bszids_pa = &bszids_pc[0]; \ @@ -526,7 +526,7 @@ void PASTEMAC(ch,varname) \ \ /* Alias a_use so that it's clear this is our current block of matrix A. */ \ - ctype* restrict a_pc_use = a_use; \ + ctype* a_pc_use = a_use; \ \ /* We don't need to embed the panel stride of A within the auxinfo_t object because this variant iterates through A in the jr loop, @@ -535,8 +535,8 @@ void PASTEMAC(ch,varname) \ /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_ic = &bszids_pa[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pa ); \ + bszid_t* bszids_ic = &bszids_pa[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pa ); \ bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ \ /* Compute the IC loop thread range for the current thread. */ \ @@ -555,11 +555,11 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current IC block dimension. */ \ const dim_t mc_cur = ( MC <= ic_end - ii ? 
MC : ic_left ); \ \ - ctype* restrict b_ic = b_pc + ii * icstep_b; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* b_ic = b_pc + ii * icstep_b; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ ctype* b_use; \ - inc_t rs_b_use, cs_b_use, ps_b_use; \ + inc_t rs_b_use, cs_b_use, ps_b_use; \ \ /* Set the bszid_t array and thrinfo_t pointer based on whether we will be packing A. If we won't be packing A, we alias to @@ -569,7 +569,7 @@ void PASTEMAC(ch,varname) \ previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). */ \ - bszid_t* restrict bszids_pb; \ + bszid_t* bszids_pb; \ if ( packb ) { bszids_pb = &bszids_ic[1]; \ thread_pb = bli_thrinfo_sub_node( thread_ic ); } \ else { bszids_pb = &bszids_ic[0]; \ @@ -602,7 +602,7 @@ void PASTEMAC(ch,varname) \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ - ctype* restrict b_ic_use = b_use; \ + ctype* b_ic_use = b_use; \ \ /* Embed the panel stride of B within the auxinfo_t object. The millikernel will query and use this to iterate through @@ -610,8 +610,8 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_jr = &bszids_pb[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pb ); \ + bszid_t* bszids_jr = &bszids_pb[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pb ); \ bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ @@ -640,10 +640,10 @@ void PASTEMAC(ch,varname) \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? 
MR : jr_left ); \ \ /* - ctype* restrict a_jr = a_pc + j * jrstep_a; \ + ctype* a_jr = a_pc + j * jrstep_a; \ */ \ - ctype* restrict a_jr = a_pc_use + j * ps_a_use; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + ctype* a_jr = a_pc_use + j * ps_a_use; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ @@ -664,7 +664,7 @@ void PASTEMAC(ch,varname) \ a_jr, rs_a_use, cs_a_use, \ b_ic_use, rs_b_use, cs_b_use, \ beta_use, \ - c_jr, rs_c, cs_c, \ + c_jr, rs_c, cs_c, \ &aux, \ cntx \ ); \ @@ -712,16 +712,16 @@ static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m); void bli_gemmsup_ref_var2m ( - trans_t trans, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - stor3_t eff_id, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + trans_t trans, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + stor3_t eff_id, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ) { #if 0 @@ -746,41 +746,41 @@ void bli_gemmsup_ref_var2m const dim_t k = bli_obj_width( &at ); - void* restrict buf_a = bli_obj_buffer_at_off( &at ); + void* buf_a = bli_obj_buffer_at_off( &at ); const inc_t rs_a = bli_obj_row_stride( &at ); const inc_t cs_a = bli_obj_col_stride( &at ); - void* restrict buf_b = bli_obj_buffer_at_off( &bt ); + void* buf_b = bli_obj_buffer_at_off( &bt ); const inc_t rs_b = bli_obj_row_stride( &bt ); const inc_t cs_b = bli_obj_col_stride( &bt ); - void* restrict buf_c = bli_obj_buffer_at_off( c ); + void* buf_c = bli_obj_buffer_at_off( c ); const inc_t rs_c = bli_obj_row_stride( c ); const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #else - const num_t dt = bli_obj_dt( c ); + const num_t dt = bli_obj_dt( c ); - const bool packa = 
bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); + const bool packa = bli_rntm_pack_a( rntm ); + const bool packb = bli_rntm_pack_b( rntm ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + const conj_t conja = bli_obj_conj_status( a ); + const conj_t conjb = bli_obj_conj_status( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - dim_t k; + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + dim_t k; - void* restrict buf_a = bli_obj_buffer_at_off( a ); - inc_t rs_a; - inc_t cs_a; + const void* buf_a = bli_obj_buffer_at_off( a ); + inc_t rs_a; + inc_t cs_a; - void* restrict buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b; - inc_t cs_b; + const void* buf_b = bli_obj_buffer_at_off( b ); + inc_t rs_b; + inc_t cs_b; if ( bli_obj_has_notrans( a ) ) { @@ -810,12 +810,12 @@ void bli_gemmsup_ref_var2m cs_b = bli_obj_row_stride( b ); } - void* restrict buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); - void* restrict buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* restrict buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); + const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); #endif @@ -841,13 +841,13 @@ void bli_gemmsup_ref_var2m m, n, k, - buf_alpha, - buf_a, rs_a, cs_a, - buf_b, rs_b, cs_b, - buf_beta, - buf_c, rs_c, cs_c, + ( void* )buf_alpha, + ( void* )buf_a, rs_a, cs_a, + ( void* )buf_b, rs_b, cs_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, eff_id, - cntx, + ( cntx_t* )cntx, rntm, thread ); @@ -864,13 +864,13 @@ void bli_gemmsup_ref_var2m n, // swap the m and n dimensions. m, k, - buf_alpha, - buf_b, cs_b, rs_b, // swap the positions of A and B. 
- buf_a, cs_a, rs_a, // swap the strides of A and B. - buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. + ( void* )buf_alpha, + ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. + ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. + ( void* )buf_beta, + buf_c, cs_c, rs_c, // swap the strides of C. bli_stor3_trans( eff_id ), // transpose the stor3_t id. - cntx, + ( cntx_t* )cntx, rntm, thread ); @@ -883,22 +883,22 @@ void bli_gemmsup_ref_var2m \ void PASTEMAC(ch,varname) \ ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + stor3_t stor_id, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -998,13 +998,13 @@ void PASTEMAC(ch,varname) \ /* Query the context for the sup microkernel address and cast it to its function pointer type. 
*/ \ PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ + gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ \ - ctype* restrict a_00 = a; \ - ctype* restrict b_00 = b; \ - ctype* restrict c_00 = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ + ctype* a_00 = a; \ + ctype* b_00 = b; \ + ctype* c_00 = c; \ + ctype* alpha_cast = alpha; \ + ctype* beta_cast = beta; \ \ /* Make local copies of beta and one scalars to prevent any unnecessary sharing of cache lines between the cores' caches. */ \ @@ -1035,7 +1035,7 @@ void PASTEMAC(ch,varname) \ bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* restrict bszids; \ + bszid_t* bszids; \ \ /* Set the bszids pointer to the correct bszids array above based on which matrices (if any) are being packed. */ \ @@ -1047,16 +1047,16 @@ void PASTEMAC(ch,varname) \ /* Determine whether we are using more than one thread. */ \ const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ \ - thrinfo_t* restrict thread_jc = NULL; \ - thrinfo_t* restrict thread_pc = NULL; \ - thrinfo_t* restrict thread_pb = NULL; \ - thrinfo_t* restrict thread_ic = NULL; \ - thrinfo_t* restrict thread_pa = NULL; \ - thrinfo_t* restrict thread_jr = NULL; \ + thrinfo_t* thread_jc = NULL; \ + thrinfo_t* thread_pc = NULL; \ + thrinfo_t* thread_pb = NULL; \ + thrinfo_t* thread_ic = NULL; \ + thrinfo_t* thread_pa = NULL; \ + thrinfo_t* thread_jr = NULL; \ \ /* Grow the thrinfo_t tree. 
*/ \ - bszid_t* restrict bszids_jc = bszids; \ - thread_jc = thread; \ + bszid_t* bszids_jc = bszids; \ + thread_jc = thread; \ bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ \ /* Compute the JC loop thread range for the current thread. */ \ @@ -1075,12 +1075,12 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current JC block dimension. */ \ const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ \ - ctype* restrict b_jc = b_00 + jj * jcstep_b; \ - ctype* restrict c_jc = c_00 + jj * jcstep_c; \ + ctype* b_jc = b_00 + jj * jcstep_b; \ + ctype* c_jc = c_00 + jj * jcstep_c; \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + bszid_t* bszids_pc = &bszids_jc[1]; \ + thread_pc = bli_thrinfo_sub_node( thread_jc ); \ bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ \ /* Compute the PC loop thread range for the current thread. */ \ @@ -1098,11 +1098,11 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current PC block dimension. */ \ const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ \ - ctype* restrict a_pc = a_00 + pp * pcstep_a; \ - ctype* restrict b_pc = b_jc + pp * pcstep_b; \ + ctype* a_pc = a_00 + pp * pcstep_a; \ + ctype* b_pc = b_jc + pp * pcstep_b; \ \ /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* restrict beta_use = ( pp == 0 ? &beta_local : &one_local ); \ + ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ \ ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ @@ -1115,7 +1115,7 @@ void PASTEMAC(ch,varname) \ previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). 
*/ \ - bszid_t* restrict bszids_pb; \ + bszid_t* bszids_pb; \ if ( packb ) { bszids_pb = &bszids_pc[1]; \ thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ else { bszids_pb = &bszids_pc[0]; \ @@ -1146,7 +1146,7 @@ void PASTEMAC(ch,varname) \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ - ctype* restrict b_pc_use = b_use; \ + ctype* b_pc_use = b_use; \ \ /* We don't need to embed the panel stride of B within the auxinfo_t object because this variant iterates through B in the jr loop, @@ -1155,8 +1155,8 @@ void PASTEMAC(ch,varname) \ /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_ic = &bszids_pb[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + bszid_t* bszids_ic = &bszids_pb[1]; \ + thread_ic = bli_thrinfo_sub_node( thread_pb ); \ bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ \ /* Compute the IC loop thread range for the current thread. */ \ @@ -1175,8 +1175,8 @@ void PASTEMAC(ch,varname) \ /* Calculate the thread's current IC block dimension. */ \ const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ \ - ctype* restrict a_ic = a_pc + ii * icstep_a; \ - ctype* restrict c_ic = c_jc + ii * icstep_c; \ + ctype* a_ic = a_pc + ii * icstep_a; \ + ctype* c_ic = c_jc + ii * icstep_c; \ \ ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ @@ -1189,7 +1189,7 @@ void PASTEMAC(ch,varname) \ previous call to bli_thrinfo_grow(), since bszid values of BLIS_NO_PART cause the tree to grow by two (e.g. to the next bszid that is a normal bszid_t value). */ \ - bszid_t* restrict bszids_pa; \ + bszid_t* bszids_pa; \ if ( packa ) { bszids_pa = &bszids_ic[1]; \ thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ else { bszids_pa = &bszids_ic[0]; \ @@ -1220,7 +1220,7 @@ void PASTEMAC(ch,varname) \ \ /* Alias a_use so that it's clear this is our current block of matrix A. 
*/ \ - ctype* restrict a_ic_use = a_use; \ + ctype* a_ic_use = a_use; \ \ /* Embed the panel stride of A within the auxinfo_t object. The millikernel will query and use this to iterate through @@ -1228,8 +1228,8 @@ void PASTEMAC(ch,varname) \ bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ \ /* Grow the thrinfo_t tree. */ \ - bszid_t* restrict bszids_jr = &bszids_pa[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + bszid_t* bszids_jr = &bszids_pa[1]; \ + thread_jr = bli_thrinfo_sub_node( thread_pa ); \ bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ \ /* Compute number of primary and leftover components of the JR loop. */ \ @@ -1258,10 +1258,10 @@ void PASTEMAC(ch,varname) \ const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ \ /* - ctype* restrict b_jr = b_pc_use + j * jrstep_b; \ + ctype* b_jr = b_pc_use + j * jrstep_b; \ */ \ - ctype* restrict b_jr = b_pc_use + j * ps_b_use; \ - ctype* restrict c_jr = c_ic + j * jrstep_c; \ + ctype* b_jr = b_pc_use + j * ps_b_use; \ + ctype* c_jr = c_ic + j * jrstep_c; \ \ /* const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ @@ -1282,7 +1282,7 @@ void PASTEMAC(ch,varname) \ a_ic_use, rs_a_use, cs_a_use, \ b_jr, rs_b_use, cs_b_use, \ beta_use, \ - c_jr, rs_c, cs_c, \ + c_jr, rs_c, cs_c, \ &aux, \ cntx \ ); \ diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index ead9925e68..df9a747abd 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -42,16 +42,16 @@ \ void PASTEMAC0(opname) \ ( \ - trans_t trans, \ - obj_t* alpha, \ - obj_t* a, \ - obj_t* b, \ - obj_t* beta, \ - obj_t* c, \ - stor3_t eff_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ + trans_t trans, \ + const obj_t* alpha, \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* beta, \ + const obj_t* c, \ + stor3_t eff_id, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ); GENPROT( gemmsup_ref_var1 ) @@ -70,20 +70,20 @@ GENPROT( gemmsup_ref_var2m ) \ void 
PASTEMAC(ch,varname) \ ( \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + stor3_t eff_id, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) @@ -94,22 +94,22 @@ INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) \ void PASTEMAC(ch,varname) \ ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* restrict alpha, \ - void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b, \ - void* restrict beta, \ - void* restrict c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - thrinfo_t* restrict thread \ + bool packa, \ + bool packb, \ + conj_t conja, \ + conj_t conjb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + void* alpha, \ + void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b, \ + void* beta, \ + void* c, inc_t rs_c, inc_t cs_c, \ + stor3_t eff_id, \ + cntx_t* cntx, \ + rntm_t* rntm, \ + thrinfo_t* thread \ ); INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) @@ -119,12 +119,12 @@ INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases ( - num_t dt, - trans_t* trans, - bool packa, - bool packb, - stor3_t* eff_id, - cntx_t* cntx + num_t dt, + trans_t* trans, + bool packa, + bool packb, + stor3_t* eff_id, + const cntx_t* cntx ) { const bool row_pref = bli_cntx_ukr_prefers_rows_dt( dt, bli_stor3_ukr( 
*eff_id ), cntx ); diff --git a/frame/3/bli_l3_tapi.c b/frame/3/bli_l3_tapi.c index afec5b677a..8f256a11aa 100644 --- a/frame/3/bli_l3_tapi.c +++ b/frame/3/bli_l3_tapi.c @@ -43,16 +43,16 @@ \ void PASTEMAC(ch,opname) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -80,16 +80,16 @@ INSERT_GENTFUNC_BASIC0( gemm ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -118,17 +118,17 @@ INSERT_GENTFUNC_BASIC0( gemmt ) \ void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + 
ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -159,14 +159,14 @@ INSERT_GENTFUNC_BASIC( symm, BLIS_SYMMETRIC ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -193,16 +193,16 @@ INSERT_GENTFUNCR_BASIC0( herk ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -231,14 +231,14 @@ INSERT_GENTFUNCR_BASIC0( her2k ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -265,16 +265,16 @@ INSERT_GENTFUNC_BASIC0( syrk ) \ void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - 
dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -303,18 +303,18 @@ INSERT_GENTFUNC_BASIC0( syr2k ) \ void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t @@ -345,15 +345,15 @@ INSERT_GENTFUNC_BASIC0( trmm3 ) \ void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ ) \ { \ /* Invoke the expert interface and request default cntx_t and rntm_t diff --git a/frame/3/bli_l3_tapi.h b/frame/3/bli_l3_tapi.h index 4b35040018..9b7a9b0771 100644 --- a/frame/3/bli_l3_tapi.h +++ b/frame/3/bli_l3_tapi.h @@ -43,16 +43,16 @@ \ BLIS_EXPORT_BLIS void 
PASTEMAC(ch,opname) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemm ) @@ -62,17 +62,17 @@ INSERT_GENTPROT_BASIC0( gemm ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( hemm ) @@ -84,14 +84,14 @@ INSERT_GENTPROT_BASIC0( symm ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( herk ) @@ -102,16 +102,16 @@ INSERT_GENTPROTR_BASIC0( herk ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, 
\ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROTR_BASIC0( her2k ) @@ -122,14 +122,14 @@ INSERT_GENTPROTR_BASIC0( her2k ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( syrk ) @@ -140,16 +140,16 @@ INSERT_GENTPROT_BASIC0( syrk ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( gemmt ) @@ -161,18 +161,18 @@ INSERT_GENTPROT_BASIC0( syr2k ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, 
inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c \ ); INSERT_GENTPROT_BASIC0( trmm3 ) @@ -183,15 +183,15 @@ INSERT_GENTPROT_BASIC0( trmm3 ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT_BASIC0( trmm ) diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c index f6a52fb5e9..c934ba9493 100644 --- a/frame/3/bli_l3_tapi_ex.c +++ b/frame/3/bli_l3_tapi_ex.c @@ -44,18 +44,18 @@ \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -74,12 +74,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, k, n, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( 
dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_conjtrans( transa, &ao ); \ bli_obj_set_conjtrans( transb, &bo ); \ @@ -103,19 +103,19 @@ INSERT_GENTFUNC_BASIC0( gemm ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -134,12 +134,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dim_with_side( side, m, n, &mn_a ); \ bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_conj( conja, &ao ); \ @@ -169,16 +169,16 @@ INSERT_GENTFUNC_BASIC( symm, 
BLIS_SYMMETRIC ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -195,11 +195,11 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ \ - bli_obj_init_finish_1x1( dt_r, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt_r, beta, &betao ); \ + bli_obj_init_finish_1x1( dt_r, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt_r, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ @@ -225,18 +225,18 @@ INSERT_GENTFUNCR_BASIC0( herk ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -256,12 +256,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ 
bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt_r, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt_r, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ @@ -289,16 +289,16 @@ INSERT_GENTFUNCR_BASIC0( her2k ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -314,11 +314,11 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ 
bli_obj_set_conjtrans( transa, &ao ); \ @@ -344,18 +344,18 @@ INSERT_GENTFUNC_BASIC0( syrk ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -374,12 +374,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, m, k, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ @@ -407,18 +407,18 @@ INSERT_GENTFUNC_BASIC0( syr2k ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ 
+ trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -437,12 +437,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dims_with_trans( transa, m, k, &m_a, &n_a ); \ bli_set_dims_with_trans( transb, k, m, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, m_a, n_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, m_a, n_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, m, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploc, &co ); \ bli_obj_set_conjtrans( transa, &ao ); \ @@ -468,20 +468,20 @@ INSERT_GENTFUNC_BASIC0( gemmt ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -500,12 +500,12 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ bli_set_dim_with_side( 
side, m, n, &mn_a ); \ bli_set_dims_with_trans( transb, m, n, &m_b, &n_b ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ - bli_obj_init_finish_1x1( dt, beta, &betao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )beta, &betao ); \ \ - bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m_b, n_b, b, rs_b, cs_b, &bo ); \ - bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ + bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m_b, n_b, ( void* )b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, m, n, c, rs_c, cs_c, &co ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_diag( diaga, &ao ); \ @@ -535,17 +535,17 @@ INSERT_GENTFUNC_BASIC0( trmm3 ) \ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -560,10 +560,10 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ \ bli_set_dim_with_side( side, m, n, &mn_a ); \ \ - bli_obj_init_finish_1x1( dt, alpha, &alphao ); \ + bli_obj_init_finish_1x1( dt, ( void* )alpha, &alphao ); \ \ - bli_obj_init_finish( dt, mn_a, mn_a, a, rs_a, cs_a, &ao ); \ - bli_obj_init_finish( dt, m, n, b, rs_b, cs_b, &bo ); \ + bli_obj_init_finish( dt, mn_a, mn_a, ( void* )a, rs_a, cs_a, &ao ); \ + bli_obj_init_finish( dt, m, n, b, rs_b, cs_b, &bo ); \ \ bli_obj_set_uplo( uploa, &ao ); \ bli_obj_set_diag( diaga, &ao ); \ diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h index 1ab0a8ff1a..eb142af05d 
100644 --- a/frame/3/bli_l3_tapi_ex.h +++ b/frame/3/bli_l3_tapi_ex.h @@ -43,18 +43,18 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) @@ -64,19 +64,19 @@ INSERT_GENTPROT_BASIC0( gemm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - conj_t conja, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + conj_t conja, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) @@ -88,16 +88,16 @@ INSERT_GENTPROT_BASIC0( symm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype_r* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype_r* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype_r* beta, \ + ctype* c, 
inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) @@ -108,18 +108,18 @@ INSERT_GENTPROTR_BASIC0( herk ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype_r* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype_r* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) @@ -130,16 +130,16 @@ INSERT_GENTPROTR_BASIC0( her2k ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) @@ -150,18 +150,18 @@ INSERT_GENTPROT_BASIC0( syrk ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - uplo_t uploc, \ - trans_t transa, \ - trans_t transb, \ - dim_t m, \ - dim_t k, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + uplo_t uploc, \ + trans_t transa, \ + trans_t transb, \ + dim_t m, \ + dim_t k, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t 
rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) @@ -173,20 +173,20 @@ INSERT_GENTPROT_BASIC0( syr2k ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - trans_t transb, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype* beta, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + trans_t transb, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + const ctype* b, inc_t rs_b, inc_t cs_b, \ + const ctype* beta, \ + ctype* c, inc_t rs_c, inc_t cs_c, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) @@ -197,17 +197,17 @@ INSERT_GENTPROT_BASIC0( trmm3 ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ ( \ - side_t side, \ - uplo_t uploa, \ - trans_t transa, \ - diag_t diaga, \ - dim_t m, \ - dim_t n, \ - ctype* alpha, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - cntx_t* cntx, \ - rntm_t* rntm \ + side_t side, \ + uplo_t uploa, \ + trans_t transa, \ + diag_t diaga, \ + dim_t m, \ + dim_t n, \ + const ctype* alpha, \ + const ctype* a, inc_t rs_a, inc_t cs_a, \ + ctype* b, inc_t rs_b, inc_t cs_b, \ + const cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) diff --git a/frame/3/bli_l3_ukr_prot.h b/frame/3/bli_l3_ukr_prot.h index 677afc0202..44a59bd4c9 100644 --- a/frame/3/bli_l3_ukr_prot.h +++ b/frame/3/bli_l3_ukr_prot.h @@ -50,8 +50,8 @@ void PASTEMAC(ch,opname) \ ctype_in* restrict b, \ ctype_out* restrict beta, \ ctype_out* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); @@ -68,8 
+68,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); @@ -80,7 +80,7 @@ void PASTEMAC(ch,opname) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ); diff --git a/frame/3/bli_l3_ukr_tapi.c b/frame/3/bli_l3_ukr_tapi.c index 56eaf3f4ce..c2e8ed5d5d 100644 --- a/frame/3/bli_l3_ukr_tapi.c +++ b/frame/3/bli_l3_ukr_tapi.c @@ -47,8 +47,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ bli_init_once(); \ @@ -92,8 +92,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ bli_init_once(); \ @@ -133,8 +133,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ bli_init_once(); \ diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index de077e5adc..485779a902 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -37,44 +37,47 @@ void bli_gemm_blk_var1 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t a1, c1; - dim_t my_start, my_end; - dim_t b_alg; + obj_t ap, cp; + bli_obj_alias_to( a, &ap ); + bli_obj_alias_to( c, &cp 
); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + const dir_t direct = bli_l3_direct( &ap, b, &cp, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntl ); + bli_l3_prune_unref_mparts_m( &ap, b, &cp, cntl ); // Determine the current thread's subpartition range. + dim_t my_start, my_end; bli_thread_range_mdim ( - direct, thread, a, b, c, cntl, cntx, + direct, thread, &ap, b, &cp, cntl, cntx, &my_start, &my_end ); // Partition along the m dimension. + dim_t b_alg; for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, a, + b_alg = bli_determine_blocksize( direct, i, my_end, &ap, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. + obj_t a1, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); + i, b_alg, &ap, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); + i, b_alg, &cp, &c1 ); // Perform gemm subproblem. bli_l3_int diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 53943e47cd..254a310648 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -37,44 +37,47 @@ void bli_gemm_blk_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t b1, c1; - dim_t my_start, my_end; - dim_t b_alg; + obj_t bp, cp; + bli_obj_alias_to( b, &bp ); + bli_obj_alias_to( c, &cp ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( a, &bp, &cp, cntl ); // Prune any zero region that exists along the partitioning dimension. 
- bli_l3_prune_unref_mparts_n( a, b, c, cntl ); + bli_l3_prune_unref_mparts_n( a, &bp, &cp, cntl ); // Determine the current thread's subpartition range. + dim_t my_start, my_end; bli_thread_range_ndim ( - direct, thread, a, b, c, cntl, cntx, + direct, thread, a, &bp, &cp, cntl, cntx, &my_start, &my_end ); // Partition along the n dimension. + dim_t b_alg; for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, b, + b_alg = bli_determine_blocksize( direct, i, my_end, &bp, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. + obj_t b1, c1; bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); + i, b_alg, &bp, &b1 ); bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); + i, b_alg, &cp, &c1 ); // Perform gemm subproblem. bli_l3_int diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 28029777de..1bbec1d957 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -36,39 +36,43 @@ void bli_gemm_blk_var3 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t a1, b1; - dim_t b_alg; + obj_t ap, bp, cs; + bli_obj_alias_to( a, &ap ); + bli_obj_alias_to( b, &bp ); + bli_obj_alias_to( c, &cs ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntl ); + bli_l3_prune_unref_mparts_k( &ap, &bp, &cs, cntl ); // Query dimension in partitioning direction. 
- dim_t k_trans = bli_obj_width_after_trans( a ); + dim_t k_trans = bli_obj_width_after_trans( &ap ); // Partition along the k dimension. + dim_t b_alg; for ( dim_t i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_l3_determine_kc( direct, i, k_trans, a, b, + b_alg = bli_l3_determine_kc( direct, i, k_trans, &ap, &bp, bli_cntl_bszid( cntl ), cntx, cntl ); // Acquire partitions for A1 and B1. + obj_t a1, b1; bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); + i, b_alg, &ap, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); + i, b_alg, &bp, &b1 ); // Perform gemm subproblem. bli_l3_int @@ -77,7 +81,7 @@ void bli_gemm_blk_var3 &a1, &b1, &BLIS_ONE, - c, + &cs, cntx, rntm, bli_cntl_sub_node( cntl ), @@ -107,7 +111,7 @@ void bli_gemm_blk_var3 // Thus, for neither trmm nor trmm3 should we reset the scalar on C // after the first iteration. if ( bli_cntl_family( cntl ) != BLIS_TRMM ) - if ( i == 0 ) bli_obj_scalar_reset( c ); + if ( i == 0 ) bli_obj_scalar_reset( &cs ); } } diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index cd8827bd9c..428e2079f8 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -37,14 +37,14 @@ void bli_gemm_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); @@ -163,8 +163,8 @@ void bli_gemm_front rntm ); - obj_t* cp = &c_local; - obj_t* betap = beta; + obj_t* cp = &c_local; + const obj_t* betap = beta; #ifdef BLIS_ENABLE_GEMM_MD #ifdef BLIS_ENABLE_GEMM_MD_EXTRA_MEM diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 2728ce8f7f..744f88d1b2 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -34,26 +34,26 @@ 
void bli_gemm_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX err_t bli_gemm_small ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + cntl_t* cntl ); #endif diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 874a12439c..814b47c0cf 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -77,38 +77,38 @@ static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn); void bli_gemm_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - num_t dt_c = bli_obj_dt( c ); + num_t dt_exec = bli_obj_exec_dt( c ); + num_t dt_c = bli_obj_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + pack_t schema_a = bli_obj_pack_schema( a ); + pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); - char* a_cast = bli_obj_buffer_at_off( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const char* a_cast = bli_obj_buffer_at_off( a ); + inc_t is_a = bli_obj_imag_stride( a ); + dim_t pd_a = bli_obj_panel_dim( a ); + inc_t ps_a = bli_obj_panel_stride( a ); - char* b_cast 
= bli_obj_buffer_at_off( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const char* b_cast = bli_obj_buffer_at_off( b ); + inc_t is_b = bli_obj_imag_stride( b ); + dim_t pd_b = bli_obj_panel_dim( b ); + inc_t ps_b = bli_obj_panel_stride( b ); - char* c_cast = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + char* c_cast = bli_obj_buffer_at_off( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); // If any dimension is zero, return immediately. if ( bli_zero_dim3( m, n, k ) ) return; @@ -129,8 +129,8 @@ void bli_gemm_ker_var2 // that casts the scalars of A and B to dt_exec via scalar_a and scalar_b, // and we know that the internal scalar in C is already of the type dt_c // due to the casting in the implementation of bli_obj_scalar_attach(). - char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); - char* beta_cast = bli_obj_internal_scalar_buffer( c ); + const char* alpha_cast = bli_obj_internal_scalar_buffer( &scalar_b ); + const char* beta_cast = bli_obj_internal_scalar_buffer( c ); // If 1m is being employed on a column- or row-stored matrix with a // real-valued beta, we can use the real domain macro-kernel, which @@ -174,14 +174,12 @@ void bli_gemm_ker_var2 } #endif - siz_t dt_size = bli_dt_size( dt_exec ); - siz_t dt_c_size = bli_dt_size( dt_c ); + const siz_t dt_size = bli_dt_size( dt_exec ); + const siz_t dt_c_size = bli_dt_size( dt_c ); // Alias some constants to simpler names. - const dim_t MR = pd_a; - const dim_t NR = pd_b; - //const dim_t PACKMR = cs_a; - //const dim_t PACKNR = rs_b; + const dim_t MR = pd_a; + const dim_t NR = pd_b; // Query the context for the micro-kernel address and cast it to its // function pointer type. @@ -191,7 +189,7 @@ void bli_gemm_ker_var2 // field of the params struct. 
If that function pointer is non-NULL, use it // as our microkernel instead of the default microkernel queried from the // cntx above. - gemm_ker_params_t* params = bli_obj_ker_params( c ); + const gemm_ker_params_t* params = bli_obj_ker_params( c ); gemm_ukr_vft user_ukr = params ? params->ukr : NULL; if ( user_ukr ) gemm_ukr = user_ukr; @@ -204,7 +202,7 @@ void bli_gemm_ker_var2 const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_exec, BLIS_GEMM_VIR_UKR, cntx ); const inc_t rs_ct = ( col_pref ? 1 : NR ); const inc_t cs_ct = ( col_pref ? MR : 1 ); - char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); + const char* zero = bli_obj_buffer_for_const( dt_exec, &BLIS_ZERO ); // // Assumptions/assertions: @@ -277,24 +275,24 @@ void bli_gemm_ker_var2 // Loop over the n dimension (NR columns at a time). for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) { - char* b1 = b_cast + j * cstep_b; - char* c1 = c_cast + j * cstep_c; + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; - dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + const dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); // Initialize our next panel of B to be the current panel of B. - char* b2 = b1; + const char* b2 = b1; // Loop over the m dimension (MR rows at a time). for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) { - char* a1 = a_cast + i * rstep_a; - char* c11 = c1 + i * rstep_c; + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; - dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + const dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); // Compute the addresses of the next panels of A and B. 
- char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); + const char* a2 = bli_gemm_get_next_a_upanel( a1, rstep_a, ir_inc ); if ( bli_is_last_iter( i, ir_end, ir_tid, ir_nt ) ) { a2 = a_cast; @@ -320,13 +318,13 @@ void bli_gemm_ker_var2 m_cur, n_cur, k, - alpha_cast, - a1, - b1, - beta_cast, - c11, rs_c, cs_c, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, &aux, - cntx + ( cntx_t* )cntx ); } else @@ -337,13 +335,13 @@ void bli_gemm_ker_var2 MR, NR, k, - alpha_cast, - a1, - b1, - zero, - &ct, rs_ct, cs_ct, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + &ct, rs_ct, cs_ct, &aux, - cntx + ( cntx_t* )cntx ); // Accumulate to C with type-casting. @@ -351,7 +349,7 @@ void bli_gemm_ker_var2 ( m_cur, n_cur, &ct, rs_ct, cs_ct, - beta_cast, + ( void* )beta_cast, c11, rs_c, cs_c ); } diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index 6202cfffdd..a283c12354 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -39,12 +39,12 @@ void bli_gemm_md ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -148,12 +148,12 @@ void bli_gemm_md // cab mddm_t bli_gemm_md_ccr ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -201,48 +201,51 @@ mddm_t bli_gemm_md_ccr // Copy the real domain blocksizes into the slots of their complex // counterparts. 
- blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); - blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); - blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); - blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); - blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local ); + blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local ); + blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local ); + blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local ); + blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mr, BLIS_SCOMPLEX, &blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nr, BLIS_SCOMPLEX, &blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mc, BLIS_SCOMPLEX, &blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nc, BLIS_SCOMPLEX, &blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_kc, BLIS_SCOMPLEX, &blksz_kc ); + 
bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc ); // Halve both the real and complex MR's (which are both real MR's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_mr ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_mr ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mr ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_mr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_mr ); // Halve both the real and complex MC's (which are both real MC's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_mc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_mc ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_mc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_mc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_mc ); - // Use the default pack schemas in the objects. + bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local ); - // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); + // Use the default pack schemas in the objects. // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. 
- bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs ); - bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs ); + bli_cntx_set_ukr_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local ); + bli_cntx_set_ukr_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local ); // Return the computation and execution domains. return doms; @@ -253,12 +256,12 @@ mddm_t bli_gemm_md_ccr // cab mddm_t bli_gemm_md_crc ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -306,48 +309,51 @@ mddm_t bli_gemm_md_crc // Copy the real domain blocksizes into the slots of their complex // counterparts. - blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); - blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); - blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); - blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); - blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local ); + blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local ); + blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local ); + blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local ); + blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mr, BLIS_SCOMPLEX, &blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nr, BLIS_SCOMPLEX, &blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr ); 
- bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mc, BLIS_SCOMPLEX, &blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nc, BLIS_SCOMPLEX, &blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_kc, BLIS_SCOMPLEX, &blksz_kc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc ); // Halve both the real and complex NR's (which are both real NR's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_nr ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_nr ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nr ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_nr ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_nr ); // Halve both the real and complex NC's (which are both real NC's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_nc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_nc ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_nc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_nc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_nc ); - // Use the default pack schemas in the objects. 
+ bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local ); - // static func_t* bli_cntx_get_ukrs( ukr_t ukr_id, cntx_t* cntx ) - func_t* l3_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, *cntx ); + // Use the default pack schemas in the objects. // Rather than check which complex datatype dt_comp refers to, we set // the mixed-domain virtual microkernel for both types. - bli_func_set_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, l3_vir_ukrs ); - bli_func_set_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, l3_vir_ukrs ); + bli_cntx_set_ukr_dt( bli_cgemm_md_c2r_ref, BLIS_SCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local ); + bli_cntx_set_ukr_dt( bli_zgemm_md_c2r_ref, BLIS_DCOMPLEX, BLIS_GEMM_VIR_UKR, cntx_local ); // Return the computation and execution domains. return doms; @@ -358,12 +364,12 @@ mddm_t bli_gemm_md_crc // cab mddm_t bli_gemm_md_rcc ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -384,32 +390,38 @@ mddm_t bli_gemm_md_rcc // Copy the real domain blocksizes into the slots of their complex // counterparts. 
- blksz_t* blksz_mr = bli_cntx_get_blksz( BLIS_MR, *cntx ); - blksz_t* blksz_nr = bli_cntx_get_blksz( BLIS_NR, *cntx ); - blksz_t* blksz_mc = bli_cntx_get_blksz( BLIS_MC, *cntx ); - blksz_t* blksz_nc = bli_cntx_get_blksz( BLIS_NC, *cntx ); - blksz_t* blksz_kc = bli_cntx_get_blksz( BLIS_KC, *cntx ); + blksz_t blksz_mr = *bli_cntx_get_blksz( BLIS_MR, cntx_local ); + blksz_t blksz_nr = *bli_cntx_get_blksz( BLIS_NR, cntx_local ); + blksz_t blksz_mc = *bli_cntx_get_blksz( BLIS_MC, cntx_local ); + blksz_t blksz_nc = *bli_cntx_get_blksz( BLIS_NC, cntx_local ); + blksz_t blksz_kc = *bli_cntx_get_blksz( BLIS_KC, cntx_local ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mr, BLIS_SCOMPLEX, blksz_mr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mr, BLIS_DCOMPLEX, blksz_mr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mr, BLIS_SCOMPLEX, &blksz_mr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mr, BLIS_DCOMPLEX, &blksz_mr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nr, BLIS_SCOMPLEX, blksz_nr ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nr, BLIS_DCOMPLEX, blksz_nr ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nr, BLIS_SCOMPLEX, &blksz_nr ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nr, BLIS_DCOMPLEX, &blksz_nr ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_mc, BLIS_SCOMPLEX, blksz_mc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_mc, BLIS_DCOMPLEX, blksz_mc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_mc, BLIS_SCOMPLEX, &blksz_mc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_mc, BLIS_DCOMPLEX, &blksz_mc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_nc, BLIS_SCOMPLEX, blksz_nc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_nc, BLIS_DCOMPLEX, blksz_nc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_nc, BLIS_SCOMPLEX, &blksz_nc ); + bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_nc, BLIS_DCOMPLEX, &blksz_nc ); - bli_blksz_copy_dt( BLIS_FLOAT, blksz_kc, BLIS_SCOMPLEX, blksz_kc ); - bli_blksz_copy_dt( BLIS_DOUBLE, blksz_kc, BLIS_DCOMPLEX, blksz_kc ); + bli_blksz_copy_dt( BLIS_FLOAT, &blksz_kc, BLIS_SCOMPLEX, &blksz_kc ); + 
bli_blksz_copy_dt( BLIS_DOUBLE, &blksz_kc, BLIS_DCOMPLEX, &blksz_kc ); // Halve both the real and complex KC's (which are both real KC's). - bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, blksz_kc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, blksz_kc ); - bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, blksz_kc ); - bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_FLOAT, &blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DOUBLE, &blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_SCOMPLEX, &blksz_kc ); + bli_blksz_scale_def_max( 1, 2, BLIS_DCOMPLEX, &blksz_kc ); + + bli_cntx_set_blksz( BLIS_MR, &blksz_mr, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NR, &blksz_nr, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_MC, &blksz_mc, BLIS_MR, cntx_local ); + bli_cntx_set_blksz( BLIS_NC, &blksz_nc, BLIS_NR, cntx_local ); + bli_cntx_set_blksz( BLIS_KC, &blksz_kc, BLIS_KC, cntx_local ); // Use the 1r pack schema for both A and B with the conjugation // of A or B toggled (to produce ar * br - ai * bi). @@ -427,14 +439,14 @@ mddm_t bli_gemm_md_rcc // the target datatype. (The packm_blk_var1_md() function has "built-in" // support for packing to 1r (and 1e) schemas, whereas the // packm_blk_var1() function relies on packm kernels for packing to 1r. 
- const num_t dt_complex = bli_obj_dt( a ); - cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex ); + const num_t dt_complex = bli_obj_dt( a ); + const cntx_t* cntx_1m = bli_gks_query_ind_cntx( BLIS_1M, dt_complex ); - func_t* cntx_funcs = bli_cntx_ukrs_buf( *cntx ); - func_t* cntx_1m_funcs = bli_cntx_ukrs_buf( cntx_1m ); + const func_t* packm_1m_mr = bli_cntx_get_ukrs( BLIS_PACKM_MRXK_KER, cntx_1m ); + const func_t* packm_1m_nr = bli_cntx_get_ukrs( BLIS_PACKM_NRXK_KER, cntx_1m ); - cntx_funcs[ BLIS_PACKM_MRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_MRXK_KER ]; - cntx_funcs[ BLIS_PACKM_NRXK_KER ] = cntx_1m_funcs[ BLIS_PACKM_NRXK_KER ]; + bli_cntx_set_ukr( BLIS_PACKM_MRXK_KER, packm_1m_mr, cntx_local ); + bli_cntx_set_ukr( BLIS_PACKM_NRXK_KER, packm_1m_nr, cntx_local ); // Return the computation and execution domains. return doms; @@ -445,12 +457,12 @@ mddm_t bli_gemm_md_rcc // cab mddm_t bli_gemm_md_crr ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -502,12 +514,12 @@ mddm_t bli_gemm_md_crr // cab mddm_t bli_gemm_md_rcr ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -540,12 +552,12 @@ mddm_t bli_gemm_md_rcr // cab mddm_t bli_gemm_md_rrc ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -578,12 +590,12 @@ mddm_t bli_gemm_md_rrc // cab mddm_t bli_gemm_md_rrr ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; @@ -608,12 
+620,12 @@ mddm_t bli_gemm_md_rrr // cab mddm_t bli_gemm_md_ccc ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ) { mddm_t doms; diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h index 751e271eaf..d71d97987a 100644 --- a/frame/3/gemm/bli_gemm_md.h +++ b/frame/3/gemm/bli_gemm_md.h @@ -43,51 +43,51 @@ typedef struct mddm_s void bli_gemm_md ( - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + cntx_t* cntx_local, + const cntx_t** cntx ); -mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); -mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, obj_t* beta, obj_t* c, cntx_t* cntx_l, cntx_t** cntx ); +mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* 
b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); +mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); // ----------------------------------------------------------------------------- void bli_gemm_md_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); void bli_gemm_md_zgemm ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); // ----------------------------------------------------------------------------- -BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) +BLIS_INLINE bool bli_gemm_md_is_crr( const obj_t* a, const obj_t* b, const obj_t* c ) { bool r_val = FALSE; @@ -107,7 +107,7 @@ BLIS_INLINE bool bli_gemm_md_is_crr( obj_t* a, obj_t* b, obj_t* c ) return r_val; } -BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) +BLIS_INLINE bool bli_gemm_md_is_ccr( const obj_t* a, const obj_t* b, const obj_t* c ) { bool r_val = FALSE; @@ -127,7 +127,7 @@ BLIS_INLINE bool bli_gemm_md_is_ccr( obj_t* a, obj_t* b, obj_t* c ) return r_val; } -BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) +BLIS_INLINE bool bli_gemm_md_is_crc( const obj_t* a, const obj_t* b, const obj_t* c ) { bool r_val = FALSE; @@ -151,17 +151,17 @@ BLIS_INLINE bool bli_gemm_md_is_crc( obj_t* a, obj_t* b, obj_t* c ) BLIS_INLINE 
void bli_gemm_md_ker_var2_recast ( - num_t* dt_comp, - num_t dt_a, - num_t dt_b, - num_t* dt_c, - dim_t* m, - dim_t* n, - dim_t* k, - inc_t* pd_a, inc_t* ps_a, - inc_t* pd_b, inc_t* ps_b, - obj_t* c, - inc_t* rs_c, inc_t* cs_c + num_t* dt_comp, + num_t dt_a, + num_t dt_b, + num_t* dt_c, + dim_t* m, + dim_t* n, + dim_t* k, + inc_t* pd_a, inc_t* ps_a, + inc_t* pd_b, inc_t* ps_b, + const obj_t* c, + inc_t* rs_c, inc_t* cs_c ) { if ( bli_is_real( *dt_c ) && diff --git a/frame/3/gemm/bli_gemm_md_c2r_ref.c b/frame/3/gemm/bli_gemm_md_c2r_ref.c index a4797ad4fd..086a3b1dff 100644 --- a/frame/3/gemm/bli_gemm_md_c2r_ref.c +++ b/frame/3/gemm/bli_gemm_md_c2r_ref.c @@ -49,8 +49,8 @@ void PASTEMAC2(ch,opname,suf) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index 888181bad6..d3109e6003 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -53,13 +53,13 @@ typedef struct \ void PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); GENPROT( gemm_blk_var1 ) diff --git a/frame/3/gemm/ind/bli_gemm_ind_opt.h b/frame/3/gemm/ind/bli_gemm_ind_opt.h index 52ea81a5e8..789d5895cf 100644 --- a/frame/3/gemm/ind/bli_gemm_ind_opt.h +++ b/frame/3/gemm/ind/bli_gemm_ind_opt.h @@ -34,16 +34,16 @@ BLIS_INLINE void bli_gemm_ind_recast_1m_params ( - num_t* dt_exec, - num_t* dt_c, - pack_t schema_a, - obj_t* c, - dim_t* m, - dim_t* n, - dim_t* k, - inc_t* pd_a, inc_t* ps_a, - inc_t* pd_b, inc_t* ps_b, - inc_t* rs_c, inc_t* cs_c + num_t* dt_exec, + num_t* dt_c, + pack_t schema_a, + const obj_t* c, 
+ dim_t* m, + dim_t* n, + dim_t* k, + inc_t* pd_a, inc_t* ps_a, + inc_t* pd_b, inc_t* ps_b, + inc_t* rs_c, inc_t* cs_c ) { obj_t beta; diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index d53838470a..e291b5f275 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -37,14 +37,14 @@ void bli_gemmt_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/gemmt/bli_gemmt_front.h b/frame/3/gemmt/bli_gemmt_front.h index c5967f8b8a..0f2a9ada2b 100644 --- a/frame/3/gemmt/bli_gemmt_front.h +++ b/frame/3/gemmt/bli_gemmt_front.h @@ -35,12 +35,12 @@ void bli_gemmt_front ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index 3aedc6e9a0..c6fc045b45 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -62,81 +62,74 @@ static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); void bli_gemmt_l_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffc = bli_obj_diag_offset( c ); - dim_t m = 
bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. 
- buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index b3a9fe8a1e..f64a84ef15 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -62,81 +62,74 @@ static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); void bli_gemmt_u_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffc = bli_obj_diag_offset( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffc = bli_obj_diag_offset( c ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - inc_t is_a = bli_obj_imag_stride( a ); - 
dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t is_b = bli_obj_imag_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const inc_t is_a = bli_obj_imag_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t is_b = bli_obj_imag_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffc, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, is_a, - pd_a, ps_a, - buf_b, rs_b, is_b, - pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffc, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, is_a, + pd_a, ps_a, + ( void* )buf_b, rs_b, is_b, + pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h index 60c68c9f59..98d8f55633 100644 --- a/frame/3/gemmt/bli_gemmt_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -43,13 +43,13 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* ah, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* ah, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); GENPROT( gemmt_x_ker_var2 ) diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 3a1d681c3b..76fe106b08 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -42,13 +42,13 @@ static l3_var_oft vars[2] = void bli_gemmt_x_ker_var2 ( - obj_t* a, - obj_t* ah, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* ah, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { dim_t uplo; diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index 15460125da..c39703503d 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -36,15 +36,15 @@ void bli_hemm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + 
const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index 308b6378bc..63eb91cd3a 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -34,13 +34,13 @@ void bli_hemm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index 8108b607fc..c9aada9893 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -36,15 +36,15 @@ void bli_symm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index 909997f6cd..417cb9acb2 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -34,13 +34,13 @@ void bli_symm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index d973b6eb6b..edd4ce1efb 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -37,13 +37,13 @@ void bli_trmm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + 
 const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index 3e136f9dc4..cfefdd39bc 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -34,11 +34,11 @@ void bli_trmm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 646287f931..9ea0db853e 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); void bli_trmm_ll_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffa = bli_obj_diag_offset( a ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffa = bli_obj_diag_offset( a ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = 
 bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
 - f( diagoffa, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffa, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index 9ef2a475de..ba91a58e90 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); void bli_trmm_lu_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffa = bli_obj_diag_offset( a ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffa = bli_obj_diag_offset( a ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + 
 const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
 - f( diagoffa, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffa, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index f6b20af2e5..89f86aa3a8 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); void bli_trmm_rl_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffb = bli_obj_diag_offset( b ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + 
const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffb, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index f71fb3c4d8..4ed38e7610 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -60,77 +60,70 @@ static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); void bli_trmm_ru_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); - - doff_t diagoffb = bli_obj_diag_offset( b ); + const num_t dt_exec = bli_obj_exec_dt( c ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + 
const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - obj_t scalar_a; - obj_t scalar_b; - - void* buf_alpha; - void* buf_beta; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Detach and multiply the scalars attached to A and B. + obj_t scalar_a, scalar_b; bli_obj_scalar_detach( a, &scalar_a ); bli_obj_scalar_detach( b, &scalar_b ); bli_mulsc( &scalar_a, &scalar_b ); // Grab the addresses of the internal scalar buffers for the scalar // merged above and the scalar attached to C. - buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); - buf_beta = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); + const void* buf_beta = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. 
- f( diagoffb, - schema_a, - schema_b, - m, - n, - k, - buf_alpha, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_beta, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_beta, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index 262b0490fd..2f0642ca8f 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -43,13 +43,13 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); //GENPROT( trmm_blk_var1 ) diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index 898cfe2423..d42bc88c2d 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -43,13 +43,13 @@ static l3_var_oft vars[2][2] = void bli_trmm_xx_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { dim_t side; diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 9cd04963b4..9681eb6406 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -36,15 +36,15 @@ void bli_trmm3_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + 
cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index 296b9354bd..b5dde34cd0 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ b/frame/3/trmm3/bli_trmm3_front.h @@ -34,13 +34,13 @@ void bli_trmm3_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 30bf6921cd..915fe3e59a 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -39,34 +39,35 @@ void bli_trsm_blk_var1 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - dim_t my_start, my_end; - dim_t b_alg; + obj_t ap, cp; + bli_obj_alias_to( a, &ap ); + bli_obj_alias_to( c, &cp ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( &ap, b, &cp, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_m( a, b, c, cntl ); + bli_l3_prune_unref_mparts_m( &ap, b, &cp, cntl ); // Isolate the diagonal block A11 and its corresponding row panel C1. - const dim_t kc = bli_obj_width_after_trans( a ); + const dim_t kc = bli_obj_width_after_trans( &ap ); obj_t a11, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - 0, kc, a, &a11 ); + 0, kc, &ap, &a11 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - 0, kc, c, &c1 ); + 0, kc, &cp, &c1 ); // All threads iterate over the entire diagonal block A11. 
- my_start = 0; my_end = kc; + dim_t my_start = 0, my_end = kc; #ifdef PRINT printf( "bli_trsm_blk_var1(): a11 is %d x %d at offsets (%3d, %3d)\n", @@ -76,14 +77,14 @@ void bli_trsm_blk_var1 #endif // Partition along the m dimension for the trsm subproblem. + dim_t b_alg; for ( dim_t i = my_start; i < my_end; i += b_alg ) { - obj_t a11_1, c1_1; - b_alg = bli_determine_blocksize( direct, i, my_end, &a11, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. + obj_t a11_1, c1_1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &a11, &a11_1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, @@ -124,9 +125,9 @@ void bli_trsm_blk_var1 // on whether we are moving forwards or backwards, respectively). obj_t ax1, cx1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A, - 0, kc, a, &ax1 ); + 0, kc, &ap, &ax1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1A, - 0, kc, c, &cx1 ); + 0, kc, &cp, &cx1 ); #ifdef PRINT printf( "bli_trsm_blk_var1(): ax1 is %d x %d at offsets (%3d, %3d)\n", @@ -149,13 +150,12 @@ void bli_trsm_blk_var1 // Partition along the m dimension for the gemm subproblem. for ( dim_t i = my_start; i < my_end; i += b_alg ) { - obj_t a11, c1; - // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, &ax1, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and C1. 
+ obj_t a11, c1; bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, i, b_alg, &ax1, &a11 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 5691c964ad..88db57e519 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -37,44 +37,47 @@ void bli_trsm_blk_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t b1, c1; - dim_t my_start, my_end; - dim_t b_alg; + obj_t bp, cp; + bli_obj_alias_to( b, &bp ); + bli_obj_alias_to( c, &cp ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( a, &bp, &cp, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_n( a, b, c, cntl ); + bli_l3_prune_unref_mparts_n( a, &bp, &cp, cntl ); // Determine the current thread's subpartition range. + dim_t my_start, my_end; bli_thread_range_ndim ( - direct, thread, a, b, c, cntl, cntx, + direct, thread, a, &bp, &cp, cntl, cntx, &my_start, &my_end ); // Partition along the n dimension. + dim_t b_alg; for ( dim_t i = my_start; i < my_end; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_determine_blocksize( direct, i, my_end, b, + b_alg = bli_determine_blocksize( direct, i, my_end, &bp, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for B1 and C1. + obj_t b1, c1; bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); + i, b_alg, &bp, &b1 ); bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, c, &c1 ); + i, b_alg, &cp, &c1 ); // Perform trsm subproblem. 
bli_l3_int diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 43fc25f16d..4f7bcb9ff9 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -36,39 +36,43 @@ void bli_trsm_blk_var3 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - obj_t a1, b1; - dim_t b_alg; + obj_t ap, bp, cs; + bli_obj_alias_to( a, &ap ); + bli_obj_alias_to( b, &bp ); + bli_obj_alias_to( c, &cs ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( a, b, c, cntl ); + dir_t direct = bli_l3_direct( &ap, &bp, c, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( a, b, c, cntl ); + bli_l3_prune_unref_mparts_k( &ap, &bp, c, cntl ); // Query dimension in partitioning direction. - dim_t k_trans = bli_obj_width_after_trans( a ); + dim_t k_trans = bli_obj_width_after_trans( &ap ); // Partition along the k dimension. + dim_t b_alg; for ( dim_t i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_trsm_determine_kc( direct, i, k_trans, a, b, + b_alg = bli_trsm_determine_kc( direct, i, k_trans, &ap, &bp, bli_cntl_bszid( cntl ), cntx ); // Acquire partitions for A1 and B1. + obj_t a1, b1; bli_acquire_mpart_ndim( direct, BLIS_SUBPART1, - i, b_alg, a, &a1 ); + i, b_alg, &ap, &a1 ); bli_acquire_mpart_mdim( direct, BLIS_SUBPART1, - i, b_alg, b, &b1 ); + i, b_alg, &bp, &b1 ); // Perform trsm subproblem. bli_l3_int @@ -92,8 +96,8 @@ void bli_trsm_blk_var3 // that they are only used in the first iteration. 
if ( i == 0 ) { - bli_obj_scalar_reset( a ); bli_obj_scalar_reset( b ); - bli_obj_scalar_reset( c ); + bli_obj_scalar_reset( &ap ); bli_obj_scalar_reset( &bp ); + bli_obj_scalar_reset( &cs ); } } } diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index 7f3d17aeff..b94a129d99 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -37,13 +37,13 @@ void bli_trsm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index 379935536a..b31e88b041 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -35,13 +35,13 @@ void bli_trsm_front ( - side_t side, - obj_t* alpha, - obj_t* a, - obj_t* b, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + side_t side, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); #ifdef BLIS_ENABLE_SMALL_MATRIX diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 7b1133c2a8..71abcca123 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -60,52 +60,47 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); void bli_trsm_ll_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - doff_t diagoffa = bli_obj_diag_offset( a ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t 
schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha1; - void* buf_alpha2; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar - // attached to B (the non-triangular matrix). This will be the alpha + // attached to A (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would - // be applied to the packed copy of B prior to it being updated by + // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. 
- buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -113,27 +108,27 @@ void bli_trsm_ll_ker_var2 // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. - buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffa, - schema_a, - schema_b, - m, - n, - k, - buf_alpha1, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_alpha2, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha1, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_alpha2, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 2059d1c9f2..46ee8f4399 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -60,52 +60,47 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); void bli_trsm_lu_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - doff_t diagoffa = bli_obj_diag_offset( a ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a = 
bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha1; - void* buf_alpha2; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar - // attached to B (the non-triangular matrix). This will be the alpha + // attached to A (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would - // be applied to the packed copy of B prior to it being updated by + // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. 
- buf_alpha1 = bli_obj_internal_scalar_buffer( b ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -113,27 +108,27 @@ void bli_trsm_lu_ker_var2 // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. - buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffa, - schema_a, - schema_b, - m, - n, - k, - buf_alpha1, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_alpha2, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha1, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_alpha2, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index cace3622a1..721203df72 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); void bli_trsm_rl_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - doff_t diagoffb = bli_obj_diag_offset( b ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a = 
bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a ); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha1; - void* buf_alpha2; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha @@ -105,7 +100,7 @@ void bli_trsm_rl_ker_var2 // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. - buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -113,27 +108,27 @@ void bli_trsm_rl_ker_var2 // the diagonal. 
We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. - buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffb, - schema_a, - schema_b, - m, - n, - k, - buf_alpha1, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_alpha2, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha1, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_alpha2, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 4b0c7f083a..447fbf8cd5 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -60,44 +60,39 @@ static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); void bli_trsm_ru_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { - num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt_exec = bli_obj_exec_dt( c ); - doff_t diagoffb = bli_obj_diag_offset( b ); + const doff_t diagoffb = bli_obj_diag_offset( b ); - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); + const pack_t schema_a = bli_obj_pack_schema( a ); + const pack_t schema_b = bli_obj_pack_schema( b ); - dim_t m = bli_obj_length( c ); - dim_t n = bli_obj_width( c ); - dim_t k = bli_obj_width( a ); + const dim_t m = bli_obj_length( c ); + const dim_t n = bli_obj_width( c ); + const dim_t k = bli_obj_width( a ); - void* buf_a = bli_obj_buffer_at_off( a 
); - inc_t cs_a = bli_obj_col_stride( a ); - dim_t pd_a = bli_obj_panel_dim( a ); - inc_t ps_a = bli_obj_panel_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t cs_a = bli_obj_col_stride( a ); + const dim_t pd_a = bli_obj_panel_dim( a ); + const inc_t ps_a = bli_obj_panel_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - dim_t pd_b = bli_obj_panel_dim( b ); - inc_t ps_b = bli_obj_panel_stride( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const dim_t pd_b = bli_obj_panel_dim( b ); + const inc_t ps_b = bli_obj_panel_stride( b ); - void* buf_c = bli_obj_buffer_at_off( c ); - inc_t rs_c = bli_obj_row_stride( c ); - inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha1; - void* buf_alpha2; - - FUNCPTR_T f; + void* buf_c = bli_obj_buffer_at_off( c ); + const inc_t rs_c = bli_obj_row_stride( c ); + const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar // attached to A (the non-triangular matrix). This will be the alpha @@ -105,7 +100,7 @@ void bli_trsm_ru_ker_var2 // be applied to the packed copy of A prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. - buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -113,27 +108,27 @@ void bli_trsm_ru_ker_var2 // the diagonal. We need this separate scalar because it's possible // that the alpha attached to B was reset, if it was applied during // packing. - buf_alpha2 = bli_obj_internal_scalar_buffer( c ); + const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); // Index into the type combination array to extract the correct // function pointer. 
- f = ftypes[dt_exec]; - - // Invoke the function. - f( diagoffb, - schema_a, - schema_b, - m, - n, - k, - buf_alpha1, - buf_a, cs_a, pd_a, ps_a, - buf_b, rs_b, pd_b, ps_b, - buf_alpha2, - buf_c, rs_c, cs_c, - cntx, - rntm, - thread ); + ftypes[dt_exec] + ( + diagoffb, + schema_a, + schema_b, + m, + n, + k, + ( void* )buf_alpha1, + ( void* )buf_a, cs_a, pd_a, ps_a, + ( void* )buf_b, rs_b, pd_b, ps_b, + ( void* )buf_alpha2, + buf_c, rs_c, cs_c, + ( cntx_t* )cntx, + rntm, + thread + ); } diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 8322a8b5b6..7e747b4a88 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -43,13 +43,13 @@ \ void PASTEMAC0(opname) \ ( \ - obj_t* a, \ - obj_t* b, \ - obj_t* c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - thrinfo_t* thread \ + const obj_t* a, \ + const obj_t* b, \ + const obj_t* c, \ + const cntx_t* cntx, \ + rntm_t* rntm, \ + cntl_t* cntl, \ + thrinfo_t* thread \ ); GENPROT( trsm_blk_var1 ) diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index c30a5828a3..a0a59c0a85 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -43,13 +43,13 @@ static l3_var_oft vars[2][2] = void bli_trsm_xx_ker_var2 ( - obj_t* a, - obj_t* b, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ) { dim_t side; diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c index e2d8123511..a42c7103e5 100644 --- a/frame/base/bli_apool.c +++ b/frame/base/bli_apool.c @@ -36,7 +36,7 @@ void bli_apool_init ( - apool_t* restrict apool + apool_t* apool ) { err_t r_val; @@ -47,7 +47,7 @@ void bli_apool_init // library initialization. // Query the mutex from the apool_t. 
- //bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); + //bli_pthread_mutex_t* mutex = bli_apool_mutex( apool ); // Initialize the mutex. //*mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; @@ -76,7 +76,7 @@ void bli_apool_init const siz_t align_size = 64; // Query the underlying pool_t from the apool_t. - pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // Set the default array_t length of the apool_t. bli_apool_set_def_array_len( num_elem, apool ); @@ -92,7 +92,7 @@ void bli_apool_init #endif // Allocate the block_ptrs array. - array_t** restrict block_ptrs + array_t** block_ptrs = bli_malloc_intl( block_ptrs_len * sizeof( array_t* ), &r_val ); @@ -139,8 +139,8 @@ void bli_apool_init void bli_apool_alloc_block ( - siz_t num_elem, - array_t** restrict array_p + siz_t num_elem, + array_t** array_p ) { err_t r_val; @@ -156,9 +156,7 @@ void bli_apool_alloc_block // Allocate the array_t via the bli_fmalloc_align() wrapper, which performs // alignment logic and opaquely saves the original pointer so that it can // be recovered when it's time to free the block. - array_t* restrict array - = - bli_malloc_intl( block_size, &r_val ); + array_t* array = bli_malloc_intl( block_size, &r_val ); // Initialize an array_t struct within the newly allocated memory region. bli_array_init( num_elem, sizeof( pool_t* ), array ); @@ -169,16 +167,16 @@ void bli_apool_alloc_block void bli_apool_free_block ( - array_t* restrict array + array_t* array ) { - const siz_t num_elem = bli_array_num_elem( array ); - pool_t** restrict buf = bli_array_buf( array ); + const siz_t num_elem = bli_array_num_elem( array ); + pool_t** buf = bli_array_buf( array ); // Step through the array and finalize each pool_t. 
for ( dim_t i = 0; i < num_elem; ++i ) { - pool_t* restrict pool = buf[ i ]; + pool_t* pool = buf[ i ]; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_free_block(): freeing pool_t %d within array_t.\n", @@ -218,25 +216,25 @@ void bli_apool_free_block void bli_apool_finalize ( - apool_t* restrict apool + apool_t* apool ) { // NOTE: Since the apool_t's mutex is now initialized statically, we no // longer need to explicitly destroy it. // Query the mutex from the apool_t. - //bli_pthread_mutex_t* restrict mutex = bli_apool_mutex( apool ); + //bli_pthread_mutex_t* mutex = bli_apool_mutex( apool ); // Destroy the mutex. //bli_pthread_mutex_destroy( mutex ); // Query the underlying pool_t and mutex from the apool_t. - pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // ---------------------------------------------------------------- // Query the block_ptrs array. - array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + array_t** block_ptrs = bli_pool_block_ptrs( pool ); // Query the total number of blocks currently allocated. siz_t num_blocks = bli_pool_num_blocks( pool ); @@ -270,8 +268,8 @@ void bli_apool_finalize array_t* bli_apool_checkout_array ( - siz_t n_threads, - apool_t* restrict apool + siz_t n_threads, + apool_t* apool ) { // Acquire the apool_t's mutex. @@ -298,10 +296,10 @@ array_t* bli_apool_checkout_array // At this point, at least one array_t is guaranteed to be available. // Query the underlying pool_t from the apool_t. - pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // Query the block_ptrs array. - array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + array_t** block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -313,7 +311,7 @@ array_t* bli_apool_checkout_array #endif // Select the array_t* at top_index to return to the caller. 
- array_t* restrict array = block_ptrs[ top_index ]; + array_t* array = block_ptrs[ top_index ]; // Increment the pool's top_index. bli_pool_set_top_index( top_index + 1, pool ); @@ -333,15 +331,15 @@ array_t* bli_apool_checkout_array void bli_apool_checkin_array ( - array_t* restrict array, - apool_t* restrict apool + array_t* array, + apool_t* apool ) { // Acquire the apool_t's mutex. bli_apool_lock( apool ); // Query the underlying pool_t from the apool_t. - pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // ---------------------------------------------------------------------------- @@ -351,7 +349,7 @@ void bli_apool_checkin_array // change. // Query the block_ptrs array. - array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + array_t** block_ptrs = bli_pool_block_ptrs( pool ); // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); @@ -376,8 +374,8 @@ void bli_apool_checkin_array pool_t* bli_apool_array_elem ( - siz_t index, - array_t* restrict array + siz_t index, + array_t* array ) { err_t r_val; @@ -391,8 +389,8 @@ pool_t* bli_apool_array_elem // stores in the array_t are pool_t*, that means that the function is // actually returning the address of a pool_t*, or pool_t**, hence the // dereferencing below. - pool_t** restrict pool_p = bli_array_elem( index, array ); - pool_t* pool = *pool_p; + pool_t** pool_p = bli_array_elem( index, array ); + pool_t* pool = *pool_p; // If the element is NULL, then it means a pool_t has not yet been created // and allocated for the given index (thread id). @@ -463,8 +461,8 @@ pool_t* bli_apool_array_elem void bli_apool_grow ( - siz_t num_blocks_add, - apool_t* restrict apool + siz_t num_blocks_add, + apool_t* apool ) { err_t r_val; @@ -473,7 +471,7 @@ void bli_apool_grow if ( num_blocks_add == 0 ) return; // Query the underlying pool_t from the apool_t. 
- pool_t* restrict pool = bli_apool_pool( apool ); + pool_t* pool = bli_apool_pool( apool ); // Query the default initial array length from the apool_t. const siz_t num_elem = bli_apool_def_array_len( apool ); @@ -499,7 +497,7 @@ void bli_apool_grow const siz_t block_ptrs_len_new = 2 * block_ptrs_len_cur; // Query the current block_ptrs array. - array_t** restrict block_ptrs_cur = bli_pool_block_ptrs( pool ); + array_t** block_ptrs_cur = bli_pool_block_ptrs( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_grow(): growing block_ptrs_len (%d -> %d): ", @@ -507,7 +505,7 @@ void bli_apool_grow #endif // Allocate a new block_ptrs array. - array_t** restrict block_ptrs_new + array_t** block_ptrs_new = bli_malloc_intl( block_ptrs_len_new * sizeof( array_t* ), &r_val ); @@ -541,7 +539,7 @@ void bli_apool_grow // blocks. // Query the current block_ptrs array (which was maybe just resized). - array_t** restrict block_ptrs = bli_pool_block_ptrs( pool ); + array_t** block_ptrs = bli_pool_block_ptrs( pool ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_grow(): growing apool_t (%d -> %d).\n", diff --git a/frame/base/bli_apool.h b/frame/base/bli_apool.h index e7ea722d6f..d06f79207b 100644 --- a/frame/base/bli_apool.h +++ b/frame/base/bli_apool.h @@ -94,44 +94,44 @@ BLIS_INLINE void bli_apool_set_def_array_len( siz_t def_array_len, apool_t* pool void bli_apool_init ( - apool_t* restrict apool + apool_t* apool ); void bli_apool_finalize ( - apool_t* restrict apool + apool_t* apool ); array_t* bli_apool_checkout_array ( - siz_t n_threads, - apool_t* restrict apool + siz_t n_threads, + apool_t* apool ); void bli_apool_checkin_array ( - array_t* restrict array, - apool_t* restrict apool + array_t* array, + apool_t* apool ); pool_t* bli_apool_array_elem ( - siz_t index, - array_t* restrict array + siz_t index, + array_t* array ); void bli_apool_grow ( - siz_t num_blocks_add, - apool_t* restrict apool + siz_t num_blocks_add, + apool_t* apool ); void bli_apool_alloc_block 
( - siz_t num_elem, - array_t** restrict array_p + siz_t num_elem, + array_t** array_p ); void bli_apool_free_block ( - array_t* restrict array + array_t* array ); diff --git a/frame/base/bli_array.c b/frame/base/bli_array.c index ae46eb4e17..ea47a0024c 100644 --- a/frame/base/bli_array.c +++ b/frame/base/bli_array.c @@ -146,7 +146,7 @@ void bli_array_finalize void* bli_array_elem ( - siz_t index, + siz_t index, const array_t* array ) { diff --git a/frame/base/bli_array.h b/frame/base/bli_array.h index d05801f27f..c1e6ce038a 100644 --- a/frame/base/bli_array.h +++ b/frame/base/bli_array.h @@ -103,7 +103,7 @@ void bli_array_finalize void* bli_array_elem ( - siz_t index, + siz_t index, const array_t* array ); void bli_array_set_elem diff --git a/frame/base/bli_auxinfo.h b/frame/base/bli_auxinfo.h index e1e34c8816..f97ecf951d 100644 --- a/frame/base/bli_auxinfo.h +++ b/frame/base/bli_auxinfo.h @@ -47,11 +47,11 @@ BLIS_INLINE pack_t bli_auxinfo_schema_b( const auxinfo_t* ai ) return ai->schema_b; } -BLIS_INLINE void* bli_auxinfo_next_a( const auxinfo_t* ai ) +BLIS_INLINE const void* bli_auxinfo_next_a( const auxinfo_t* ai ) { return ai->a_next; } -BLIS_INLINE void* bli_auxinfo_next_b( const auxinfo_t* ai ) +BLIS_INLINE const void* bli_auxinfo_next_b( const auxinfo_t* ai ) { return ai->b_next; } @@ -78,7 +78,7 @@ BLIS_INLINE void_fp bli_auxinfo_ukr( const auxinfo_t* ai ) { return ai->ukr; } -BLIS_INLINE void* bli_auxinfo_params( const auxinfo_t* ai ) +BLIS_INLINE const void* bli_auxinfo_params( const auxinfo_t* ai ) { return ai->params; } @@ -95,15 +95,15 @@ BLIS_INLINE void bli_auxinfo_set_schema_b( pack_t schema, auxinfo_t* ai ) ai->schema_b = schema; } -BLIS_INLINE void bli_auxinfo_set_next_a( void* p, auxinfo_t* ai ) +BLIS_INLINE void bli_auxinfo_set_next_a( const void* p, auxinfo_t* ai ) { ai->a_next = p; } -BLIS_INLINE void bli_auxinfo_set_next_b( void* p, auxinfo_t* ai ) +BLIS_INLINE void bli_auxinfo_set_next_b( const void* p, auxinfo_t* ai ) { ai->b_next = p; } 
-BLIS_INLINE void bli_auxinfo_set_next_ab( void* ap, void* bp, auxinfo_t* ai ) +BLIS_INLINE void bli_auxinfo_set_next_ab( const void* ap, const void* bp, auxinfo_t* ai ) { ai->a_next = ap; ai->b_next = bp; @@ -131,7 +131,7 @@ BLIS_INLINE void bli_auxinfo_set_ukr( void_fp ukr, auxinfo_t* ai ) { ai->ukr = ukr; } -BLIS_INLINE void bli_auxinfo_set_params( void* params, auxinfo_t* ai ) +BLIS_INLINE void bli_auxinfo_set_params( const void* params, auxinfo_t* ai ) { ai->params = params; } diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 8168bc2656..3d164a7cf5 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -235,11 +235,11 @@ void bli_blksz_reduce_max_to dim_t bli_determine_blocksize ( - dir_t direct, - dim_t i, - dim_t dim, + dir_t direct, + dim_t i, + dim_t dim, const obj_t* obj, - bszid_t bszid, + bszid_t bszid, const cntx_t* cntx ) { @@ -251,10 +251,10 @@ dim_t bli_determine_blocksize dim_t bli_determine_blocksize_f ( - dim_t i, - dim_t dim, + dim_t i, + dim_t dim, const obj_t* obj, - bszid_t bszid, + bszid_t bszid, const cntx_t* cntx ) { @@ -277,10 +277,10 @@ dim_t bli_determine_blocksize_f dim_t bli_determine_blocksize_b ( - dim_t i, - dim_t dim, + dim_t i, + dim_t dim, const obj_t* obj, - bszid_t bszid, + bszid_t bszid, const cntx_t* cntx ) { diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index 63864a186b..d91c0542d8 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -36,7 +36,7 @@ BLIS_INLINE dim_t bli_blksz_get_def ( - num_t dt, + num_t dt, const blksz_t* b ) { @@ -45,7 +45,7 @@ BLIS_INLINE dim_t bli_blksz_get_def BLIS_INLINE dim_t bli_blksz_get_max ( - num_t dt, + num_t dt, const blksz_t* b ) { @@ -252,29 +252,29 @@ void bli_blksz_reduce_max_to dim_t bli_determine_blocksize ( - dir_t direct, - dim_t i, - dim_t dim, + dir_t direct, + dim_t i, + dim_t dim, const obj_t* obj, - bszid_t bszid, + bszid_t bszid, const cntx_t* cntx ); dim_t bli_determine_blocksize_f ( - dim_t i, - dim_t dim, + dim_t i, 
+ dim_t dim, const obj_t* obj, - bszid_t bszid, + bszid_t bszid, const cntx_t* cntx ); dim_t bli_determine_blocksize_b ( - dim_t i, - dim_t dim, + dim_t i, + dim_t dim, const obj_t* obj, - bszid_t bszid, + bszid_t bszid, const cntx_t* cntx ); diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 3af16a7c51..90050a5ed9 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -245,7 +245,7 @@ BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, const cnt return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx ); } -BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_prefers_storage_of( const obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) { const bool ukr_prefers_rows = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); @@ -256,7 +256,7 @@ BLIS_INLINE bool bli_cntx_prefers_storage_of( obj_t* obj, ukr_t ukr_id, const cn return FALSE; } -BLIS_INLINE bool bli_cntx_dislikes_storage_of( obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) +BLIS_INLINE bool bli_cntx_dislikes_storage_of( const obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) { return ! 
bli_cntx_prefers_storage_of( obj, ukr_id, cntx ); } @@ -286,7 +286,7 @@ BLIS_INLINE void bli_cntx_set_blksz_max_dt( num_t dt, bszid_t bs_id, dim_t bs, c bli_blksz_set_max( bs, dt, &cntx->blkszs[ bs_id ]); } -BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, func_t* func, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_ukr( ukr_t ukr_id, const func_t* func, cntx_t* cntx ) { cntx->ukrs[ ukr_id ] = *func; } diff --git a/frame/base/bli_func.c b/frame/base/bli_func.c index 7b462cd850..7cb7aac6d2 100644 --- a/frame/base/bli_func.c +++ b/frame/base/bli_func.c @@ -93,7 +93,7 @@ void bli_func_free( func_t* f ) // ----------------------------------------------------------------------------- -bool bli_func_is_null_dt( num_t dt, +bool bli_func_is_null_dt( num_t dt, const func_t* f ) { return ( bli_func_get_dt( dt, f ) == NULL ); diff --git a/frame/base/bli_func.h b/frame/base/bli_func.h index 9094d56f86..71e179d76f 100644 --- a/frame/base/bli_func.h +++ b/frame/base/bli_func.h @@ -38,7 +38,7 @@ BLIS_INLINE void_fp bli_func_get_dt ( - num_t dt, + num_t dt, const func_t* func ) { @@ -96,7 +96,7 @@ void bli_func_free( func_t* f ); // ----------------------------------------------------------------------------- -bool bli_func_is_null_dt( num_t dt, +bool bli_func_is_null_dt( num_t dt, const func_t* f ); bool bli_func_is_null( const func_t* f ); diff --git a/frame/base/bli_getopt.c b/frame/base/bli_getopt.c index 184439db59..e1d90d3234 100644 --- a/frame/base/bli_getopt.c +++ b/frame/base/bli_getopt.c @@ -45,12 +45,12 @@ void bli_getopt_init_state( int opterr, getopt_t* state ) state->optopt = 0; } -int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ) +int bli_getopt( int argc, const char* const * argv, const char* optstring, getopt_t* state ) { - static char* nextchar = NULL; + static const char* nextchar = NULL; - char* elem_str; - char* optstr_char; + const char* elem_str; + const char* optstr_char; // If argv contains no more arguments to process, 
return. if ( state->optind == argc ) return -1; diff --git a/frame/base/bli_getopt.h b/frame/base/bli_getopt.h index 1b5a7a002e..bb0e4f2cf1 100644 --- a/frame/base/bli_getopt.h +++ b/frame/base/bli_getopt.h @@ -34,13 +34,13 @@ typedef struct getopt_s { - char* optarg; - int optind; - int opterr; - int optopt; + const char* optarg; + int optind; + int opterr; + int optopt; } getopt_t; BLIS_EXPORT_BLIS void bli_getopt_init_state( int opterr, getopt_t* state ); -BLIS_EXPORT_BLIS int bli_getopt( int argc, char** const argv, const char* optstring, getopt_t* state ); +BLIS_EXPORT_BLIS int bli_getopt( int argc, const char* const * argv, const char* optstring, getopt_t* state ); diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index f932650cb1..ff4578bc5f 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -316,8 +316,8 @@ const cntx_t* bli_gks_lookup_ind_cntx // Index into the array of context pointers for the given architecture id, // and then index into the subarray for the given induced method. - cntx_t** restrict gks_id = gks[ id ]; - cntx_t* restrict gks_id_ind = gks_id[ ind ]; + cntx_t** gks_id = gks[ id ]; + cntx_t* gks_id_ind = gks_id[ ind ]; // Return the context pointer at gks_id_ind. return gks_id_ind; @@ -405,7 +405,7 @@ void bli_gks_register_cntx gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS, &r_val ); // Alias the allocated array for readability. - cntx_t** restrict gks_id = gks[ id ]; + cntx_t** gks_id = gks[ id ]; #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_gks_register_cntx(): " ); @@ -417,7 +417,7 @@ void bli_gks_register_cntx gks_id[ BLIS_NAT ] = bli_calloc_intl( sizeof( cntx_t ), &r_val ); // Alias the allocated context address for readability. - cntx_t* restrict gks_id_nat = gks_id[ BLIS_NAT ]; + cntx_t* gks_id_nat = gks_id[ BLIS_NAT ]; // Call the context initialization function on the element of the newly // allocated array corresponding to native execution. 
diff --git a/frame/base/bli_mem.h b/frame/base/bli_mem.h index d61e970214..c255114865 100644 --- a/frame/base/bli_mem.h +++ b/frame/base/bli_mem.h @@ -66,33 +66,33 @@ BLIS_INLINE pblk_t* bli_mem_pblk( mem_t* mem ) return &(mem->pblk); } -BLIS_INLINE void* bli_mem_buffer( mem_t* mem ) +BLIS_INLINE void* bli_mem_buffer( const mem_t* mem ) { - return bli_pblk_buf( bli_mem_pblk( mem ) ); + return bli_pblk_buf( bli_mem_pblk( ( mem_t* )mem ) ); } -BLIS_INLINE packbuf_t bli_mem_buf_type( mem_t* mem ) +BLIS_INLINE packbuf_t bli_mem_buf_type( const mem_t* mem ) { return mem->buf_type; } -BLIS_INLINE pool_t* bli_mem_pool( mem_t* mem ) +BLIS_INLINE pool_t* bli_mem_pool( const mem_t* mem ) { return mem->pool; } -BLIS_INLINE siz_t bli_mem_size( mem_t* mem ) +BLIS_INLINE siz_t bli_mem_size( const mem_t* mem ) { return mem->size; } -BLIS_INLINE bool bli_mem_is_alloc( mem_t* mem ) +BLIS_INLINE bool bli_mem_is_alloc( const mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) != NULL ); } -BLIS_INLINE bool bli_mem_is_unalloc( mem_t* mem ) +BLIS_INLINE bool bli_mem_is_unalloc( const mem_t* mem ) { return ( bool ) ( bli_mem_buffer( mem ) == NULL ); @@ -160,4 +160,4 @@ BLIS_INLINE void bli_mem_clear( mem_t* mem ) } -#endif +#endif diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index f7946b90e6..cd0b6ac985 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -600,8 +600,8 @@ dim_t bli_align_dim_to_size dim_t bli_align_ptr_to_size ( - const void* p, - size_t align_size + const void* p, + size_t align_size ) { dim_t dim; diff --git a/frame/base/bli_obj.h b/frame/base/bli_obj.h index d806563fdd..a446c09c81 100644 --- a/frame/base/bli_obj.h +++ b/frame/base/bli_obj.h @@ -139,8 +139,8 @@ BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size BLIS_EXPORT_BLIS dim_t bli_align_ptr_to_size ( - const void* p, - size_t align_size + const void* p, + size_t align_size ); BLIS_EXPORT_BLIS void bli_obj_print diff --git a/frame/base/bli_obj_scalar.c b/frame/base/bli_obj_scalar.c index 
8efe22d7d4..5c6ef8f94a 100644 --- a/frame/base/bli_obj_scalar.c +++ b/frame/base/bli_obj_scalar.c @@ -59,8 +59,8 @@ void bli_obj_scalar_init_detached void bli_obj_scalar_init_detached_copy_of ( - num_t dt, - conj_t conj, + num_t dt, + conj_t conj, const obj_t* alpha, obj_t* beta ) @@ -103,9 +103,9 @@ void bli_obj_scalar_detach void bli_obj_scalar_attach ( - conj_t conj, - obj_t* alpha, - obj_t* a + conj_t conj, + const obj_t* alpha, + obj_t* a ) { obj_t alpha_cast; diff --git a/frame/base/bli_obj_scalar.h b/frame/base/bli_obj_scalar.h index 753707f043..23bf573c67 100644 --- a/frame/base/bli_obj_scalar.h +++ b/frame/base/bli_obj_scalar.h @@ -40,8 +40,8 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached BLIS_EXPORT_BLIS void bli_obj_scalar_init_detached_copy_of ( - num_t dt, - conj_t conj, + num_t dt, + conj_t conj, const obj_t* alpha, obj_t* beta ); @@ -54,9 +54,9 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_detach BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( - conj_t conj, - obj_t* alpha, - obj_t* a + conj_t conj, + const obj_t* alpha, + obj_t* a ); BLIS_EXPORT_BLIS void bli_obj_scalar_cast_to diff --git a/frame/base/bli_part.c b/frame/base/bli_part.c index accab54e43..a1c8b534e3 100644 --- a/frame/base/bli_part.c +++ b/frame/base/bli_part.c @@ -40,10 +40,10 @@ void bli_acquire_mpart ( - dim_t i, - dim_t j, - dim_t bm, - dim_t bn, + dim_t i, + dim_t j, + dim_t bm, + dim_t bn, const obj_t* parent, obj_t* child ) @@ -83,11 +83,11 @@ void bli_acquire_mpart void bli_acquire_mpart_t2b ( - subpart_t req_part, - dim_t i, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mdim( BLIS_FWD, req_part, i, b, obj, sub_obj ); @@ -96,11 +96,11 @@ void bli_acquire_mpart_t2b void bli_acquire_mpart_b2t ( - subpart_t req_part, - dim_t i, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { 
bli_acquire_mpart_mdim( BLIS_BWD, req_part, i, b, obj, sub_obj ); @@ -109,12 +109,12 @@ void bli_acquire_mpart_b2t void bli_acquire_mpart_mdim ( - dir_t direct, - subpart_t req_part, - dim_t i, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -307,11 +307,11 @@ void bli_acquire_mpart_mdim void bli_acquire_mpart_l2r ( - subpart_t req_part, - dim_t j, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_ndim( BLIS_FWD, req_part, j, b, obj, sub_obj ); @@ -320,11 +320,11 @@ void bli_acquire_mpart_l2r void bli_acquire_mpart_r2l ( - subpart_t req_part, - dim_t j, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_ndim( BLIS_BWD, req_part, j, b, obj, sub_obj ); @@ -333,12 +333,12 @@ void bli_acquire_mpart_r2l void bli_acquire_mpart_ndim ( - dir_t direct, - subpart_t req_part, - dim_t j, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -530,11 +530,11 @@ void bli_acquire_mpart_ndim void bli_acquire_mpart_tl2br ( - subpart_t req_part, - dim_t i, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mndim( BLIS_FWD, req_part, i, b, obj, sub_obj ); @@ -543,11 +543,11 @@ void bli_acquire_mpart_tl2br void bli_acquire_mpart_br2tl ( - subpart_t req_part, - dim_t j, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t j, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { bli_acquire_mpart_mndim( BLIS_BWD, req_part, j, b, obj, sub_obj ); @@ -556,12 +556,12 @@ void bli_acquire_mpart_br2tl void bli_acquire_mpart_mndim ( - dir_t direct, - subpart_t 
req_part, - dim_t ij, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + dir_t direct, + subpart_t req_part, + dim_t ij, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { dim_t m; @@ -798,11 +798,11 @@ void bli_acquire_mpart_mndim void bli_acquire_vpart_f2b ( - subpart_t req_part, - dim_t i, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) @@ -814,11 +814,11 @@ void bli_acquire_vpart_f2b void bli_acquire_vpart_b2f ( - subpart_t req_part, - dim_t i, - dim_t b, - const obj_t* obj, - obj_t* sub_obj + subpart_t req_part, + dim_t i, + dim_t b, + const obj_t* obj, + obj_t* sub_obj ) { if ( bli_obj_is_col_vector( obj ) ) @@ -833,8 +833,8 @@ void bli_acquire_vpart_b2f void bli_acquire_mij ( - dim_t i, - dim_t j, + dim_t i, + dim_t j, const obj_t* obj, obj_t* sub_obj ) @@ -848,7 +848,7 @@ void bli_acquire_mij void bli_acquire_vi ( - dim_t i, + dim_t i, const obj_t* obj, obj_t* sub_obj ) diff --git a/frame/base/bli_part.h b/frame/base/bli_part.h index 971887e787..6d3e00ced5 100644 --- a/frame/base/bli_part.h +++ b/frame/base/bli_part.h @@ -38,10 +38,10 @@ BLIS_EXPORT_BLIS void bli_acquire_mpart ( - dim_t i, - dim_t j, - dim_t m, - dim_t n, + dim_t i, + dim_t j, + dim_t m, + dim_t n, const obj_t* obj, obj_t* sub_obj ); @@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_acquire_mpart \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - const obj_t* obj, \ - obj_t* sub_obj \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); GENPROT( acquire_mpart_t2b ) @@ -71,12 +71,12 @@ GENPROT( acquire_mpart_br2tl ) \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - dir_t direct, \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - const obj_t* obj, \ - obj_t* sub_obj \ + dir_t direct, \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); 
GENPROT( acquire_mpart_mdim ) @@ -91,11 +91,11 @@ GENPROT( acquire_mpart_mndim ) \ BLIS_EXPORT_BLIS void PASTEMAC0( opname ) \ ( \ - subpart_t req_part, \ - dim_t i, \ - dim_t b, \ - const obj_t* obj, \ - obj_t* sub_obj \ + subpart_t req_part, \ + dim_t i, \ + dim_t b, \ + const obj_t* obj, \ + obj_t* sub_obj \ ); GENPROT( acquire_vpart_f2b ) @@ -105,15 +105,15 @@ GENPROT( acquire_vpart_b2f ) BLIS_EXPORT_BLIS void bli_acquire_mij ( - dim_t i, - dim_t j, + dim_t i, + dim_t j, const obj_t* obj, obj_t* sub_obj ); BLIS_EXPORT_BLIS void bli_acquire_vi ( - dim_t i, + dim_t i, const obj_t* obj, obj_t* sub_obj ); diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index bb62c18a74..0a8497d186 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -77,7 +77,7 @@ void bli_pba_finalize void ) { - pba_t* restrict pba = bli_pba_query(); + pba_t* pba = bli_pba_query(); #ifdef BLIS_ENABLE_PBA_POOLS bli_pba_finalize_pools( pba ); @@ -284,8 +284,8 @@ void bli_pba_acquire_v siz_t bli_pba_pool_size ( - const pba_t* pba, - packbuf_t buf_type + const pba_t* pba, + packbuf_t buf_type ) { siz_t r_val; @@ -320,7 +320,7 @@ siz_t bli_pba_pool_size void bli_pba_init_pools ( const cntx_t* cntx, - pba_t* pba + pba_t* pba ) { // Map each of the packbuf_t values to an index starting at zero. 
@@ -402,9 +402,9 @@ void bli_pba_finalize_pools void bli_pba_compute_pool_block_sizes ( - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, const cntx_t* cntx ) { @@ -449,10 +449,10 @@ void bli_pba_compute_pool_block_sizes void bli_pba_compute_pool_block_sizes_dt ( - num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, const cntx_t* cntx ) { diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index bd56f9fc60..5cd95c2d44 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -156,8 +156,8 @@ BLIS_INLINE void bli_pba_rntm_set_pba siz_t bli_pba_pool_size ( - const pba_t* pba, - packbuf_t buf_type + const pba_t* pba, + packbuf_t buf_type ); // ---------------------------------------------------------------------------- @@ -165,7 +165,7 @@ siz_t bli_pba_pool_size void bli_pba_init_pools ( const cntx_t* cntx, - pba_t* pba + pba_t* pba ); void bli_pba_finalize_pools ( @@ -174,17 +174,17 @@ void bli_pba_finalize_pools void bli_pba_compute_pool_block_sizes ( - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, const cntx_t* cntx ); void bli_pba_compute_pool_block_sizes_dt ( - num_t dt, - siz_t* bs_a, - siz_t* bs_b, - siz_t* bs_c, + num_t dt, + siz_t* bs_a, + siz_t* bs_b, + siz_t* bs_c, const cntx_t* cntx ); diff --git a/frame/base/bli_rntm.c b/frame/base/bli_rntm.c index 732dbb00d0..2c13c74a22 100644 --- a/frame/base/bli_rntm.c +++ b/frame/base/bli_rntm.c @@ -433,8 +433,8 @@ void bli_rntm_print dim_t bli_rntm_calc_num_threads_in ( - const bszid_t* restrict bszid_cur, - const rntm_t* restrict rntm + const bszid_t* bszid_cur, + const rntm_t* rntm ) { /* // bp algorithm: diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index e10c541542..2a39f8894c 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -334,7 +334,7 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) BLIS_INLINE dim_t bli_rntm_calc_num_threads ( 
- const rntm_t* restrict rntm + const rntm_t* rntm ) { dim_t n_threads; @@ -387,8 +387,8 @@ void bli_rntm_print dim_t bli_rntm_calc_num_threads_in ( - const bszid_t* restrict bszid_cur, - const rntm_t* restrict rntm + const bszid_t* bszid_cur, + const rntm_t* rntm ); #endif diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 5b6ff6a0f0..776622bb4a 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -57,8 +57,8 @@ void bli_sba_finalize( void ) void* bli_sba_acquire ( - rntm_t* restrict rntm, - siz_t req_size + rntm_t* rntm, + siz_t req_size ) { void* block; @@ -74,7 +74,7 @@ void* bli_sba_acquire pblk_t pblk; // Query the small block pool from the rntm. - pool_t* restrict pool = bli_rntm_sba_pool( rntm ); + pool_t* pool = bli_rntm_sba_pool( rntm ); // We don't expect NULL sba_pool pointers in the normal course of BLIS // operation. However, there are rare instances where it is convenient @@ -122,8 +122,8 @@ void* bli_sba_acquire void bli_sba_release ( - rntm_t* restrict rntm, - void* restrict block + rntm_t* rntm, + void* block ) { #ifdef BLIS_ENABLE_SBA_POOLS @@ -136,7 +136,7 @@ void bli_sba_release pblk_t pblk; // Query the small block pool from the rntm. - pool_t* restrict pool = bli_rntm_sba_pool( rntm ); + pool_t* pool = bli_rntm_sba_pool( rntm ); if ( pool == NULL ) { @@ -182,7 +182,7 @@ array_t* bli_sba_checkout_array void bli_sba_checkin_array ( - array_t* restrict array + array_t* array ) { #ifndef BLIS_ENABLE_SBA_POOLS @@ -194,9 +194,9 @@ void bli_sba_checkin_array void bli_sba_rntm_set_pool ( - siz_t index, - array_t* restrict array, - rntm_t* restrict rntm + siz_t index, + array_t* array, + rntm_t* rntm ) { #ifndef BLIS_ENABLE_SBA_POOLS @@ -205,7 +205,7 @@ void bli_sba_rntm_set_pool #endif // Query the pool_t* in the array_t corresponding to index. - pool_t* restrict pool = bli_apool_array_elem( index, array ); + pool_t* pool = bli_apool_array_elem( index, array ); // Embed the pool_t* into the rntm_t. 
bli_rntm_set_sba_pool( pool, rntm ); diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h index f5e36d759a..4fc3aaaeea 100644 --- a/frame/base/bli_sba.h +++ b/frame/base/bli_sba.h @@ -44,30 +44,30 @@ void bli_sba_finalize( void ); array_t* bli_sba_checkout_array ( - const siz_t n_threads + siz_t n_threads ); void bli_sba_checkin_array ( - array_t* restrict array + array_t* array ); void bli_sba_rntm_set_pool ( - siz_t index, - array_t* restrict array, - rntm_t* restrict rntm + siz_t index, + array_t* array, + rntm_t* rntm ); void* bli_sba_acquire ( - rntm_t* restrict rntm, - siz_t req_size + rntm_t* rntm, + siz_t req_size ); void bli_sba_release ( - rntm_t* restrict rntm, - void* restrict block + rntm_t* rntm, + void* block ); diff --git a/frame/base/bli_setgetijm.c b/frame/base/bli_setgetijm.c index 86f1c8845c..434c7e7e58 100644 --- a/frame/base/bli_setgetijm.c +++ b/frame/base/bli_setgetijm.c @@ -46,11 +46,11 @@ static setijm_fp GENARRAY(ftypes_setijm,setijm); err_t bli_setijm ( - double ar, - double ai, - dim_t i, - dim_t j, - obj_t* b + double ar, + double ai, + dim_t i, + dim_t j, + const obj_t* b ) { dim_t m = bli_obj_length( b ); @@ -110,21 +110,21 @@ INSERT_GENTFUNC_BASIC0( setijm ) typedef void (*getijm_fp) ( - dim_t i, - dim_t j, - const void* b, inc_t rs, inc_t cs, - double* ar, - double* ai + dim_t i, + dim_t j, + const void* b, inc_t rs, inc_t cs, + double* ar, + double* ai ); static getijm_fp GENARRAY(ftypes_getijm,getijm); err_t bli_getijm ( - dim_t i, - dim_t j, - const obj_t* b, - double* ar, - double* ai + dim_t i, + dim_t j, + const obj_t* b, + double* ar, + double* ai ) { dim_t m = bli_obj_length( b ); @@ -164,11 +164,11 @@ err_t bli_getijm \ void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - dim_t j, \ - const void* b, inc_t rs, inc_t cs, \ - double* ar, \ - double* ai \ + dim_t i, \ + dim_t j, \ + const void* b, inc_t rs, inc_t cs, \ + double* ar, \ + double* ai \ ) \ { \ const ctype* b_cast = ( const ctype* )b; \ diff --git 
a/frame/base/bli_setgetijm.h b/frame/base/bli_setgetijm.h index 76b03a64b3..a2db16d11a 100644 --- a/frame/base/bli_setgetijm.h +++ b/frame/base/bli_setgetijm.h @@ -34,11 +34,11 @@ BLIS_EXPORT_BLIS err_t bli_setijm ( - double ar, - double ai, - dim_t i, - dim_t j, - obj_t* b + double ar, + double ai, + dim_t i, + dim_t j, + const obj_t* b ); #undef GENTPROT @@ -59,11 +59,11 @@ INSERT_GENTPROT_BASIC0( setijm ) BLIS_EXPORT_BLIS err_t bli_getijm ( - dim_t i, - dim_t j, - const obj_t* b, - double* ar, - double* ai + dim_t i, + dim_t j, + const obj_t* b, + double* ar, + double* ai ); #undef GENTPROT @@ -71,11 +71,11 @@ BLIS_EXPORT_BLIS err_t bli_getijm \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - dim_t j, \ - const void* b, inc_t rs, inc_t cs, \ - double* ar, \ - double* ai \ + dim_t i, \ + dim_t j, \ + const void* b, inc_t rs, inc_t cs, \ + double* ar, \ + double* ai \ ); INSERT_GENTPROT_BASIC0( getijm ) diff --git a/frame/base/bli_setgetijv.c b/frame/base/bli_setgetijv.c index 3728daed75..6cee789c70 100644 --- a/frame/base/bli_setgetijv.c +++ b/frame/base/bli_setgetijv.c @@ -45,10 +45,10 @@ static setijv_fp GENARRAY(ftypes_setijv,setijv); err_t bli_setijv ( - double ar, - double ai, - dim_t i, - obj_t* x + double ar, + double ai, + dim_t i, + const obj_t* x ) { dim_t n = bli_obj_vector_dim( x ); @@ -103,19 +103,19 @@ INSERT_GENTFUNC_BASIC0( setijv ) typedef void (*getijv_fp) ( - dim_t i, - const void* x, inc_t incx, - double* ar, - double* ai + dim_t i, + const void* x, inc_t incx, + double* ar, + double* ai ); static getijv_fp GENARRAY(ftypes_getijv,getijv); err_t bli_getijv ( - dim_t i, - const obj_t* x, - double* ar, - double* ai + dim_t i, + const obj_t* x, + double* ar, + double* ai ) { dim_t n = bli_obj_vector_dim( x ); @@ -151,10 +151,10 @@ err_t bli_getijv \ void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - const void* x, inc_t incx, \ - double* ar, \ - double* ai \ + dim_t i, \ + const void* x, inc_t incx, \ + double* ar, \ + double* ai \ ) \ { 
\ const ctype* restrict x_cast = ( const ctype* )x; \ diff --git a/frame/base/bli_setgetijv.h b/frame/base/bli_setgetijv.h index 3b61179759..a9badce4d4 100644 --- a/frame/base/bli_setgetijv.h +++ b/frame/base/bli_setgetijv.h @@ -34,10 +34,10 @@ BLIS_EXPORT_BLIS err_t bli_setijv ( - double ar, - double ai, - dim_t i, - obj_t* x + double ar, + double ai, + dim_t i, + const obj_t* x ); #undef GENTPROT @@ -57,10 +57,10 @@ INSERT_GENTPROT_BASIC0( setijv ) BLIS_EXPORT_BLIS err_t bli_getijv ( - dim_t i, - const obj_t* x, - double* ar, - double* ai + dim_t i, + const obj_t* x, + double* ar, + double* ai ); #undef GENTPROT @@ -68,10 +68,10 @@ BLIS_EXPORT_BLIS err_t bli_getijv \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - dim_t i, \ - const void* b, inc_t incx, \ - double* ar, \ - double* ai \ + dim_t i, \ + const void* b, inc_t incx, \ + double* ar, \ + double* ai \ ); INSERT_GENTPROT_BASIC0( getijv ) diff --git a/frame/base/bli_setri.c b/frame/base/bli_setri.c index e7e69c3391..15e698b2bd 100644 --- a/frame/base/bli_setri.c +++ b/frame/base/bli_setri.c @@ -39,7 +39,7 @@ void bli_setrm ( const obj_t* alpha, - obj_t* b + const obj_t* b ) { obj_t alpha_real; @@ -68,7 +68,7 @@ void bli_setrm void bli_setrv ( const obj_t* alpha, - obj_t* x + const obj_t* x ) { obj_t alpha_real; @@ -99,7 +99,7 @@ void bli_setrv void bli_setim ( const obj_t* alpha, - obj_t* b + const obj_t* b ) { obj_t alpha_real; @@ -131,7 +131,7 @@ void bli_setim void bli_setiv ( const obj_t* alpha, - obj_t* x + const obj_t* x ) { obj_t alpha_real; diff --git a/frame/base/bli_setri.h b/frame/base/bli_setri.h index 0beac1ec5b..ff5a096815 100644 --- a/frame/base/bli_setri.h +++ b/frame/base/bli_setri.h @@ -37,13 +37,13 @@ BLIS_EXPORT_BLIS void bli_setrm ( const obj_t* alpha, - obj_t* b + const obj_t* b ); BLIS_EXPORT_BLIS void bli_setrv ( const obj_t* alpha, - obj_t* x + const obj_t* x ); // -- seti --------------------------------------------------------------------- @@ -51,12 +51,12 @@ BLIS_EXPORT_BLIS 
void bli_setrv BLIS_EXPORT_BLIS void bli_setim ( const obj_t* alpha, - obj_t* b + const obj_t* b ); BLIS_EXPORT_BLIS void bli_setiv ( const obj_t* alpha, - obj_t* x + const obj_t* x ); diff --git a/frame/base/cast/bli_castm.c b/frame/base/cast/bli_castm.c index e3ee3e097d..57dd48bbc9 100644 --- a/frame/base/cast/bli_castm.c +++ b/frame/base/cast/bli_castm.c @@ -41,11 +41,11 @@ typedef void (*FUNCPTR_T) ( - trans_t transa, - dim_t m, - dim_t n, - const void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b + trans_t transa, + dim_t m, + dim_t n, + const void* a, inc_t rs_a, inc_t cs_a, + void* b, inc_t rs_b, inc_t cs_b ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castm); @@ -60,23 +60,21 @@ void bli_castm const obj_t* b ) { - num_t dt_a = bli_obj_dt( a ); - num_t dt_b = bli_obj_dt( b ); + const num_t dt_a = bli_obj_dt( a ); + const num_t dt_b = bli_obj_dt( b ); - trans_t transa = bli_obj_conjtrans_status( a ); + const trans_t transa = bli_obj_conjtrans_status( a ); - dim_t m = bli_obj_length( b ); - dim_t n = bli_obj_width( b ); + const dim_t m = bli_obj_length( b ); + const dim_t n = bli_obj_width( b ); - const void* buf_a = bli_obj_buffer_at_off( a ); - inc_t rs_a = bli_obj_row_stride( a ); - inc_t cs_a = bli_obj_col_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t cs_b = bli_obj_col_stride( b ); - - FUNCPTR_T f; + void* buf_b = bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -93,10 +91,7 @@ void bli_castm // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_a][dt_b]; - - // Invoke the void pointer-based function. 
- f + ftypes[dt_a][dt_b] ( transa, m, @@ -117,21 +112,21 @@ void bli_castm \ void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - const void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ) \ { \ const ctype_a* restrict a_cast = a; \ - ctype_b* restrict b_cast = b; \ - conj_t conja; \ - dim_t n_iter; \ - dim_t n_elem; \ - inc_t lda, inca; \ - inc_t ldb, incb; \ - dim_t j, i; \ + ctype_b* restrict b_cast = b; \ + conj_t conja; \ + dim_t n_iter; \ + dim_t n_elem; \ + inc_t lda, inca; \ + inc_t ldb, incb; \ + dim_t j, i; \ \ /* Set various loop parameters. */ \ bli_set_dims_incs_2m \ @@ -151,7 +146,7 @@ void PASTEMAC2(cha,chb,opname) \ for ( j = 0; j < n_iter; ++j ) \ { \ const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -164,7 +159,7 @@ void PASTEMAC2(cha,chb,opname) \ for ( j = 0; j < n_iter; ++j ) \ { \ const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -183,7 +178,7 @@ void PASTEMAC2(cha,chb,opname) \ for ( j = 0; j < n_iter; ++j ) \ { \ const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -196,7 +191,7 @@ void PASTEMAC2(cha,chb,opname) \ for ( j = 0; j < n_iter; ++j ) \ { \ const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; 
++i ) \ { \ diff --git a/frame/base/cast/bli_castm.h b/frame/base/cast/bli_castm.h index 2cd784670f..c06d1241a9 100644 --- a/frame/base/cast/bli_castm.h +++ b/frame/base/cast/bli_castm.h @@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_castm \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - const void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castm ) diff --git a/frame/base/cast/bli_castnzm.c b/frame/base/cast/bli_castnzm.c index 238405a6ea..071233169f 100644 --- a/frame/base/cast/bli_castnzm.c +++ b/frame/base/cast/bli_castnzm.c @@ -41,11 +41,11 @@ typedef void (*FUNCPTR_T) ( - trans_t transa, - dim_t m, - dim_t n, - const void* restrict a, inc_t rs_a, inc_t cs_a, - void* restrict b, inc_t rs_b, inc_t cs_b + trans_t transa, + dim_t m, + dim_t n, + const void* a, inc_t rs_a, inc_t cs_a, + void* b, inc_t rs_b, inc_t cs_b ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castnzm); @@ -60,23 +60,21 @@ void bli_castnzm const obj_t* b ) { - num_t dt_a = bli_obj_dt( a ); - num_t dt_b = bli_obj_dt( b ); + const num_t dt_a = bli_obj_dt( a ); + const num_t dt_b = bli_obj_dt( b ); - trans_t transa = bli_obj_conjtrans_status( a ); + const trans_t transa = bli_obj_conjtrans_status( a ); - dim_t m = bli_obj_length( b ); - dim_t n = bli_obj_width( b ); + const dim_t m = bli_obj_length( b ); + const dim_t n = bli_obj_width( b ); - const void* buf_a = bli_obj_buffer_at_off( a ); - inc_t rs_a = bli_obj_row_stride( a ); - inc_t cs_a = bli_obj_col_stride( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); + const inc_t rs_a = bli_obj_row_stride( a ); + const inc_t cs_a = bli_obj_col_stride( a ); - void* buf_b = bli_obj_buffer_at_off( b ); - inc_t rs_b = bli_obj_row_stride( b ); - inc_t cs_b = bli_obj_col_stride( b ); - - FUNCPTR_T f; + void* buf_b = 
bli_obj_buffer_at_off( b ); + const inc_t rs_b = bli_obj_row_stride( b ); + const inc_t cs_b = bli_obj_col_stride( b ); // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -93,10 +91,7 @@ void bli_castnzm // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_a][dt_b]; - - // Invoke the void pointer-based function. - f + ftypes[dt_a][dt_b] ( transa, m, @@ -117,21 +112,21 @@ void bli_castnzm \ void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - const void* restrict a, inc_t rs_a, inc_t cs_a, \ - void* restrict b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ) \ { \ const ctype_a* restrict a_cast = a; \ - ctype_b* restrict b_cast = b; \ - conj_t conja; \ - dim_t n_iter; \ - dim_t n_elem; \ - inc_t lda, inca; \ - inc_t ldb, incb; \ - dim_t j, i; \ + ctype_b* restrict b_cast = b; \ + conj_t conja; \ + dim_t n_iter; \ + dim_t n_elem; \ + inc_t lda, inca; \ + inc_t ldb, incb; \ + dim_t j, i; \ \ /* Set various loop parameters. 
*/ \ bli_set_dims_incs_2m \ @@ -151,7 +146,7 @@ void PASTEMAC2(cha,chb,opname) \ for ( j = 0; j < n_iter; ++j ) \ { \ const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -164,7 +159,7 @@ void PASTEMAC2(cha,chb,opname) \ for ( j = 0; j < n_iter; ++j ) \ { \ const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -183,7 +178,7 @@ void PASTEMAC2(cha,chb,opname) \ for ( j = 0; j < n_iter; ++j ) \ { \ const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ @@ -196,7 +191,7 @@ void PASTEMAC2(cha,chb,opname) \ for ( j = 0; j < n_iter; ++j ) \ { \ const ctype_a* restrict a1 = a_cast + (j )*lda + (0 )*inca; \ - ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ + ctype_b* restrict b1 = b_cast + (j )*ldb + (0 )*incb; \ \ for ( i = 0; i < n_elem; ++i ) \ { \ diff --git a/frame/base/cast/bli_castnzm.h b/frame/base/cast/bli_castnzm.h index 9c351d3ea6..03860fe400 100644 --- a/frame/base/cast/bli_castnzm.h +++ b/frame/base/cast/bli_castnzm.h @@ -51,11 +51,11 @@ BLIS_EXPORT_BLIS void bli_castnzm \ BLIS_EXPORT_BLIS void PASTEMAC2(cha,chb,opname) \ ( \ - trans_t transa, \ - dim_t m, \ - dim_t n, \ - const void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b \ + trans_t transa, \ + dim_t m, \ + dim_t n, \ + const void* a, inc_t rs_a, inc_t cs_a, \ + void* b, inc_t rs_b, inc_t cs_b \ ); INSERT_GENTPROT2_BASIC0( castnzm ) diff --git a/frame/base/cast/bli_castv.c b/frame/base/cast/bli_castv.c index 74d1f8757b..c46a2798c9 100644 --- a/frame/base/cast/bli_castv.c +++ 
b/frame/base/cast/bli_castv.c @@ -41,10 +41,10 @@ typedef void (*FUNCPTR_T) ( - conj_t conjx, - dim_t n, - const void* restrict x, inc_t inc_x, - void* restrict y, inc_t inc_y + conj_t conjx, + dim_t n, + const void* x, inc_t inc_x, + void* y, inc_t inc_y ); static FUNCPTR_T GENARRAY2_ALL(ftypes,castv); @@ -59,20 +59,18 @@ void bli_castv const obj_t* y ) { - num_t dt_x = bli_obj_dt( x ); - num_t dt_y = bli_obj_dt( y ); + const num_t dt_x = bli_obj_dt( x ); + const num_t dt_y = bli_obj_dt( y ); - conj_t conjx = bli_obj_conj_status( x ); + const conj_t conjx = bli_obj_conj_status( x ); - dim_t n = bli_obj_vector_dim( x ); + const dim_t n = bli_obj_vector_dim( x ); - const void* buf_x = bli_obj_buffer_at_off( x ); - inc_t inc_x = bli_obj_vector_inc( x ); + const void* buf_x = bli_obj_buffer_at_off( x ); + const inc_t inc_x = bli_obj_vector_inc( x ); - void* buf_y = bli_obj_buffer_at_off( y ); - inc_t inc_y = bli_obj_vector_inc( y ); - - FUNCPTR_T f; + void* buf_y = bli_obj_buffer_at_off( y ); + const inc_t inc_y = bli_obj_vector_inc( y ); // Check parameters. if ( bli_error_checking_is_enabled() ) @@ -89,10 +87,7 @@ void bli_castv // Index into the type combination array to extract the correct // function pointer. - f = ftypes[dt_x][dt_y]; - - // Invoke the void pointer-based function. 
- f + ftypes[dt_x][dt_y] ( conjx, n, @@ -112,15 +107,15 @@ void bli_castv \ void PASTEMAC2(chx,chy,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - const void* restrict x, inc_t incx, \ - void* restrict y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const void* x, inc_t incx, \ + void* y, inc_t incy \ ) \ { \ const ctype_x* restrict x1 = x; \ - ctype_y* restrict y1 = y; \ - dim_t i; \ + ctype_y* restrict y1 = y; \ + dim_t i; \ \ if ( bli_is_conj( conjx ) ) \ { \ diff --git a/frame/base/cast/bli_castv.h b/frame/base/cast/bli_castv.h index 542795ca5a..85d87d9117 100644 --- a/frame/base/cast/bli_castv.h +++ b/frame/base/cast/bli_castv.h @@ -51,10 +51,10 @@ BLIS_EXPORT_BLIS void bli_castv \ BLIS_EXPORT_BLIS void PASTEMAC2(chx,chy,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - const void* x, inc_t incx, \ - void* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + const void* x, inc_t incx, \ + void* y, inc_t incy \ ); INSERT_GENTPROT2_BASIC0( castv ) diff --git a/frame/base/check/bli_obj_check.c b/frame/base/check/bli_obj_check.c index 1739afc428..4465a5b8df 100644 --- a/frame/base/check/bli_obj_check.c +++ b/frame/base/check/bli_obj_check.c @@ -34,11 +34,11 @@ #include "blis.h" -void bli_obj_create_check( num_t dt, - dim_t m, - dim_t n, - inc_t rs, - inc_t cs, +void bli_obj_create_check( num_t dt, + dim_t m, + dim_t n, + inc_t rs, + inc_t cs, const obj_t* obj ) { err_t e_val; @@ -53,9 +53,9 @@ void bli_obj_create_check( num_t dt, bli_check_error_code( e_val ); } -void bli_obj_create_without_buffer_check( num_t dt, - dim_t m, - dim_t n, +void bli_obj_create_without_buffer_check( num_t dt, + dim_t m, + dim_t n, const obj_t* obj ) { err_t e_val; @@ -67,9 +67,9 @@ void bli_obj_create_without_buffer_check( num_t dt, bli_check_error_code( e_val ); } -void bli_obj_alloc_buffer_check( inc_t rs, - inc_t cs, - inc_t is, +void bli_obj_alloc_buffer_check( inc_t rs, + inc_t cs, + inc_t is, const obj_t* obj ) { err_t e_val; @@ -84,9 +84,9 @@ void bli_obj_alloc_buffer_check( inc_t rs, 
} void bli_obj_attach_buffer_check( const void* p, - inc_t rs, - inc_t cs, - inc_t is, + inc_t rs, + inc_t cs, + inc_t is, const obj_t* obj ) { err_t e_val; @@ -109,7 +109,7 @@ void bli_obj_attach_buffer_check( const void* p, bli_check_error_code( e_val ); } -void bli_obj_create_scalar_check( num_t dt, +void bli_obj_create_scalar_check( num_t dt, const obj_t* obj ) { err_t e_val; diff --git a/frame/base/check/bli_obj_check.h b/frame/base/check/bli_obj_check.h index 232fb02097..8548c9ee41 100644 --- a/frame/base/check/bli_obj_check.h +++ b/frame/base/check/bli_obj_check.h @@ -32,30 +32,30 @@ */ -void bli_obj_create_check( num_t dt, - dim_t m, - dim_t n, - inc_t rs, - inc_t cs, +void bli_obj_create_check( num_t dt, + dim_t m, + dim_t n, + inc_t rs, + inc_t cs, const obj_t* obj ); -void bli_obj_create_without_buffer_check( num_t dt, - dim_t m, - dim_t n, +void bli_obj_create_without_buffer_check( num_t dt, + dim_t m, + dim_t n, const obj_t* obj ); -void bli_obj_alloc_buffer_check( inc_t rs, - inc_t cs, - inc_t is, +void bli_obj_alloc_buffer_check( inc_t rs, + inc_t cs, + inc_t is, const obj_t* obj ); void bli_obj_attach_buffer_check( const void* p, - inc_t rs, - inc_t cs, - inc_t is, + inc_t rs, + inc_t cs, + inc_t is, const obj_t* obj ); -void bli_obj_create_scalar_check( num_t dt, +void bli_obj_create_scalar_check( num_t dt, const obj_t* obj ); void bli_obj_free_check( const obj_t* obj ); diff --git a/frame/base/check/bli_part_check.c b/frame/base/check/bli_part_check.c index c8abb4b6e7..d13a8c22ff 100644 --- a/frame/base/check/bli_part_check.c +++ b/frame/base/check/bli_part_check.c @@ -34,11 +34,11 @@ #include "blis.h" -void bli_acquire_mpart_t2b_check( subpart_t requested_part, - dim_t i, - dim_t b, - const obj_t* obj, - const obj_t* sub_obj ) +void bli_acquire_mpart_t2b_check( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; @@ -52,11 +52,11 @@ void bli_acquire_mpart_t2b_check( subpart_t requested_part, 
bli_check_error_code( e_val ); } -void bli_acquire_mpart_l2r_check( subpart_t requested_part, - dim_t j, - dim_t b, - const obj_t* obj, - const obj_t* sub_obj ) +void bli_acquire_mpart_l2r_check( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; @@ -70,11 +70,11 @@ void bli_acquire_mpart_l2r_check( subpart_t requested_part, bli_check_error_code( e_val ); } -void bli_acquire_mpart_tl2br_check( subpart_t requested_part, - dim_t ij, - dim_t b, - const obj_t* obj, - const obj_t* sub_obj ) +void bli_acquire_mpart_tl2br_check( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ) { err_t e_val; diff --git a/frame/base/check/bli_part_check.h b/frame/base/check/bli_part_check.h index 4576e09b5d..810c5a3a7d 100644 --- a/frame/base/check/bli_part_check.h +++ b/frame/base/check/bli_part_check.h @@ -32,21 +32,21 @@ */ -void bli_acquire_mpart_t2b_check( subpart_t requested_part, - dim_t i, - dim_t b, - const obj_t* obj, - const obj_t* sub_obj ); - -void bli_acquire_mpart_l2r_check( subpart_t requested_part, - dim_t j, - dim_t b, - const obj_t* obj, - const obj_t* sub_obj ); - -void bli_acquire_mpart_tl2br_check( subpart_t requested_part, - dim_t ij, - dim_t b, - const obj_t* obj, - const obj_t* sub_obj ); +void bli_acquire_mpart_t2b_check( subpart_t requested_part, + dim_t i, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); + +void bli_acquire_mpart_l2r_check( subpart_t requested_part, + dim_t j, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); + +void bli_acquire_mpart_tl2br_check( subpart_t requested_part, + dim_t ij, + dim_t b, + const obj_t* obj, + const obj_t* sub_obj ); diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 4d8fbee1dc..42ad9c72ba 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -43,8 +43,8 @@ BLIS_EXPORT_BLIS extern const obj_t BLIS_ZERO; BLIS_EXPORT_BLIS extern const obj_t 
BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO; -BLIS_EXPORT_BLIS extern const thrcomm_t BLIS_SINGLE_COMM; -BLIS_EXPORT_BLIS extern const thrinfo_t BLIS_PACKM_SINGLE_THREADED; -BLIS_EXPORT_BLIS extern const thrinfo_t BLIS_GEMM_SINGLE_THREADED; +BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; +BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; +BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif diff --git a/frame/include/bli_oapi_ba.h b/frame/include/bli_oapi_ba.h index dc17507d11..d802635973 100644 --- a/frame/include/bli_oapi_ba.h +++ b/frame/include/bli_oapi_ba.h @@ -54,6 +54,6 @@ // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_OAPI_EX_DECLS -#define BLIS_OAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; +#define BLIS_OAPI_EX_DECLS const cntx_t* cntx = NULL; ( void )cntx; \ + rntm_t* rntm = NULL; ( void )rntm; diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h index 0eb5eb2a1e..7252fd7fff 100644 --- a/frame/include/bli_oapi_ex.h +++ b/frame/include/bli_oapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm +#define BLIS_OAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index c076d41fb2..0db7fb5c46 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -1261,7 +1261,7 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t bli_obj_set_buffer( p, obj ); bli_obj_set_scalar_dt( dt, obj ); - void* restrict s = bli_obj_internal_scalar_buffer( obj ); + void* s = bli_obj_internal_scalar_buffer( obj ); if ( bli_dt_prec_is_single( dt ) ) { (( scomplex* )s)->real = 1.0F; (( scomplex* )s)->imag = 0.0F; } diff --git a/frame/include/bli_tapi_ba.h b/frame/include/bli_tapi_ba.h index 0177985d9d..6a7e195abe 100644 --- a/frame/include/bli_tapi_ba.h +++ b/frame/include/bli_tapi_ba.h @@ -54,6 +54,6 @@ // to NULL. The "( void )" statements are to prevent unused variable // warnings by the compiler. #undef BLIS_TAPI_EX_DECLS -#define BLIS_TAPI_EX_DECLS cntx_t* cntx = NULL; ( void )cntx; \ - rntm_t* rntm = NULL; ( void )rntm; +#define BLIS_TAPI_EX_DECLS const cntx_t* cntx = NULL; ( void )cntx; \ + rntm_t* rntm = NULL; ( void )rntm; diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h index c999b0ae9e..f12be24b89 100644 --- a/frame/include/bli_tapi_ex.h +++ b/frame/include/bli_tapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS ,cntx_t* cntx, rntm_t* rntm +#define BLIS_TAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index eb99875c5b..e957fc6b23 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -1111,26 +1111,26 @@ typedef struct // Pointers to the micro-panels of A and B which will be used by the // next call to the micro-kernel. - void* a_next; - void* b_next; + const void* a_next; + const void* b_next; // The imaginary strides of A and B. - inc_t is_a; - inc_t is_b; + inc_t is_a; + inc_t is_b; // The panel strides of A and B. // NOTE: These are only used in situations where iteration over the // micropanels takes place in part within the kernel code (e.g. sup // millikernels). - inc_t ps_a; - inc_t ps_b; + inc_t ps_a; + inc_t ps_b; // The type to convert to on output. //num_t dt_on_output; // (Virtual) microkernel address and additional parameters. void_fp ukr; - void* params; + const void* params; } auxinfo_t; @@ -1163,10 +1163,10 @@ struct thrinfo_s; typedef void (*obj_pack_fn_t) ( const struct obj_s* a, - struct obj_s* ap, + struct obj_s* ap, const struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, + struct rntm_s* rntm, + struct cntl_s* cntl, const struct thrinfo_s* thread ); @@ -1176,8 +1176,8 @@ typedef void (*obj_ker_fn_t) const struct obj_s* b, const struct obj_s* c, const struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, + struct rntm_s* rntm, + struct cntl_s* cntl, const struct thrinfo_s* thread ); diff --git a/frame/thread/bli_l3_decor.h b/frame/thread/bli_l3_decor.h index 0b09189a69..e2208aae63 100644 --- a/frame/thread/bli_l3_decor.h +++ b/frame/thread/bli_l3_decor.h @@ -41,30 +41,30 @@ // Level-3 internal function type. 
typedef void (*l3int_t) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl, + thrinfo_t* thread ); // Level-3 thread decorator prototype. void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index 5b40d06143..2c71c75321 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -46,29 +46,18 @@ void* bli_l3_thread_entry( void* data_void ) { return NULL; } void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. 
- pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); - // Query the total number of threads from the rntm_t object. const dim_t n_threads = bli_rntm_num_threads( rntm ); @@ -83,7 +72,7 @@ void bli_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -96,7 +85,7 @@ void bli_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) @@ -104,8 +93,8 @@ void bli_l3_thread_decorator // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; + rntm_t* rntm_p = &rntm_l; // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); @@ -119,7 +108,6 @@ void bli_l3_thread_decorator // be allocated/initialized. bli_sba_rntm_set_pool( tid, array, rntm_p ); - obj_t a_t, b_t, c_t; cntl_t* cntl_use; thrinfo_t* thread; @@ -133,6 +121,17 @@ void bli_l3_thread_decorator bli_obj_alias_to( b, &b_t ); bli_obj_alias_to( c, &c_t ); + // This is part of a hack to support mixed domain in bli_gemm_front(). 
+ // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. + pack_t schema_a = bli_obj_pack_schema( &a_t ); + pack_t schema_b = bli_obj_pack_schema( &b_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); + // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, schema_a, schema_b, &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c index 89b6ea1187..80247dfb1c 100644 --- a/frame/thread/bli_l3_decor_pthreads.c +++ b/frame/thread/bli_l3_decor_pthreads.c @@ -40,49 +40,45 @@ // A data structure to assist in passing operands to additional threads. typedef struct thread_data { - l3int_t func; - opid_t family; - pack_t schema_a; - pack_t schema_b; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - cntx_t* cntx; - rntm_t* rntm; - cntl_t* cntl; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; + l3int_t func; + opid_t family; + const obj_t* alpha; + const obj_t* a; + const obj_t* b; + const obj_t* beta; + const obj_t* c; + const cntx_t* cntx; + rntm_t* rntm; + cntl_t* cntl; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; } thread_data_t; // Entry point for additional threads void* bli_l3_thread_entry( void* data_void ) { - thread_data_t* data = data_void; - - l3int_t func = data->func; - opid_t family = data->family; - pack_t schema_a = data->schema_a; - pack_t schema_b = data->schema_b; - obj_t* alpha = data->alpha; - obj_t* a = data->a; - obj_t* b = data->b; - obj_t* beta = data->beta; - obj_t* c = data->c; - cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - cntl_t* cntl = data->cntl; - dim_t tid = data->tid; - 
array_t* array = data->array; - thrcomm_t* gl_comm = data->gl_comm; + const thread_data_t* data = data_void; + + const l3int_t func = data->func; + const opid_t family = data->family; + const obj_t* alpha = data->alpha; + const obj_t* a = data->a; + const obj_t* b = data->b; + const obj_t* beta = data->beta; + const obj_t* c = data->c; + const cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + cntl_t* cntl = data->cntl; + const dim_t tid = data->tid; + array_t* array = data->array; + thrcomm_t* gl_comm = data->gl_comm; // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; + rntm_t* rntm_p = &rntm_l; // Use the thread id to access the appropriate pool_t* within the // array_t, and use it to set the sba_pool field within the rntm_t. @@ -90,9 +86,9 @@ void* bli_l3_thread_entry( void* data_void ) // be allocated/initialized. bli_sba_rntm_set_pool( tid, array, rntm_p ); - obj_t a_t, b_t, c_t; - cntl_t* cntl_use; - thrinfo_t* thread; + obj_t a_t, b_t, c_t; + cntl_t* cntl_use; + thrinfo_t* thread; // Alias thread-local copies of A, B, and C. These will be the objects // we pass down the algorithmic function stack. Making thread-local @@ -103,6 +99,17 @@ void* bli_l3_thread_entry( void* data_void ) bli_obj_alias_to( b, &b_t ); bli_obj_alias_to( c, &c_t ); + // This is part of a hack to support mixed domain in bli_gemm_front(). + // Sometimes we need to specify a non-standard schema for A and B, and + // we decided to transmit them via the schema field in the obj_t's + // rather than pass them in as function parameters. Once the values + // have been read, we immediately reset them back to their expected + // values for unpacked objects. 
+ pack_t schema_a = bli_obj_pack_schema( &a_t ); + pack_t schema_b = bli_obj_pack_schema( &b_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); + // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, schema_a, schema_b, &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); @@ -134,31 +141,20 @@ void* bli_l3_thread_entry( void* data_void ) void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { err_t r_val; - // This is part of a hack to support mixed domain in bli_gemm_front(). - // Sometimes we need to specify a non-standard schema for A and B, and - // we decided to transmit them via the schema field in the obj_t's - // rather than pass them in as function parameters. Once the values - // have been read, we immediately reset them back to their expected - // values for unpacked objects. - pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); - // Query the total number of threads from the context. const dim_t n_threads = bli_rntm_num_threads( rntm ); @@ -168,7 +164,7 @@ void bli_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. 
We do // this up-front only so that we have the rntm_t.sba_pool field @@ -181,7 +177,7 @@ void bli_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. @@ -203,8 +199,6 @@ void bli_l3_thread_decorator // Set up thread data for additional threads (beyond thread 0). datas[tid].func = func; datas[tid].family = family; - datas[tid].schema_a = schema_a; - datas[tid].schema_b = schema_b; datas[tid].alpha = alpha; datas[tid].a = a; datas[tid].b = b; diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c index 51474f0eee..c2c43b3703 100644 --- a/frame/thread/bli_l3_decor_single.c +++ b/frame/thread/bli_l3_decor_single.c @@ -39,28 +39,32 @@ void bli_l3_thread_decorator ( - l3int_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + l3int_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + cntl_t* cntl ) { + obj_t a_t, b_t; + bli_obj_alias_to( a, &a_t ); + bli_obj_alias_to( b, &b_t ); + // This is part of a hack to support mixed domain in bli_gemm_front(). // Sometimes we need to specify a non-standard schema for A and B, and // we decided to transmit them via the schema field in the obj_t's // rather than pass them in as function parameters. Once the values // have been read, we immediately reset them back to their expected // values for unpacked objects. 
- pack_t schema_a = bli_obj_pack_schema( a ); - pack_t schema_b = bli_obj_pack_schema( b ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, a ); - bli_obj_set_pack_schema( BLIS_NOT_PACKED, b ); + pack_t schema_a = bli_obj_pack_schema( &a_t ); + pack_t schema_b = bli_obj_pack_schema( &b_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); + bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); // For sequential execution, we use only one thread. const dim_t n_threads = 1; @@ -71,7 +75,7 @@ void bli_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we can create the global comm below. @@ -81,13 +85,13 @@ void bli_l3_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); { // NOTE: We don't need to create another copy of the rntm_t since // it was already copied in one of the high-level oapi functions. - rntm_t* restrict rntm_p = rntm; + rntm_t* rntm_p = rntm; cntl_t* cntl_use; thrinfo_t* thread; @@ -111,7 +115,7 @@ void bli_l3_thread_decorator // Create a default control tree for the operation, if needed. bli_l3_cntl_create_if( family, schema_a, schema_b, - a, b, c, rntm_p, cntl, &cntl_use ); + &a_t, &b_t, c, rntm_p, cntl, &cntl_use ); // Create the root node of the thread's thrinfo_t structure. 
bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); @@ -119,8 +123,8 @@ void bli_l3_thread_decorator func ( alpha, - a, - b, + &a_t, + &b_t, beta, c, cntx, diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h index a001e5b743..6e04011513 100644 --- a/frame/thread/bli_l3_sup_decor.h +++ b/frame/thread/bli_l3_sup_decor.h @@ -41,28 +41,28 @@ // Level-3 sup internal function type. typedef err_t (*l3supint_t) ( - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm, + thrinfo_t* thread ); // Level-3 sup thread decorator prototype. err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3supint_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c index 1db9514fd4..ff6bc667d3 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ b/frame/thread/bli_l3_sup_decor_openmp.c @@ -46,15 +46,15 @@ void* bli_l3_sup_thread_entry( void* data_void ) { return NULL; } err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3supint_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // Query the total number of threads from the rntm_t object. 
@@ -66,7 +66,7 @@ err_t bli_l3_sup_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -79,7 +79,7 @@ err_t bli_l3_sup_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) @@ -87,8 +87,8 @@ err_t bli_l3_sup_thread_decorator // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; + rntm_t* rntm_p = &rntm_l; // Query the thread's id from OpenMP. const dim_t tid = omp_get_thread_num(); diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c index dade71a035..375a85730e 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ b/frame/thread/bli_l3_sup_decor_pthreads.c @@ -40,18 +40,18 @@ // A data structure to assist in passing operands to additional threads. 
typedef struct thread_data { - l3supint_t func; - opid_t family; - obj_t* alpha; - obj_t* a; - obj_t* b; - obj_t* beta; - obj_t* c; - cntx_t* cntx; - rntm_t* rntm; - dim_t tid; - thrcomm_t* gl_comm; - array_t* array; + l3supint_t func; + opid_t family; + const obj_t* alpha; + const obj_t* a; + const obj_t* b; + const obj_t* beta; + const obj_t* c; + const cntx_t* cntx; + rntm_t* rntm; + dim_t tid; + thrcomm_t* gl_comm; + array_t* array; } thread_data_t; // Entry point for additional threads @@ -59,26 +59,26 @@ void* bli_l3_sup_thread_entry( void* data_void ) { thread_data_t* data = data_void; - l3supint_t func = data->func; - opid_t family = data->family; - obj_t* alpha = data->alpha; - obj_t* a = data->a; - obj_t* b = data->b; - obj_t* beta = data->beta; - obj_t* c = data->c; - cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - dim_t tid = data->tid; - array_t* array = data->array; - thrcomm_t* gl_comm = data->gl_comm; + l3supint_t func = data->func; + opid_t family = data->family; + const obj_t* alpha = data->alpha; + const obj_t* a = data->a; + const obj_t* b = data->b; + const obj_t* beta = data->beta; + const obj_t* c = data->c; + const cntx_t* cntx = data->cntx; + rntm_t* rntm = data->rntm; + dim_t tid = data->tid; + array_t* array = data->array; + thrcomm_t* gl_comm = data->gl_comm; ( void )family; // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; + rntm_t* rntm_p = &rntm_l; // Use the thread id to access the appropriate pool_t* within the // array_t, and use it to set the sba_pool field within the rntm_t. 
@@ -111,15 +111,15 @@ void* bli_l3_sup_thread_entry( void* data_void ) err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3supint_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { err_t r_val; @@ -133,7 +133,7 @@ err_t bli_l3_sup_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. We do // this up-front only so that we have the rntm_t.sba_pool field @@ -146,7 +146,7 @@ err_t bli_l3_sup_thread_decorator bli_pba_rntm_set_pba( rntm ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c index a87af41032..42dbd14563 100644 --- a/frame/thread/bli_l3_sup_decor_single.c +++ b/frame/thread/bli_l3_sup_decor_single.c @@ -41,17 +41,15 @@ err_t bli_l3_sup_thread_decorator ( - l3supint_t func, - opid_t family, - //pack_t schema_a, - //pack_t schema_b, - obj_t* alpha, - obj_t* a, - obj_t* b, - obj_t* beta, - obj_t* c, - cntx_t* cntx, - rntm_t* rntm + l3supint_t func, + opid_t family, + const obj_t* alpha, + const obj_t* a, + const obj_t* b, + const obj_t* beta, + const obj_t* c, + const cntx_t* cntx, + rntm_t* rntm ) { // For sequential execution, we use only one thread. 
@@ -63,7 +61,7 @@ err_t bli_l3_sup_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); + array_t* array = bli_sba_checkout_array( n_threads ); // Access the pool_t* for thread 0 and embed it into the rntm. bli_sba_rntm_set_pool( 0, array, rntm ); @@ -73,14 +71,14 @@ err_t bli_l3_sup_thread_decorator #ifndef SKIP_THRINFO_TREE // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* restrict gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); #endif { // NOTE: We don't need to create another copy of the rntm_t since // it was already copied in one of the high-level oapi functions. - rntm_t* restrict rntm_p = rntm; + rntm_t* rntm_p = rntm; // There is only one thread id (for the thief thread). const dim_t tid = 0; diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index 8368ea2beb..b7fccace76 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -35,9 +35,9 @@ #include "blis.h" -const thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; -const thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; -const thrcomm_t BLIS_SINGLE_COMM = {}; +thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; +thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; +thrcomm_t BLIS_SINGLE_COMM = {}; // The global rntm_t structure. (The definition resides in bli_rntm.c.) 
extern rntm_t global_rntm; @@ -50,9 +50,9 @@ extern bli_pthread_mutex_t global_rntm_mutex; void bli_thread_init( void ) { - bli_thrcomm_init( 1, ( thrcomm_t* )&BLIS_SINGLE_COMM ); - bli_packm_thrinfo_init_single( ( thrinfo_t* )&BLIS_PACKM_SINGLE_THREADED ); - bli_l3_thrinfo_init_single( (thrinfo_t* )&BLIS_GEMM_SINGLE_THREADED ); + bli_thrcomm_init( 1, &BLIS_SINGLE_COMM ); + bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); + bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); // Read the environment variables and use them to initialize the // global runtime object. @@ -68,11 +68,11 @@ void bli_thread_finalize( void ) void bli_thread_range_sub ( const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end ) { dim_t n_way = bli_thread_n_way( thread ); @@ -214,8 +214,8 @@ siz_t bli_thread_range_l2r const thrinfo_t* thr, const obj_t* a, const blksz_t* bmult, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { num_t dt = bli_obj_dt( a ); @@ -234,8 +234,8 @@ siz_t bli_thread_range_r2l const thrinfo_t* thr, const obj_t* a, const blksz_t* bmult, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { num_t dt = bli_obj_dt( a ); @@ -254,8 +254,8 @@ siz_t bli_thread_range_t2b const thrinfo_t* thr, const obj_t* a, const blksz_t* bmult, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { num_t dt = bli_obj_dt( a ); @@ -274,8 +274,8 @@ siz_t bli_thread_range_b2t const thrinfo_t* thr, const obj_t* a, const blksz_t* bmult, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { num_t dt = bli_obj_dt( a ); @@ -505,14 +505,14 @@ siz_t bli_find_area_trap_l siz_t bli_thread_range_weighted_sub ( const thrinfo_t* thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* j_start_thr, - dim_t* j_end_thr + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool 
handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr ) { dim_t n_way = bli_thread_n_way( thread ); @@ -641,15 +641,15 @@ siz_t bli_thread_range_weighted_sub siz_t bli_thread_range_mdim ( - dir_t direct, + dir_t direct, const thrinfo_t* thr, const obj_t* a, const obj_t* b, const obj_t* c, const cntl_t* cntl, const cntx_t* cntx, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { bszid_t bszid = bli_cntl_bszid( cntl ); @@ -700,15 +700,15 @@ siz_t bli_thread_range_mdim siz_t bli_thread_range_ndim ( - dir_t direct, + dir_t direct, const thrinfo_t* thr, const obj_t* a, const obj_t* b, const obj_t* c, const cntl_t* cntl, const cntx_t* cntx, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { bszid_t bszid = bli_cntl_bszid( cntl ); @@ -762,8 +762,8 @@ siz_t bli_thread_range_weighted_l2r const thrinfo_t* thr, const obj_t* a, const blksz_t* bmult, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { siz_t area; @@ -812,8 +812,8 @@ siz_t bli_thread_range_weighted_r2l const thrinfo_t* thr, const obj_t* a, const blksz_t* bmult, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { siz_t area; @@ -864,8 +864,8 @@ siz_t bli_thread_range_weighted_t2b const thrinfo_t* thr, const obj_t* a, const blksz_t* bmult, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { siz_t area; @@ -916,8 +916,8 @@ siz_t bli_thread_range_weighted_b2t const thrinfo_t* thr, const obj_t* a, const blksz_t* bmult, - dim_t* start, - dim_t* end + dim_t* start, + dim_t* end ) { siz_t area; diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 60088ff832..5e9c650b5b 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -67,11 +67,11 @@ BLIS_EXPORT_BLIS void bli_thread_range_sub ( const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end ); #undef GENPROT @@ -79,15 +79,15 @@ void bli_thread_range_sub \ 
siz_t PASTEMAC0( opname ) \ ( \ - dir_t direct, \ + dir_t direct, \ const thrinfo_t* thr, \ const obj_t* a, \ const obj_t* b, \ const obj_t* c, \ const cntl_t* cntl, \ const cntx_t* cntx, \ - dim_t* start, \ - dim_t* end \ + dim_t* start, \ + dim_t* end \ ); GENPROT( thread_range_mdim ) @@ -101,8 +101,8 @@ siz_t PASTEMAC0( opname ) \ const thrinfo_t* thr, \ const obj_t* a, \ const blksz_t* bmult, \ - dim_t* start, \ - dim_t* end \ + dim_t* start, \ + dim_t* end \ ); GENPROT( thread_range_l2r ) @@ -136,15 +136,15 @@ siz_t bli_find_area_trap_l ); siz_t bli_thread_range_weighted_sub ( - const thrinfo_t* restrict thread, - doff_t diagoff, - uplo_t uplo, - dim_t m, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* restrict j_start_thr, - dim_t* restrict j_end_thr + const thrinfo_t* thread, + doff_t diagoff, + uplo_t uplo, + dim_t m, + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* j_start_thr, + dim_t* j_end_thr ); // ----------------------------------------------------------------------------- @@ -212,12 +212,12 @@ void bli_thread_init_rntm_from_env( rntm_t* rntm ); BLIS_INLINE void bli_thread_range_jrir_rr ( const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc ) { // Use interleaved partitioning of jr/ir loops. @@ -229,12 +229,12 @@ BLIS_INLINE void bli_thread_range_jrir_rr BLIS_INLINE void bli_thread_range_jrir_sl ( const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc ) { // Use contiguous slab partitioning of jr/ir loops. 
@@ -245,12 +245,12 @@ BLIS_INLINE void bli_thread_range_jrir_sl BLIS_INLINE void bli_thread_range_jrir ( const thrinfo_t* thread, - dim_t n, - dim_t bf, - bool handle_edge_low, - dim_t* start, - dim_t* end, - dim_t* inc + dim_t n, + dim_t bf, + bool handle_edge_low, + dim_t* start, + dim_t* end, + dim_t* inc ) { // Define a general-purpose version of bli_thread_range_jrir() whose diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 4dd447eec4..4078d4d388 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -97,7 +97,7 @@ void bli_thrinfo_init_single bli_thrinfo_init ( thread, - ( thrcomm_t* )&BLIS_SINGLE_COMM, 0, + &BLIS_SINGLE_COMM, 0, 1, 0, FALSE, diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c index ab28b7160c..8800bc01f0 100644 --- a/frame/thread/bli_thrinfo_sup.c +++ b/frame/thread/bli_thrinfo_sup.c @@ -33,13 +33,14 @@ */ +#include "bli_thrcomm_openmp.h" #include "blis.h" void bli_thrinfo_sup_grow ( - rntm_t* rntm, - bszid_t* bszid_par, - thrinfo_t* thread + rntm_t* rntm, + const bszid_t* bszid_par, + thrinfo_t* thread ) { if ( thread == &BLIS_GEMM_SINGLE_THREADED || @@ -75,10 +76,10 @@ void bli_thrinfo_sup_grow thrinfo_t* bli_thrinfo_sup_rgrow ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_cur, - thrinfo_t* thread_par + rntm_t* rntm, + const bszid_t* bszid_par, + const bszid_t* bszid_cur, + thrinfo_t* thread_par ) { thrinfo_t* thread_cur; @@ -139,10 +140,10 @@ thrinfo_t* bli_thrinfo_sup_rgrow thrinfo_t* bli_thrinfo_sup_create_for_cntl ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_chl, - thrinfo_t* thread_par + rntm_t* rntm, + const bszid_t* bszid_par, + const bszid_t* bszid_chl, + thrinfo_t* thread_par ) { #if 1 @@ -152,14 +153,14 @@ thrinfo_t* bli_thrinfo_sup_create_for_cntl { thrinfo_t* thread_chl = bli_thrinfo_create ( - rntm, // rntm - &BLIS_SINGLE_COMM, // ocomm - 0, // ocomm_id - 1, // n_way - 0, // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - NULL 
// sub_node + rntm, // rntm + &BLIS_SINGLE_COMM, // ocomm + 0, // ocomm_id + 1, // n_way + 0, // work_id + FALSE, // free_comm + BLIS_NO_PART, // bszid + NULL // sub_node ); return thread_chl; } diff --git a/frame/thread/bli_thrinfo_sup.h b/frame/thread/bli_thrinfo_sup.h index 0be035cf87..1afcd3337e 100644 --- a/frame/thread/bli_thrinfo_sup.h +++ b/frame/thread/bli_thrinfo_sup.h @@ -42,25 +42,25 @@ void bli_thrinfo_sup_grow ( - rntm_t* rntm, - bszid_t* bszid_par, - thrinfo_t* thread + rntm_t* rntm, + const bszid_t* bszid_par, + thrinfo_t* thread ); thrinfo_t* bli_thrinfo_sup_rgrow ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_cur, - thrinfo_t* thread_par + rntm_t* rntm, + const bszid_t* bszid_par, + const bszid_t* bszid_cur, + thrinfo_t* thread_par ); thrinfo_t* bli_thrinfo_sup_create_for_cntl ( - rntm_t* rntm, - bszid_t* bszid_par, - bszid_t* bszid_chl, - thrinfo_t* thread_par + rntm_t* rntm, + const bszid_t* bszid_par, + const bszid_t* bszid_chl, + thrinfo_t* thread_par ); #endif diff --git a/frame/util/bli_util_ft.h b/frame/util/bli_util_ft.h index 703b0bfe5b..ccdd7ae660 100644 --- a/frame/util/bli_util_ft.h +++ b/frame/util/bli_util_ft.h @@ -44,9 +44,9 @@ \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); @@ -76,9 +76,9 @@ INSERT_GENTDEF( mktrim ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -93,13 +93,13 @@ INSERT_GENTDEFR( normiv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t 
rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -114,9 +114,9 @@ INSERT_GENTDEFR( normim ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - FILE* file, \ + FILE* file, \ const char* s1, \ - dim_t n, \ + dim_t n, \ const ctype* x, inc_t incx, \ const char* format, \ const char* s2 \ @@ -131,10 +131,10 @@ INSERT_GENTDEF( fprintv ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - FILE* file, \ + FILE* file, \ const char* s1, \ - dim_t m, \ - dim_t n, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const char* format, \ const char* s2 \ @@ -182,10 +182,10 @@ INSERT_GENTDEF( randnm ) \ typedef void (*PASTECH3(ch,opname,EX_SUF,tsuf)) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); @@ -204,10 +204,10 @@ INSERT_GENTDEFR( sumsqv ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjchi, \ + conj_t conjchi, \ const ctype* chi, \ const ctype* psi, \ - bool* is_eq \ + bool* is_eq \ ); INSERT_GENTDEF( eqsc ) @@ -219,11 +219,11 @@ INSERT_GENTDEF( eqsc ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ - bool* is_eq \ + bool* is_eq \ ); INSERT_GENTDEF( eqv ) @@ -235,15 +235,15 @@ INSERT_GENTDEF( eqv ) \ typedef void (*PASTECH2(ch,opname,tsuf)) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - const ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ ); INSERT_GENTDEF( eqm ) diff --git a/frame/util/bli_util_oapi.c 
b/frame/util/bli_util_oapi.c index 6ccc7ed2e2..da71fb8953 100644 --- a/frame/util/bli_util_oapi.c +++ b/frame/util/bli_util_oapi.c @@ -368,7 +368,7 @@ void PASTEMAC0(opname) \ ( \ const obj_t* chi, \ const obj_t* psi, \ - bool* is_eq \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -429,7 +429,7 @@ void PASTEMAC0(opname) \ ( \ const obj_t* x, \ const obj_t* y, \ - bool* is_eq \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -476,7 +476,7 @@ void PASTEMAC0(opname) \ ( \ const obj_t* x, \ const obj_t* y, \ - bool* is_eq \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -531,7 +531,7 @@ GENFRONT( eqm ) \ void PASTEMAC0(opname) \ ( \ - FILE* file, \ + FILE* file, \ const char* s1, \ const obj_t* x, \ const char* format, \ @@ -579,7 +579,7 @@ GENFRONT( fprintv ) \ void PASTEMAC0(opname) \ ( \ - FILE* file, \ + FILE* file, \ const char* s1, \ const obj_t* x, \ const char* format, \ @@ -602,11 +602,11 @@ void PASTEMAC0(opname) \ /* Handle constants up front. */ \ if ( dt == BLIS_CONSTANT ) \ { \ - float* sp = bli_obj_buffer_for_const( BLIS_FLOAT, x ); \ - double* dp = bli_obj_buffer_for_const( BLIS_DOUBLE, x ); \ - scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \ - dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \ - gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, x ); \ + const float* sp = bli_obj_buffer_for_const( BLIS_FLOAT, x ); \ + const double* dp = bli_obj_buffer_for_const( BLIS_DOUBLE, x ); \ + const scomplex* cp = bli_obj_buffer_for_const( BLIS_SCOMPLEX, x ); \ + const dcomplex* zp = bli_obj_buffer_for_const( BLIS_DCOMPLEX, x ); \ + const gint_t* ip = bli_obj_buffer_for_const( BLIS_INT, x ); \ \ fprintf( file, "%s\n", s1 ); \ fprintf( file, " float: %9.2e\n", bli_sreal( *sp ) ); \ diff --git a/frame/util/bli_util_oapi.h b/frame/util/bli_util_oapi.h index 2f21c09b99..2a7be21681 100644 --- a/frame/util/bli_util_oapi.h +++ b/frame/util/bli_util_oapi.h @@ -139,33 +139,6 @@ GENPROT( sumsqv ) #ifdef BLIS_OAPI_BASIC -/* -#undef GENPROT -#define 
GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - const obj_t* chi, \ - const obj_t* psi, \ - bool* is_eq \ - ); - -GENPROT( eqsc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ - ( \ - const obj_t* x, \ - const obj_t* y, \ - bool* is_eq \ - ); - -GENPROT( eqv ) -*/ - #undef GENPROT #define GENPROT( opname ) \ @@ -174,7 +147,7 @@ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ const obj_t* x, \ const obj_t* y, \ - bool* is_eq \ + bool* is_eq \ ); GENPROT( eqsc ) @@ -187,7 +160,7 @@ GENPROT( eqm ) \ BLIS_EXPORT_BLIS void PASTEMAC0(opname) \ ( \ - FILE* file, \ + FILE* file, \ const char* s1, \ const obj_t* x, \ const char* format, \ diff --git a/frame/util/bli_util_tapi.c b/frame/util/bli_util_tapi.c index 8862f4ff6d..762129cf25 100644 --- a/frame/util/bli_util_tapi.c +++ b/frame/util/bli_util_tapi.c @@ -45,9 +45,9 @@ \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -71,9 +71,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ - x, incx, \ - asum, \ - cntx, \ + ( ctype* )x, incx, \ + asum, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -109,7 +109,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ uploa, \ m, \ a, rs_a, cs_a, \ - cntx, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -124,9 +124,9 @@ INSERT_GENTFUNC_BASIC0( mktrim ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -150,9 +150,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ - x, incx, \ - norm, \ - cntx, \ + ( ctype* )x, incx, \ + norm, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -167,13 +167,13 @@ INSERT_GENTFUNCR_BASIC0( normiv ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t 
diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -201,9 +201,9 @@ void PASTEMAC2(ch,opname,EX_SUF) \ uplox, \ m, \ n, \ - x, rs_x, cs_x, \ - norm, \ - cntx, \ + ( ctype* )x, rs_x, cs_x, \ + norm, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -247,7 +247,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ n, \ x, incx, \ - cntx, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -309,7 +309,7 @@ void PASTEMAC2(ch,opname,EX_SUF) \ m, \ n, \ x, rs_x, cs_x, \ - cntx, \ + ( cntx_t* )cntx, \ rntm \ ); \ \ @@ -340,10 +340,10 @@ INSERT_GENTFUNCR_BASIC0( randnm ) \ void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ) \ { \ @@ -362,10 +362,10 @@ void PASTEMAC2(ch,opname,EX_SUF) \ PASTEMAC2(ch,opname,_unb_var1) \ ( \ n, \ - x, incx, \ - scale, \ - sumsq, \ - cntx, \ + ( ctype* )x, incx, \ + scale, \ + sumsq, \ + ( cntx_t* )cntx, \ rntm \ ); \ } @@ -383,10 +383,10 @@ INSERT_GENTFUNCR_BASIC0( sumsqv ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ + conj_t conjchi, \ const ctype* chi, \ const ctype* psi, \ - bool* is_eq \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -406,11 +406,11 @@ INSERT_GENTFUNC_BASIC0( eqsc ) \ void PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ - bool* is_eq \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -425,8 +425,8 @@ void PASTEMAC(ch,opname) \ ( \ conjx, \ n, \ - x, incx, \ - y, incy \ + ( ctype* )x, incx, \ + ( ctype* )y, incy \ ); \ } @@ -438,15 +438,15 @@ INSERT_GENTFUNC_BASIC0( eqv ) \ void PASTEMAC(ch,opname) \ ( \ - doff_t diagoffx, \ - diag_t 
diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - const ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ ) \ { \ bli_init_once(); \ @@ -467,8 +467,8 @@ void PASTEMAC(ch,opname) \ transx, \ m, \ n, \ - x, rs_x, cs_x, \ - y, rs_y, cs_y \ + ( ctype* )x, rs_x, cs_x, \ + ( ctype* )y, rs_y, cs_y \ ); \ } @@ -481,7 +481,7 @@ INSERT_GENTFUNC_BASIC0( eqm ) void PASTEMAC(ch,opname) \ ( \ const char* s1, \ - dim_t n, \ + dim_t n, \ const void* x, inc_t incx, \ const char* format, \ const char* s2 \ @@ -509,8 +509,8 @@ INSERT_GENTFUNC_BASIC_I( printv, fprintv ) void PASTEMAC(ch,opname) \ ( \ const char* s1, \ - dim_t m, \ - dim_t n, \ + dim_t m, \ + dim_t n, \ const void* x, inc_t rs_x, inc_t cs_x, \ const char* format, \ const char* s2 \ diff --git a/frame/util/bli_util_tapi.h b/frame/util/bli_util_tapi.h index 652e3735b0..d4590614d7 100644 --- a/frame/util/bli_util_tapi.h +++ b/frame/util/bli_util_tapi.h @@ -42,9 +42,9 @@ \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* asum \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* asum \ BLIS_TAPI_EX_PARAMS \ ); @@ -72,9 +72,9 @@ INSERT_GENTPROT_BASIC0( mktrim ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* norm \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -88,13 +88,13 @@ INSERT_GENTPROTR_BASIC0( normiv ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, 
\ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm \ BLIS_TAPI_EX_PARAMS \ ); @@ -139,10 +139,10 @@ INSERT_GENTPROT_BASIC0( randnm ) \ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,EX_SUF) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq \ + dim_t n, \ + const ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq \ BLIS_TAPI_EX_PARAMS \ ); @@ -159,10 +159,10 @@ INSERT_GENTPROTR_BASIC0( sumsqv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjchi, \ + conj_t conjchi, \ const ctype* chi, \ const ctype* psi, \ - bool* is_eq \ + bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqsc ) @@ -173,11 +173,11 @@ INSERT_GENTPROT_BASIC0( eqsc ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ + conj_t conjx, \ + dim_t n, \ const ctype* x, inc_t incx, \ const ctype* y, inc_t incy, \ - bool* is_eq \ + bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqv ) @@ -188,15 +188,15 @@ INSERT_GENTPROT_BASIC0( eqv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - const ctype* y, inc_t rs_y, inc_t cs_y, \ - bool* is_eq \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + const ctype* x, inc_t rs_x, inc_t cs_x, \ + const ctype* y, inc_t rs_y, inc_t cs_y, \ + bool* is_eq \ ); INSERT_GENTPROT_BASIC0( eqm ) @@ -208,7 +208,7 @@ INSERT_GENTPROT_BASIC0( eqm ) BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ const char* s1, \ - dim_t n, \ + dim_t n, \ const void* x, inc_t incx, \ const char* format, \ const char* s2 \ @@ -223,8 +223,8 @@ INSERT_GENTPROT_BASIC0_I( printv ) BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ const char* s1, \ - dim_t m, \ - dim_t n, \ + dim_t m, \ + dim_t n, \ const void* x, inc_t rs_x, inc_t cs_x, \ const char* format, \ const char* s2 \ diff --git 
a/frame/util/bli_util_unb_var1.c b/frame/util/bli_util_unb_var1.c index 18506d4cc6..2b65c8460f 100644 --- a/frame/util/bli_util_unb_var1.c +++ b/frame/util/bli_util_unb_var1.c @@ -45,18 +45,18 @@ \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* asum, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype_r* asum, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ - const ctype* chi1; \ - ctype_r chi1_r; \ - ctype_r chi1_i; \ - ctype_r absum; \ - dim_t i; \ + ctype* chi1; \ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r absum; \ + dim_t i; \ \ /* Initialize the absolute sum accumulator to zero. */ \ PASTEMAC(chr,set0s)( absum ); \ @@ -89,11 +89,11 @@ INSERT_GENTFUNCR_BASIC0( asumv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ ctype_r* zeror = PASTEMAC(chr,0); \ @@ -145,11 +145,11 @@ INSERT_GENTFUNCR_BASIC0( mkherm_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ doff_t diagoffa; \ @@ -187,11 +187,11 @@ INSERT_GENTFUNC_BASIC0( mksymm_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - uplo_t uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ ctype* zero = PASTEMAC(ch,0); \ @@ -232,17 +232,17 @@ INSERT_GENTFUNC_BASIC0( mktrim_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t 
n, \ + ctype* x, inc_t incx, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ - const ctype* chi1; \ - ctype_r abs_chi1; \ - ctype_r absum; \ - dim_t i; \ + ctype* chi1; \ + ctype_r abs_chi1; \ + ctype_r absum; \ + dim_t i; \ \ /* Initialize the absolute sum accumulator to zero. */ \ PASTEMAC(chr,set0s)( absum ); \ @@ -270,11 +270,11 @@ INSERT_GENTFUNCR_BASIC0( norm1v_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ ctype_r* zero = PASTEMAC(chr,0); \ @@ -402,11 +402,11 @@ void PASTEMAC(ch,varname) \ \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ ctype_r* zero = PASTEMAC(chr,0); \ @@ -448,17 +448,17 @@ GENTFUNCR( double, double, d, d, normfv_unb_var1, sumsqv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ - const ctype* chi1; \ - ctype_r abs_chi1; \ - ctype_r abs_chi1_max; \ - dim_t i; \ + ctype* chi1; \ + ctype_r abs_chi1; \ + ctype_r abs_chi1_max; \ + dim_t i; \ \ /* Initialize the maximum absolute value to zero. 
*/ \ PASTEMAC(chr,set0s)( abs_chi1_max ); \ @@ -494,30 +494,30 @@ INSERT_GENTFUNCR_BASIC0( normiv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ - const ctype* one = PASTEMAC(ch,1); \ - const ctype* x0; \ - const ctype* chi1; \ - const ctype* x2; \ - ctype_r absum_max; \ - ctype_r absum_j; \ - ctype_r abval_chi1; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + ctype* one = PASTEMAC(ch,1); \ + ctype* x0; \ + ctype* chi1; \ + ctype* x2; \ + ctype_r absum_max; \ + ctype_r absum_j; \ + ctype_r abval_chi1; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ \ /* Initialize the maximum absolute column sum to zero. 
*/ \ PASTEMAC(chr,set0s)( absum_max ); \ @@ -658,32 +658,32 @@ INSERT_GENTFUNCR_BASIC( norm1m_unb_var1, norm1v_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ - const ctype* one = PASTEMAC(ch,1); \ - const ctype_r* one_r = PASTEMAC(chr,1); \ - const ctype_r* zero_r = PASTEMAC(chr,0); \ - const ctype* x0; \ - const ctype* chi1; \ - const ctype* x2; \ - ctype_r scale; \ - ctype_r sumsq; \ - ctype_r sqrt_sumsq; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + ctype* one = PASTEMAC(ch,1); \ + ctype_r* one_r = PASTEMAC(chr,1); \ + ctype_r* zero_r = PASTEMAC(chr,0); \ + ctype* x0; \ + ctype* chi1; \ + ctype* x2; \ + ctype_r scale; \ + ctype_r sumsq; \ + ctype_r sqrt_sumsq; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ \ /* Return a norm of zero if either dimension is zero. */ \ if ( bli_zero_dim2( m, n ) ) \ @@ -825,15 +825,15 @@ INSERT_GENTFUNCR_BASIC( normfm_unb_var1, sumsqv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ /* Induce a transposition so that rows become columns. 
*/ \ @@ -867,10 +867,10 @@ INSERT_GENTFUNCR_BASIC( normim_unb_var1, norm1m_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ ctype* chi1; \ @@ -895,29 +895,29 @@ INSERT_GENTFUNC_BASIC( randnv_unb_var1, randnp2s ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + doff_t diagoffx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ - const ctype* one = PASTEMAC(ch,1); \ - ctype* x0; \ - ctype* x1; \ - ctype* x2; \ - ctype* chi1; \ - ctype beta; \ - ctype omega; \ - double max_m_n; \ - uplo_t uplox_eff; \ - dim_t n_iter; \ - dim_t n_elem, n_elem_max; \ - inc_t ldx, incx; \ - dim_t j, i; \ - dim_t ij0, n_shift; \ + ctype* one = PASTEMAC(ch,1); \ + ctype* x0; \ + ctype* x1; \ + ctype* x2; \ + ctype* chi1; \ + ctype beta; \ + ctype omega; \ + double max_m_n; \ + uplo_t uplox_eff; \ + dim_t n_iter; \ + dim_t n_elem, n_elem_max; \ + inc_t ldx, incx; \ + dim_t j, i; \ + dim_t ij0, n_shift; \ \ /* Set various loop parameters. Here, we pretend that diagx is equal to BLIS_NONUNIT_DIAG because we handle the unit diagonal case manually. 
*/ \ @@ -1051,24 +1051,24 @@ INSERT_GENTFUNC_BASIC( randnm_unb_var1, randnv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* scale, \ - ctype_r* sumsq, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq, \ + cntx_t* cntx, \ + rntm_t* rntm \ ) \ { \ - const ctype_r zero_r = *PASTEMAC(chr,0); \ - const ctype_r one_r = *PASTEMAC(chr,1); \ + ctype_r zero_r = *PASTEMAC(chr,0); \ + ctype_r one_r = *PASTEMAC(chr,1); \ \ - const ctype* chi1; \ - ctype_r chi1_r; \ - ctype_r chi1_i; \ - ctype_r scale_r; \ - ctype_r sumsq_r; \ - ctype_r abs_chi1_r; \ - dim_t i; \ + ctype* chi1; \ + ctype_r chi1_r; \ + ctype_r chi1_i; \ + ctype_r scale_r; \ + ctype_r sumsq_r; \ + ctype_r abs_chi1_r; \ + dim_t i; \ \ /* NOTE: This function attempts to mimic the algorithm for computing the Frobenius norm in netlib LAPACK's ?lassq(). */ \ @@ -1143,16 +1143,16 @@ INSERT_GENTFUNCR_BASIC0( sumsqv_unb_var1 ) \ bool PASTEMAC(ch,opname) \ ( \ - conj_t conjx, \ - dim_t n, \ - const ctype* x, inc_t incx, \ - const ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ ) \ { \ for ( dim_t i = 0; i < n; ++i ) \ { \ - const ctype* chi1 = x + (i )*incx; \ - const ctype* psi1 = y + (i )*incy; \ + ctype* chi1 = x + (i )*incx; \ + ctype* psi1 = y + (i )*incy; \ \ ctype chi1c; \ \ @@ -1174,14 +1174,14 @@ INSERT_GENTFUNC_BASIC0( eqv_unb_var1 ) \ bool PASTEMAC(ch,opname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - const ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ ) \ { \ uplo_t uplox_eff; \ @@ -1219,14 +1219,14 @@ bool PASTEMAC(ch,opname) \ { \ const dim_t n_elem = 
n_elem_max; \ \ - const ctype* x1 = x + (j )*ldx + (0 )*incx; \ - const ctype* y1 = y + (j )*ldy + (0 )*incy; \ + ctype* x1 = x + (j )*ldx + (0 )*incx; \ + ctype* y1 = y + (j )*ldy + (0 )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ - const ctype* x11 = x1 + (i )*incx; \ - const ctype* y11 = y1 + (i )*incy; \ - ctype x11c; \ + ctype* x11 = x1 + (i )*incx; \ + ctype* y11 = y1 + (i )*incy; \ + ctype x11c; \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ @@ -1244,14 +1244,14 @@ bool PASTEMAC(ch,opname) \ { \ const dim_t n_elem = bli_min( n_shift + j + 1, n_elem_max ); \ \ - const ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ - const ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ + ctype* x1 = x + (ij0+j )*ldx + (0 )*incx; \ + ctype* y1 = y + (ij0+j )*ldy + (0 )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ - const ctype* x11 = x1 + (i )*incx; \ - const ctype* y11 = y1 + (i )*incy; \ - ctype x11c; \ + ctype* x11 = x1 + (i )*incx; \ + ctype* y11 = y1 + (i )*incy; \ + ctype x11c; \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ @@ -1268,14 +1268,14 @@ bool PASTEMAC(ch,opname) \ const dim_t offi = bli_max( 0, ( doff_t )j - ( doff_t )n_shift ); \ const dim_t n_elem = n_elem_max - offi; \ \ - const ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ - const ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ + ctype* x1 = x + (j )*ldx + (ij0+offi )*incx; \ + ctype* y1 = y + (j )*ldy + (ij0+offi )*incy; \ \ for ( dim_t i = 0; i < n_elem; ++i ) \ { \ - const ctype* x11 = x1 + (i )*incx; \ - const ctype* y11 = y1 + (i )*incy; \ - ctype x11c; \ + ctype* x11 = x1 + (i )*incx; \ + ctype* y11 = y1 + (i )*incy; \ + ctype x11c; \ \ if ( bli_is_conj( conjx ) ) { PASTEMAC(ch,copyjs)( *x11, x11c ); } \ else { PASTEMAC(ch,copys)( *x11, x11c ); } \ @@ -1298,25 +1298,23 @@ INSERT_GENTFUNC_BASIC0( eqm_unb_var1 ) \ void PASTEMAC(ch,opname) \ ( \ - FILE* 
file, \ + FILE* file, \ const char* s1, \ - dim_t n, \ + dim_t n, \ const ctype* x, inc_t incx, \ const char* format, \ const char* s2 \ ) \ { \ - dim_t i; \ - const ctype* chi1; \ - char default_spec[32] = PASTEMAC(ch,formatspec)(); \ + const char default_spec[32] = PASTEMAC(ch,formatspec)(); \ \ if ( format == NULL ) format = default_spec; \ \ - chi1 = x; \ + const ctype*chi1 = x; \ \ fprintf( file, "%s\n", s1 ); \ \ - for ( i = 0; i < n; ++i ) \ + for ( dim_t i = 0; i < n; ++i ) \ { \ PASTEMAC(ch,fprints)( file, format, *chi1 ); \ fprintf( file, "\n" ); \ @@ -1335,28 +1333,26 @@ INSERT_GENTFUNC_BASIC0_I( fprintv ) \ void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ + FILE* file, \ const char* s1, \ - dim_t m, \ - dim_t n, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const char* format, \ const char* s2 \ ) \ { \ - dim_t i, j; \ - const ctype* chi1; \ - char default_spec[32] = PASTEMAC(ch,formatspec)(); \ + const char default_spec[32] = PASTEMAC(ch,formatspec)(); \ \ if ( format == NULL ) format = default_spec; \ \ fprintf( file, "%s\n", s1 ); \ \ - for ( i = 0; i < m; ++i ) \ + for ( dim_t i = 0; i < m; ++i ) \ { \ - for ( j = 0; j < n; ++j ) \ + for ( dim_t j = 0; j < n; ++j ) \ { \ - chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \ + const ctype* chi1 = (( ctype* ) x) + i*rs_x + j*cs_x; \ \ PASTEMAC(ch,fprints)( file, format, *chi1 ); \ fprintf( file, " " ); \ diff --git a/frame/util/bli_util_unb_var1.h b/frame/util/bli_util_unb_var1.h index 12a5b7de8a..435efa4ac7 100644 --- a/frame/util/bli_util_unb_var1.h +++ b/frame/util/bli_util_unb_var1.h @@ -42,11 +42,11 @@ \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* asum, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype_r* asum, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) @@ -57,11 +57,11 @@ INSERT_GENTPROTR_BASIC0( asumv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - uplo_t 
uploa, \ - dim_t m, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + uplo_t uploa, \ + dim_t m, \ + ctype* a, inc_t rs_a, inc_t cs_a, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( mkherm_unb_var1 ) @@ -74,11 +74,11 @@ INSERT_GENTPROT_BASIC0( mktrim_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1v_unb_var1 ) @@ -91,15 +91,15 @@ INSERT_GENTPROTR_BASIC0( normiv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype_r* norm, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype_r* norm, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( norm1m_unb_var1 ) @@ -112,10 +112,10 @@ INSERT_GENTPROTR_BASIC0( normim_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - ctype* x, inc_t incx, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randv_unb_var1 ) @@ -127,13 +127,13 @@ INSERT_GENTPROT_BASIC0( randnv_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - uplo_t uplox, \ - dim_t m, \ - dim_t n, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + doff_t diagoffx, \ + uplo_t uplox, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( randm_unb_var1 ) @@ -145,12 +145,12 @@ INSERT_GENTPROT_BASIC0( randnm_unb_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - dim_t n, \ - const ctype* x, inc_t incx, \ - ctype_r* 
scale, \ - ctype_r* sumsq, \ - const cntx_t* cntx, \ - const rntm_t* rntm \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype_r* scale, \ + ctype_r* sumsq, \ + cntx_t* cntx, \ + rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) @@ -162,10 +162,10 @@ INSERT_GENTPROTR_BASIC0( sumsqv_unb_var1 ) \ bool PASTEMAC(ch,varname) \ ( \ - conj_t conjx, \ - dim_t n, \ - const ctype* x, inc_t incx, \ - const ctype* y, inc_t incy \ + conj_t conjx, \ + dim_t n, \ + ctype* x, inc_t incx, \ + ctype* y, inc_t incy \ ); INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) @@ -176,14 +176,14 @@ INSERT_GENTPROT_BASIC0( eqv_unb_var1 ) \ bool PASTEMAC(ch,varname) \ ( \ - doff_t diagoffx, \ - diag_t diagx, \ - uplo_t uplox, \ - trans_t transx, \ - dim_t m, \ - dim_t n, \ - const ctype* x, inc_t rs_x, inc_t cs_x, \ - const ctype* y, inc_t rs_y, inc_t cs_y \ + doff_t diagoffx, \ + diag_t diagx, \ + uplo_t uplox, \ + trans_t transx, \ + dim_t m, \ + dim_t n, \ + ctype* x, inc_t rs_x, inc_t cs_x, \ + ctype* y, inc_t rs_y, inc_t cs_y \ ); INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) @@ -194,9 +194,9 @@ INSERT_GENTPROT_BASIC0( eqm_unb_var1 ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ + FILE* file, \ const char* s1, \ - dim_t n, \ + dim_t n, \ const ctype* x, inc_t incx, \ const char* format, \ const char* s2 \ @@ -210,10 +210,10 @@ INSERT_GENTPROT_BASIC0_I( fprintv ) \ BLIS_EXPORT_BLIS void PASTEMAC(ch,opname) \ ( \ - FILE* file, \ + FILE* file, \ const char* s1, \ - dim_t m, \ - dim_t n, \ + dim_t m, \ + dim_t n, \ const ctype* x, inc_t rs_x, inc_t cs_x, \ const char* format, \ const char* s2 \ diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c index 85dfaa9c0e..1d9a0e47c6 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve256_int_8xk.c @@ -52,7 +52,7 @@ void bli_dpackm_armsve256_int_8xk double* restrict kappa, double* restrict a, inc_t inca_, inc_t lda_, double* restrict 
p, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int64_t cdim = cdim_; diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c index a086b3a76e..5866ed26f4 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c @@ -51,7 +51,7 @@ void bli_dpackm_armsve512_asm_10xk double* restrict kappa, double* restrict a, inc_t inca_, inc_t lda_, double* restrict p, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int64_t cdim = cdim_; diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c index aeb323c0ca..88ccb4b8ed 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c @@ -50,7 +50,7 @@ void bli_dpackm_armsve512_asm_16xk double* restrict kappa, double* restrict a, inc_t inca_, inc_t lda_, double* restrict p, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int64_t cdim = cdim_; diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index 098d5d4b5e..9bc7fd9492 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -53,8 +53,8 @@ void bli_cgemm_armsve_asm_2vx10_unindexed scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 0ee470f240..1c9d68dec3 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -53,8 +53,8 @@ void bli_dgemm_armsve_asm_2vx10_unindexed double* 
restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index d03af59230..05005f8c3e 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -53,8 +53,8 @@ void bli_sgemm_armsve_asm_2vx10_unindexed float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 8636a527ba..210d40f0b7 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -53,8 +53,8 @@ void bli_zgemm_armsve_asm_2vx10_unindexed dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c index c248285c38..4dec190e03 100644 --- a/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c +++ b/kernels/armv7a/3/bli_gemm_armv7a_asm_d4x4.c @@ -43,7 +43,7 @@ void bli_sgemm_armv7a_ker_4x4 float* restrict b, float* restrict beta, float* restrict c, uint32_t rs_c, uint32_t cs_c, - auxinfo_t* restrict data + auxinfo_t* data ); void bli_sgemm_armv7a_asm_4x4 @@ -56,8 +56,8 @@ void bli_sgemm_armv7a_asm_4x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + 
auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -78,7 +78,7 @@ void bli_dgemm_armv7a_ker_4x4 double* restrict b, double* restrict beta, double* restrict c, uint32_t rs_c, uint32_t cs_c, - auxinfo_t* restrict data + auxinfo_t* data ); void bli_dgemm_armv7a_asm_4x4 @@ -91,8 +91,8 @@ void bli_dgemm_armv7a_asm_4x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -113,7 +113,7 @@ void bli_cgemm_armv7a_ker_2x2 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, uint32_t rs_c, uint32_t cs_c, - auxinfo_t* restrict data + auxinfo_t* data ); void bli_cgemm_armv7a_asm_2x2 @@ -126,8 +126,8 @@ void bli_cgemm_armv7a_asm_2x2 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -148,7 +148,7 @@ void bli_zgemm_armv7a_ker_2x2 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, uint32_t rs_c, uint32_t cs_c, - auxinfo_t* restrict data + auxinfo_t* data ); void bli_zgemm_armv7a_asm_2x2 @@ -161,8 +161,8 @@ void bli_zgemm_armv7a_asm_2x2 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a diff --git a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c index 06f36a3463..b1e9481a31 100644 --- a/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c +++ b/kernels/armv7a/3/bli_gemm_armv7a_int_d4x4.c @@ -45,8 +45,8 @@ void bli_sgemm_armv7a_int_4x4 float* restrict b, float* restrict beta, 
float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -251,8 +251,8 @@ void bli_dgemm_armv7a_int_4x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c index 301b8ad790..3eefd9ddc7 100644 --- a/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c +++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d6xk.c @@ -57,7 +57,7 @@ void bli_dpackm_armv8a_int_6xk double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { // This is the panel dimension assumed by the packm kernel. @@ -296,7 +296,7 @@ void bli_dpackm_armv8a_int_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -316,7 +316,7 @@ void bli_dpackm_armv8a_int_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c index 321fa5403b..51b064a24a 100644 --- a/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c +++ b/kernels/armv8a/1m/bli_packm_armv8a_int_d8xk.c @@ -57,7 +57,7 @@ void bli_dpackm_armv8a_int_8xk double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { // This is the panel dimension assumed by the packm kernel. 
@@ -326,7 +326,7 @@ void bli_dpackm_armv8a_int_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -346,7 +346,7 @@ void bli_dpackm_armv8a_int_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c index 3718772473..f915215e1b 100644 --- a/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c +++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s12xk.c @@ -57,7 +57,7 @@ void bli_spackm_armv8a_int_12xk float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { // This is the panel dimension assumed by the packm kernel. @@ -410,7 +410,7 @@ void bli_spackm_armv8a_int_12xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -428,7 +428,7 @@ void bli_spackm_armv8a_int_12xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c index 3d363c2d8d..b508b2a0e0 100644 --- a/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c +++ b/kernels/armv8a/1m/bli_packm_armv8a_int_s8xk.c @@ -57,7 +57,7 @@ void bli_spackm_armv8a_int_8xk float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { // This is the panel dimension assumed by the packm kernel. 
@@ -348,7 +348,7 @@ void bli_spackm_armv8a_int_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -366,7 +366,7 @@ void bli_spackm_armv8a_int_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 4d9a888178..94f0090bc4 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -61,8 +61,8 @@ void bli_sgemm_armv8a_asm_8x12 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -740,8 +740,8 @@ void bli_dgemm_armv8a_asm_6x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -1462,8 +1462,8 @@ void bli_cgemm_armv8a_opt_4x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -1478,8 +1478,8 @@ void bli_zgemm_armv8a_opt_4x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } diff --git a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c index c87ff1feb6..44e0ac419f 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_armv8a_ref.c @@ -57,8 +57,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ /* NOTE: 
This microkernel can actually handle arbitrarily large @@ -262,8 +262,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c index 630459db73..cade3ee052 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8m.c @@ -121,8 +121,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( n0 != 8 ) diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c index e13dd668ea..06c9ac32c6 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rd_armv8a_asm_d6x8n.c @@ -114,8 +114,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( m0 != 6 ) diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c index 16001a73ce..312eb44540 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8m.c @@ -98,7 +98,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) /* * 4x8 dgemmsup kernel with extending 1st dimension. 
* - * Recommanded usage case: + * Recommanded usage case: * o 16 < (L1 cache latency) * (Num. FPU) < 25. * o L1 cache has a bandwidth not too low (true in most cases). * o (FMLA latency) * (Num. FPU) < 32 cycles (true in almost all cases). @@ -115,8 +115,8 @@ void bli_dgemmsup_rv_armv8a_asm_4x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Fixme: This uker has no dispatching for unalighed sizes. diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c index 43913cd38d..bc7402a5fe 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d4x8n.c @@ -98,7 +98,7 @@ GEMMSUP_KER_PROT( double, d, gemmsup_r_armv8a_ref2 ) /* * 4x8 dgemmsup kernel with extending 2nd dimension. * - * Recommanded usage case: + * Recommanded usage case: * o 16 < (L1 cache latency) * (Num. FPU) < 25. * o L1 cache has a bandwidth not too low (true in most cases). * o (FMLA latency) * (Num. FPU) < 32 cycles (true in almost all cases). @@ -115,8 +115,8 @@ void bli_dgemmsup_rv_armv8a_asm_4x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Fixme: This uker has no dispatching for unalighed sizes. 
diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c index 3100112d3f..8ff5ec1732 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8m.c @@ -140,8 +140,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( n0 != 8 ) diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c index fb9357c11e..9bdf4b3b82 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d6x8n.c @@ -140,8 +140,8 @@ void bli_dgemmsup_rv_armv8a_asm_6x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( m0 != 6 ) diff --git a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c index 5b0e9b062f..4d374df98b 100644 --- a/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c +++ b/kernels/armv8a/3/sup/bli_gemmsup_rv_armv8a_asm_d8x4m.c @@ -111,8 +111,8 @@ void bli_dgemmsup_rv_armv8a_asm_8x4m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Fixme: This uker has no dispatching for unalighed sizes. 
diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c index 84c7c4a7d2..aa53de55c8 100644 --- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c +++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d3x4.c @@ -94,8 +94,8 @@ void bli_dgemmsup_rd_armv8a_asm_3x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { assert( m0 == 3 ); diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c index abbb6fb4d9..b10546764a 100644 --- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c +++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_asm_d6x3.c @@ -118,8 +118,8 @@ void bli_dgemmsup_rd_armv8a_asm_6x3 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { assert( m0 == 6 ); diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c index 43880063eb..5438fdfc2a 100644 --- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c +++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d2x8.c @@ -69,8 +69,8 @@ void bli_dgemmsup_rd_armv8a_int_2x8 double* restrict b, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { assert( m0 <= 2 ); @@ -114,10 +114,10 @@ void bli_dgemmsup_rd_armv8a_int_2x8 PRAGMA_UNROLL for ( ; k_mker > 0; --k_mker ) { - // if ( m0 > 0 ) + // if ( m0 > 0 ) va_0 = vld1q_f64( a_loc + rs_a * 0 ); if ( m0 > 1 ) va_1 = vld1q_f64( a_loc + rs_a * 1 ); - // if ( n0 > 0 ) + // if ( n0 > 0 ) 
vb_0 = vld1q_f64( b_loc + cs_b * 0 ); if ( n0 > 1 ) vb_1 = vld1q_f64( b_loc + cs_b * 1 ); if ( n0 > 2 ) vb_2 = vld1q_f64( b_loc + cs_b * 2 ); @@ -174,10 +174,10 @@ void bli_dgemmsup_rd_armv8a_int_2x8 PRAGMA_NOUNROLL for ( ; k_left > 0; --k_left ) { - // if ( m0 > 0 ) + // if ( m0 > 0 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 ); if ( m0 > 1 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 1, va_1, 0 ); - // if ( n0 > 0 ) + // if ( n0 > 0 ) vb_0 = vld1q_lane_f64( b_loc + cs_b * 0, vb_0, 0 ); if ( n0 > 1 ) vb_1 = vld1q_lane_f64( b_loc + cs_b * 1, vb_1, 0 ); if ( n0 > 2 ) vb_2 = vld1q_lane_f64( b_loc + cs_b * 2, vb_2, 0 ); diff --git a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c index 73e5f20fb7..89817d6d55 100644 --- a/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c +++ b/kernels/armv8a/3/sup/d3x4/bli_gemmsup_rd_armv8a_int_d3x4.c @@ -69,8 +69,8 @@ void bli_dgemmsup_rd_armv8a_int_3x4 double* restrict b, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // if ( m0 == 3 && n0 == 4 ) diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c index 16af42ade6..931f3ed66b 100644 --- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c +++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d3x8mn.c @@ -69,8 +69,8 @@ void bli_dgemmsup_rv_armv8a_int_3x8mn double* restrict b0, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c0, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Unlike the rd case, this rv case does not impose restriction upon diff --git a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c index 
8bbd87f1f6..f850b0fa67 100644 --- a/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c +++ b/kernels/armv8a/3/sup/d6x4/bli_gemmsup_rv_armv8a_int_d6x4mn.c @@ -69,8 +69,8 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn double* restrict b0, inc_t rs_b, inc_t cs_b, double* restrict beta, double* restrict c0, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Unlike the rd case, this rv case does not impose restriction upon @@ -123,7 +123,7 @@ void bli_dgemmsup_rv_armv8a_int_6x4mn for ( ; k > 0; --k ) { // A columns. - // if ( m0 > 0 ) + // if ( m0 > 0 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 0, va_0, 0 ); if ( m0 > 1 ) va_0 = vld1q_lane_f64( a_loc + rs_a * 1, va_0, 1 ); if ( m0 > 2 ) va_1 = vld1q_lane_f64( a_loc + rs_a * 2, va_1, 0 ); diff --git a/kernels/bgq/1/bli_axpyv_bgq_int.c b/kernels/bgq/1/bli_axpyv_bgq_int.c index 0c4a8cbd3c..1d233f5c17 100644 --- a/kernels/bgq/1/bli_axpyv_bgq_int.c +++ b/kernels/bgq/1/bli_axpyv_bgq_int.c @@ -34,14 +34,14 @@ #include "blis.h" -void bli_daxpyv_bgq_int - ( +void bli_daxpyv_bgq_int + ( conj_t conjx, dim_t n, double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { if ( bli_zero_dim1( n ) ) return; @@ -70,7 +70,7 @@ void bli_daxpyv_bgq_int xv = vec_lda( 0 * sizeof(double), &x[i*4] ); yv = vec_lda( 0 * sizeof(double), &y[i*4] ); zv = vec_madd( alphav, xv, yv ); - vec_sta( zv, 0 * sizeof(double), &y[i*4] ); + vec_sta( zv, 0 * sizeof(double), &y[i*4] ); } for ( dim_t i = 0; i < n_left; i++ ) { diff --git a/kernels/bgq/1/bli_dotv_bgq_int.c b/kernels/bgq/1/bli_dotv_bgq_int.c index 73e53c23a1..eb6805a4c2 100644 --- a/kernels/bgq/1/bli_dotv_bgq_int.c +++ b/kernels/bgq/1/bli_dotv_bgq_int.c @@ -42,7 +42,7 @@ void bli_ddotv_bgq_int double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { bool use_ref = FALSE; @@ -91,7 +91,7 @@ void 
bli_ddotv_bgq_int { rhos += x[4*n_run + i] * y[4*n_run + i]; } - + *rho = rhos; } diff --git a/kernels/bgq/1f/bli_axpyf_bgq_int.c b/kernels/bgq/1f/bli_axpyf_bgq_int.c index 4e296e0a25..cf0fe633cd 100644 --- a/kernels/bgq/1f/bli_axpyf_bgq_int.c +++ b/kernels/bgq/1f/bli_axpyf_bgq_int.c @@ -45,7 +45,7 @@ void bli_daxpyf_bgq_int double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fusefac = 8; @@ -60,7 +60,7 @@ void bli_daxpyf_bgq_int use_ref = TRUE; // Call the reference implementation if needed. if ( use_ref == TRUE ) - { + { // printf("%d\t%d\t%d\t%d\t%d\t%d\n", fusefac, inca, incx, incy, bli_is_unaligned_to( ( siz_t )a, 32 ), bli_is_unaligned_to( ( siz_t )y, 32)); // printf("DEFAULTING TO REFERENCE IMPLEMENTATION\n"); BLIS_DAXPYF_KERNEL_REF( conja, conjx, m, b_n, alpha, a, inca, lda, x, incx, y, incy, cntx ); @@ -134,7 +134,7 @@ void bli_daxpyf_bgq_int vec_sta( yv, 0 * sizeof(double), &y0[i*4]); } - + for ( dim_t i = 0; i < m_left; ++i ) { y0[4*m_run + i] += chi0 * a0[4*m_run + i] diff --git a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c index 15e3e072f3..2adbc4c36c 100644 --- a/kernels/bgq/3/bli_gemm_bgq_int_8x8.c +++ b/kernels/bgq/3/bli_gemm_bgq_int_8x8.c @@ -64,8 +64,8 @@ void bli_dgemm_bgq_int_8x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { GEMM_UKR_SETUP_CT_ANY( d, 8, 8, false ); @@ -228,8 +228,8 @@ void bli_zgemm_bgq_int_4x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { GEMM_UKR_SETUP_CT_ANY( z, 4, 4, false ); diff --git a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c index 
3a75d61d73..bef7232dd2 100644 --- a/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c +++ b/kernels/bulldozer/3/bli_gemm_bulldozer_asm_d4x6_fma4.c @@ -98,8 +98,8 @@ void bli_sgemm_bulldozer_asm_8x8_fma4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -590,8 +590,8 @@ void bli_dgemm_bulldozer_asm_4x6_fma4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -810,8 +810,8 @@ void bli_cgemm_bulldozer_asm_8x4_fma4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1334,8 +1334,8 @@ void bli_zgemm_bulldozer_asm_4x4_fma4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c index 843335ad5d..e5d077409f 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_c3xk.c @@ -51,7 +51,7 @@ void bli_cpackm_haswell_asm_3xk scomplex* restrict kappa, scomplex* restrict a, inc_t inca0, inc_t lda0, scomplex* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c index 862a33b86a..fa8fabe9d5 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c +++ 
b/kernels/haswell/1m/bli_packm_haswell_asm_c8xk.c @@ -51,7 +51,7 @@ void bli_cpackm_haswell_asm_8xk scomplex* restrict kappa, scomplex* restrict a, inc_t inca0, inc_t lda0, scomplex* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c index b64f26591d..47fc5b98d1 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_d6xk.c @@ -51,7 +51,7 @@ void bli_dpackm_haswell_asm_6xk double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 @@ -107,7 +107,7 @@ void bli_dpackm_haswell_asm_6xk if ( cdim0 == mnr && !gs && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -121,13 +121,13 @@ void bli_dpackm_haswell_asm_6xk mov(var(one), rdx) // load address of 1.0 constant vmovsd(mem(rdx), xmm1) // load 1.0 - + mov(var(kappa), rcx) // load address of kappa vmovsd(mem(rcx), xmm0) // load kappa - + // now branch on kappa == 1.0 - + vucomisd(xmm0, xmm1) // set ZF if kappa == 1.0 je(.DKAPPAUNIT) // if ZF = 1, jump to beta == 0 case @@ -137,7 +137,7 @@ void bli_dpackm_haswell_asm_6xk cmp(imm(8), r8) // set ZF if (8*inca) == 8. jz(.DCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.DROWNONU) @@ -150,7 +150,7 @@ void bli_dpackm_haswell_asm_6xk label(.DCOLNONU) jmp(.DDONE) // jump to end. 
- + @@ -161,7 +161,7 @@ void bli_dpackm_haswell_asm_6xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.DROWUNIT) lea(mem(r8, r8, 2), r12) // r12 = 3*inca @@ -255,7 +255,7 @@ void bli_dpackm_haswell_asm_6xk // -- kappa unit, column storage on A -------------------------------------- label(.DCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; @@ -319,8 +319,8 @@ void bli_dpackm_haswell_asm_6xk label(.DDONE) - - + + end_asm( : // output operands (none) @@ -374,7 +374,7 @@ void bli_dpackm_haswell_asm_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -394,7 +394,7 @@ void bli_dpackm_haswell_asm_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c index 9deb564ce4..9f07e37a43 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_d8xk.c @@ -51,7 +51,7 @@ void bli_dpackm_haswell_asm_8xk double* restrict kappa, double* restrict a, inc_t inca0, inc_t lda0, double* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 @@ -107,7 +107,7 @@ void bli_dpackm_haswell_asm_8xk if ( cdim0 == mnr && !gs && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -121,13 +121,13 @@ void bli_dpackm_haswell_asm_8xk mov(var(one), rdx) // load address of 1.0 constant vmovsd(mem(rdx), xmm1) // load 1.0 - + mov(var(kappa), rcx) // load address of kappa vmovsd(mem(rcx), xmm0) // load kappa - + // now branch on kappa == 1.0 - + vucomisd(xmm0, xmm1) // set ZF if kappa == 1.0 je(.DKAPPAUNIT) // if ZF = 1, jump to beta == 0 case @@ -137,7 +137,7 @@ void bli_dpackm_haswell_asm_8xk cmp(imm(8), r8) // set ZF if (8*inca) == 8. 
jz(.DCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.DROWNONU) @@ -150,7 +150,7 @@ void bli_dpackm_haswell_asm_8xk label(.DCOLNONU) jmp(.DDONE) // jump to end. - + @@ -161,7 +161,7 @@ void bli_dpackm_haswell_asm_8xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.DROWUNIT) lea(mem(r8, r8, 2), r12) // r12 = 3*inca @@ -265,7 +265,7 @@ void bli_dpackm_haswell_asm_8xk // -- kappa unit, column storage on A -------------------------------------- label(.DCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda mov(var(k_iter), rsi) // i = k_iter; @@ -329,8 +329,8 @@ void bli_dpackm_haswell_asm_8xk label(.DDONE) - - + + end_asm( : // output operands (none) @@ -384,7 +384,7 @@ void bli_dpackm_haswell_asm_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -402,7 +402,7 @@ void bli_dpackm_haswell_asm_8xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c index 40ac22bc55..27b2c71eed 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_s16xk.c @@ -51,7 +51,7 @@ void bli_spackm_haswell_asm_16xk float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 @@ -100,14 +100,14 @@ void bli_spackm_haswell_asm_16xk // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_seq1( *kappa ); - + // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. 
mov(var(inca), r8) // load inca @@ -121,13 +121,13 @@ void bli_spackm_haswell_asm_16xk mov(var(one), rdx) // load address of 1.0 constant vmovss(mem(rdx), xmm1) // load 1.0 - + mov(var(kappa), rcx) // load address of kappa vmovss(mem(rcx), xmm0) // load kappa - + // now branch on kappa == 1.0 - + vucomiss(xmm0, xmm1) // set ZF if kappa == 1.0 je(.SKAPPAUNIT) // if ZF = 1, jump to beta == 0 case @@ -137,7 +137,7 @@ void bli_spackm_haswell_asm_16xk cmp(imm(4), r8) // set ZF if (4*inca) == 4. jz(.SCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.SROWNONU) @@ -150,7 +150,7 @@ void bli_spackm_haswell_asm_16xk label(.SCOLNONU) jmp(.SDONE) // jump to end. - + @@ -161,7 +161,7 @@ void bli_spackm_haswell_asm_16xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.SROWUNIT) lea(mem(r8, r8, 2), r13) // r13 = 3*inca @@ -402,7 +402,7 @@ void bli_spackm_haswell_asm_16xk // -- kappa unit, column storage on A -------------------------------------- label(.SCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda lea(mem(r13, r10, 2), r15) // r15 = 5*lda lea(mem(r13, r10, 4), rdx) // rdx = 7*lda @@ -488,8 +488,8 @@ void bli_spackm_haswell_asm_16xk label(.SDONE) - - + + end_asm( : // output operands (none) @@ -543,7 +543,7 @@ void bli_spackm_haswell_asm_16xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -561,7 +561,7 @@ void bli_spackm_haswell_asm_16xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c index 3a134bed8f..a073eca62a 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_s6xk.c @@ -51,7 +51,7 @@ void bli_spackm_haswell_asm_6xk float* restrict kappa, float* restrict a, inc_t inca0, inc_t lda0, float* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 @@ 
-100,14 +100,14 @@ void bli_spackm_haswell_asm_6xk // NOTE: If/when this kernel ever supports scaling by kappa within the // assembly region, this constraint should be lifted. const bool unitk = bli_seq1( *kappa ); - + // ------------------------------------------------------------------------- if ( cdim0 == mnr && !gs && unitk ) { begin_asm() - + mov(var(a), rax) // load address of a. mov(var(inca), r8) // load inca @@ -121,13 +121,13 @@ void bli_spackm_haswell_asm_6xk mov(var(one), rdx) // load address of 1.0 constant vmovss(mem(rdx), xmm1) // load 1.0 - + mov(var(kappa), rcx) // load address of kappa vmovss(mem(rcx), xmm0) // load kappa - + // now branch on kappa == 1.0 - + vucomiss(xmm0, xmm1) // set ZF if kappa == 1.0 je(.SKAPPAUNIT) // if ZF = 1, jump to beta == 0 case @@ -137,7 +137,7 @@ void bli_spackm_haswell_asm_6xk cmp(imm(4), r8) // set ZF if (4*inca) == 4. jz(.SCOLNONU) // jump to column storage case - + // -- kappa non-unit, row storage on A ------------------------------------- label(.SROWNONU) @@ -150,7 +150,7 @@ void bli_spackm_haswell_asm_6xk label(.SCOLNONU) jmp(.SDONE) // jump to end. 
- + @@ -161,7 +161,7 @@ void bli_spackm_haswell_asm_6xk // -- kappa unit, row storage on A ----------------------------------------- - + label(.SROWUNIT) lea(mem(r8, r8, 2), r13) // r13 = 3*inca @@ -274,7 +274,7 @@ void bli_spackm_haswell_asm_6xk // -- kappa unit, column storage on A -------------------------------------- label(.SCOLUNIT) - + lea(mem(r10, r10, 2), r13) // r13 = 3*lda lea(mem(r13, r10, 2), r15) // r15 = 5*lda lea(mem(r13, r10, 4), rdx) // rdx = 7*lda @@ -361,8 +361,8 @@ void bli_spackm_haswell_asm_6xk label(.SDONE) - - + + end_asm( : // output operands (none) @@ -416,7 +416,7 @@ void bli_spackm_haswell_asm_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } @@ -434,7 +434,7 @@ void bli_spackm_haswell_asm_6xk ( m_edge, n_edge, - p_edge, 1, ldp + p_edge, 1, ldp ); } } diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c index 1a714abe26..5e65565d57 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z3xk.c @@ -51,7 +51,7 @@ void bli_zpackm_haswell_asm_3xk dcomplex* restrict kappa, dcomplex* restrict a, inc_t inca0, inc_t lda0, dcomplex* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 diff --git a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c index 4e11872afb..d118081ccc 100644 --- a/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c +++ b/kernels/haswell/1m/bli_packm_haswell_asm_z4xk.c @@ -51,7 +51,7 @@ void bli_zpackm_haswell_asm_4xk dcomplex* restrict kappa, dcomplex* restrict a, inc_t inca0, inc_t lda0, dcomplex* restrict p, inc_t ldp0, - cntx_t* restrict cntx + cntx_t* cntx ) { #if 0 diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c index d0e7938678..eccf57b64a 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d6x8.c @@ -87,8 +87,8 @@ void 
bli_sgemm_haswell_asm_6x16 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -767,8 +767,8 @@ void bli_dgemm_haswell_asm_6x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1326,8 +1326,8 @@ void bli_cgemm_haswell_asm_3x8 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1719,8 +1719,8 @@ void bli_zgemm_haswell_asm_3x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c index dd9526d566..261054499d 100644 --- a/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c +++ b/kernels/haswell/3/bli_gemm_haswell_asm_d8x6.c @@ -86,8 +86,8 @@ void bli_sgemm_haswell_asm_16x6 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -470,8 +470,8 @@ void bli_dgemm_haswell_asm_8x6 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -840,8 +840,8 @@ void bli_cgemm_haswell_asm_8x3 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict 
c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1231,8 +1231,8 @@ void bli_zgemm_haswell_asm_4x3 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c index d0d0ff2115..915fbf08f1 100644 --- a/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_l_haswell_asm_d6x8.c @@ -67,8 +67,8 @@ void bli_sgemmtrsm_l_haswell_asm_6x16 float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -858,8 +858,8 @@ void bli_dgemmtrsm_l_haswell_asm_6x8 double* restrict b01, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c index 68a8c069b4..63c42785c9 100644 --- a/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c +++ b/kernels/haswell/3/bli_gemmtrsm_u_haswell_asm_d6x8.c @@ -67,8 +67,8 @@ void bli_sgemmtrsm_u_haswell_asm_6x16 float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -863,8 +863,8 @@ void bli_dgemmtrsm_u_haswell_asm_6x8 double* restrict b01, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* 
restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c index 1820277d5a..637e5917b2 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8m.c @@ -78,8 +78,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 8; @@ -166,7 +166,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -184,7 +184,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -259,18 +259,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -305,7 +305,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -336,7 +336,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -398,27 +398,27 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -429,7 +429,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -451,21 +451,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -473,12 +473,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -500,22 +500,22 @@ void bli_dgemmsup_rd_haswell_asm_6x8m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -555,7 +555,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -563,73 +563,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8m mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -653,7 +653,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8m label(.DRETURN) - + end_asm( : // output operands (none) @@ -735,8 +735,8 @@ void bli_dgemmsup_rd_haswell_asm_6x4m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -766,7 +766,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -784,7 +784,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -838,19 +838,19 @@ void bli_dgemmsup_rd_haswell_asm_6x4m prefetch(0, mem(rcx, rdi, 2, 3*8)) // prefetch c + 2*rs_c #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -885,7 +885,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -916,7 +916,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -978,25 +978,25 @@ void bli_dgemmsup_rd_haswell_asm_6x4m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 1 @@ -1004,12 +1004,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4m prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1031,21 +1031,21 @@ void bli_dgemmsup_rd_haswell_asm_6x4m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1053,12 +1053,12 @@ void bli_dgemmsup_rd_haswell_asm_6x4m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1080,22 +1080,22 @@ void bli_dgemmsup_rd_haswell_asm_6x4m vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1134,7 +1134,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1142,73 +1142,73 @@ void bli_dgemmsup_rd_haswell_asm_6x4m mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -1225,7 +1225,7 @@ void bli_dgemmsup_rd_haswell_asm_6x4m label(.DRETURN) - + end_asm( : // output operands (none) @@ -1307,8 +1307,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1336,9 +1336,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1356,7 +1356,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1412,19 +1412,19 @@ void bli_dgemmsup_rd_haswell_asm_6x2m prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1462,7 +1462,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -1496,7 +1496,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1564,25 +1564,25 @@ void bli_dgemmsup_rd_haswell_asm_6x2m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) #if 0 @@ -1590,7 +1590,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a prefetch(0, mem(rax, rbp, 1, 0*8)) // prefetch rax + 5*rs_a #endif - + vmovupd(mem(rbx ), ymm0) vmovupd(mem(rbx, r11, 1), ymm1) add(imm(4*8), rbx) // b += 4*rs_b = 4*8; @@ -1620,21 +1620,21 @@ void bli_dgemmsup_rd_haswell_asm_6x2m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1642,7 +1642,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -1672,12 +1672,12 @@ void bli_dgemmsup_rd_haswell_asm_6x2m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1690,7 +1690,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) // xmm0[0] = sum(ymm4); xmm0[1] = sum(ymm5) @@ -1723,7 +1723,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m // xmm14[0:1] = sum(ymm14) sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1731,96 +1731,96 @@ void bli_dgemmsup_rd_haswell_asm_6x2m mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + @@ -1838,7 +1838,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2m label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c index e720e7da1c..d9dad5feac 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_d6x8n.c @@ -78,8 +78,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 6; @@ -223,7 +223,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -241,7 +241,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -317,18 +317,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -362,7 +362,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -398,7 +398,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -465,32 +465,32 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -512,21 +512,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -534,12 +534,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -561,22 +561,22 @@ void bli_dgemmsup_rd_haswell_asm_6x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -615,7 +615,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -623,73 +623,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -711,7 +711,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -804,8 +804,8 @@ void bli_dgemmsup_rd_haswell_asm_3x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -835,7 +835,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -853,7 +853,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -909,18 +909,18 @@ void bli_dgemmsup_rd_haswell_asm_3x8n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -954,7 +954,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -990,7 +990,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1057,32 +1057,32 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1104,21 +1104,21 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1126,12 +1126,12 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1153,22 +1153,22 @@ void bli_dgemmsup_rd_haswell_asm_3x8n vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1206,7 +1206,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n // ymm6[2] = sum(ymm12); ymm6[3] = sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1215,73 +1215,73 @@ void bli_dgemmsup_rd_haswell_asm_3x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -1297,7 +1297,7 @@ void bli_dgemmsup_rd_haswell_asm_3x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -1391,8 +1391,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1422,7 +1422,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1440,7 +1440,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1491,18 +1491,18 @@ void bli_dgemmsup_rd_haswell_asm_2x8n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1531,7 +1531,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 1 @@ -1562,7 +1562,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1619,31 +1619,31 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1661,21 +1661,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1683,11 +1683,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1705,21 +1705,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1746,7 +1746,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1754,65 +1754,65 @@ void bli_dgemmsup_rd_haswell_asm_2x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -1828,7 +1828,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8n label(.DRETURN) - + end_asm( : // output operands (none) @@ -1921,8 +1921,8 @@ void bli_dgemmsup_rd_haswell_asm_1x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1952,7 +1952,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1970,7 +1970,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2016,18 +2016,18 @@ void bli_dgemmsup_rd_haswell_asm_1x8n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2051,7 +2051,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 #if 1 @@ -2077,7 +2077,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -2124,30 +2124,30 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -2161,21 +2161,21 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2183,10 +2183,10 @@ void bli_dgemmsup_rd_haswell_asm_1x8n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -2200,20 +2200,20 @@ void bli_dgemmsup_rd_haswell_asm_1x8n add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 - + + // ymm4 ymm7 ymm10 ymm13 + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -2228,7 +2228,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -2236,57 +2236,57 @@ void bli_dgemmsup_rd_haswell_asm_1x8n mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4*8), r12) // c_jj = r12 += 4*cs_c @@ -2302,7 +2302,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8n label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c index f764bc613e..fcf4484239 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16m.c @@ -78,8 +78,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 16; @@ -190,7 +190,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -208,7 +208,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -283,18 +283,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -329,7 +329,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -360,7 +360,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -422,27 +422,27 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -453,7 +453,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -475,21 +475,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -497,12 +497,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -524,22 +524,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -555,7 +555,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -571,7 +571,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -590,7 +590,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -598,73 +598,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi 
= cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16m label(.SRETURN) - + end_asm( : // output operands (none) @@ -770,8 +770,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -801,7 +801,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -819,7 +819,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -894,18 +894,18 @@ void bli_sgemmsup_rd_haswell_asm_6x12m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -940,7 +940,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -971,7 +971,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1033,27 +1033,27 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1064,7 +1064,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1086,21 +1086,21 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1108,12 +1108,12 @@ void bli_sgemmsup_rd_haswell_asm_6x12m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1135,22 +1135,22 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1166,7 +1166,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1182,7 +1182,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1201,7 +1201,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1209,73 +1209,73 @@ void bli_sgemmsup_rd_haswell_asm_6x12m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), 
rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -1299,7 +1299,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12m label(.SRETURN) - + end_asm( : // output operands (none) @@ -1383,8 +1383,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1414,7 +1414,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1432,7 +1432,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1507,18 +1507,18 @@ void bli_sgemmsup_rd_haswell_asm_6x8m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1553,7 +1553,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1584,7 +1584,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1646,27 +1646,27 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -1677,7 +1677,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1699,21 +1699,21 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1721,12 +1721,12 @@ void bli_sgemmsup_rd_haswell_asm_6x8m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1748,22 +1748,22 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1779,7 +1779,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1795,7 +1795,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1814,7 +1814,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1822,73 +1822,73 @@ void bli_sgemmsup_rd_haswell_asm_6x8m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) 
// rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -1912,7 +1912,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8m label(.SRETURN) - + end_asm( : // output operands (none) @@ -1994,8 +1994,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2025,7 +2025,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -2043,7 +2043,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2098,18 +2098,18 @@ void bli_sgemmsup_rd_haswell_asm_6x4m #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2144,7 +2144,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -2175,7 +2175,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -2237,27 +2237,27 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 1 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -2268,7 +2268,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -2290,21 +2290,21 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2312,12 +2312,12 @@ void bli_sgemmsup_rd_haswell_asm_6x4m // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -2339,22 +2339,22 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -2370,7 +2370,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -2386,7 +2386,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -2405,7 +2405,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -2413,73 +2413,73 @@ void bli_sgemmsup_rd_haswell_asm_6x4m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) 
// rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -2496,7 +2496,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4m label(.SRETURN) - + end_asm( : // output operands (none) @@ -2579,8 +2579,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2610,7 +2610,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -2628,7 +2628,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2685,18 +2685,18 @@ void bli_sgemmsup_rd_haswell_asm_6x2m prefetch(0, mem(r10, rdi, 2, 1*4)) // prefetch c + 5*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2734,7 +2734,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -2768,7 +2768,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -2836,27 +2836,27 @@ void bli_sgemmsup_rd_haswell_asm_6x2m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*rs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*rs_a @@ -2892,21 +2892,21 @@ void bli_sgemmsup_rd_haswell_asm_6x2m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2944,12 +2944,12 @@ void bli_sgemmsup_rd_haswell_asm_6x2m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + @@ -2962,7 +2962,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -3007,7 +3007,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m // xmm14[0:1] = sum(ymm14) sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -3015,103 +3015,103 @@ void bli_sgemmsup_rd_haswell_asm_6x2m mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 4), r12) // @@ -3128,7 +3128,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2m label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c index 1fe862a8d1..33b2df4b4a 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rd_haswell_asm_s6x16n.c @@ -78,8 +78,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 6; @@ -223,7 +223,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -317,18 +317,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -362,7 +362,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -398,7 +398,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -465,32 +465,32 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -512,21 +512,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -534,12 +534,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -561,22 +561,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -592,7 +592,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -608,7 +608,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -627,7 +627,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -635,73 +635,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16n mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4*4), r12) // c_jj = r12 += 4*cs_c @@ -723,7 +723,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16n label(.SRETURN) - + end_asm( : // output operands (none) @@ -816,8 +816,8 @@ void bli_sgemmsup_rd_haswell_asm_3x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -847,7 +847,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -864,7 +864,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -919,18 +919,18 @@ void bli_sgemmsup_rd_haswell_asm_3x16n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -964,7 +964,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 1 @@ -1000,7 +1000,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1067,32 +1067,32 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1114,21 +1114,21 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1136,12 +1136,12 @@ void bli_sgemmsup_rd_haswell_asm_3x16n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1163,22 +1163,22 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1194,7 +1194,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1210,7 +1210,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1229,7 +1229,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1237,73 +1237,73 @@ void bli_sgemmsup_rd_haswell_asm_3x16n mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4*4), r12) // c_jj = r12 += 4*cs_c @@ -1319,7 +1319,7 @@ void bli_sgemmsup_rd_haswell_asm_3x16n label(.SRETURN) - + end_asm( : // output operands (none) @@ -1413,8 +1413,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1444,7 +1444,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1461,7 +1461,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1511,18 +1511,18 @@ void bli_sgemmsup_rd_haswell_asm_2x16n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1551,7 +1551,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 1 @@ -1582,7 +1582,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -1639,31 +1639,31 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1681,21 +1681,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1703,11 +1703,11 @@ void bli_sgemmsup_rd_haswell_asm_2x16n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1725,21 +1725,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1755,7 +1755,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1774,7 +1774,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1782,65 +1782,65 @@ void bli_sgemmsup_rd_haswell_asm_2x16n mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4*4), r12) // c_jj = r12 += 4*cs_c @@ -1856,7 +1856,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16n label(.SRETURN) - + end_asm( : // output operands (none) @@ -1949,8 +1949,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1980,7 +1980,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rdx) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1997,7 +1997,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -2042,18 +2042,18 @@ void bli_sgemmsup_rd_haswell_asm_1x16n //lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a lea(mem(rbx, r11, 4), r10) // r10 = rbx + 4*cs_b - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2077,7 +2077,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 #if 1 @@ -2103,7 +2103,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(r10, 8*8)) // prefetch rbx + 4*cs_b + 8*rs_b prefetch(0, mem(r10, r11, 1, 8*8)) // prefetch rbx + 5*cs_b + 8*rs_b @@ -2150,30 +2150,30 @@ void bli_sgemmsup_rd_haswell_asm_1x16n add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -2187,21 +2187,21 @@ void bli_sgemmsup_rd_haswell_asm_1x16n add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -2209,11 +2209,11 @@ void bli_sgemmsup_rd_haswell_asm_1x16n // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -2227,20 +2227,20 @@ void bli_sgemmsup_rd_haswell_asm_1x16n add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 - + // ymm4 ymm7 ymm10 ymm13 + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -2259,7 +2259,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -2267,57 +2267,57 @@ void bli_sgemmsup_rd_haswell_asm_1x16n mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4*4), r12) // c_jj = r12 += 4*cs_c @@ -2333,7 +2333,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16n label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c index 1637e97667..4e6b755725 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8m.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 8; @@ -225,15 +225,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -336,19 +336,19 @@ void bli_dgemmsup_rv_haswell_asm_6x8m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -356,7 +356,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -367,14 +367,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -383,7 +383,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -402,14 +402,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -417,8 +417,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -426,7 +426,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -437,14 +437,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) 
vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -452,7 +452,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -473,14 +473,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -488,50 +488,50 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -539,23 +539,23 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -568,24 +568,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8m vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -594,60 +594,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8m cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -734,51 +734,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8m jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -843,9 +843,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8m //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -866,8 +866,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8m label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -995,8 +995,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1026,15 +1026,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1135,19 +1135,19 @@ void bli_dgemmsup_rv_haswell_asm_6x6m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1155,7 +1155,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1166,14 +1166,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1182,7 +1182,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -1201,14 +1201,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1216,8 +1216,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1225,7 +1225,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1236,14 +1236,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) 
vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1251,7 +1251,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -1272,14 +1272,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1287,27 +1287,27 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -1316,21 +1316,21 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1338,23 +1338,23 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -1367,24 +1367,24 @@ void bli_dgemmsup_rv_haswell_asm_6x6m vmulpd(xmm0, xmm13, xmm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. 
je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1393,60 +1393,60 @@ void bli_dgemmsup_rv_haswell_asm_6x6m cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1521,51 +1521,51 @@ void bli_dgemmsup_rv_haswell_asm_6x6m jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1622,9 +1622,9 @@ void bli_dgemmsup_rv_haswell_asm_6x6m //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -1645,8 +1645,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6m label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -1774,8 +1774,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1805,9 +1805,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1822,7 +1822,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1907,17 +1907,17 @@ void bli_dgemmsup_rv_haswell_asm_6x4m #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1925,7 +1925,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1933,19 +1933,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -1961,18 +1961,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 2 @@ -1981,7 +1981,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1989,18 +1989,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) 
vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -2018,38 +2018,38 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2062,58 +2062,58 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2122,42 +2122,42 @@ void bli_dgemmsup_rv_haswell_asm_6x4m cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2205,45 +2205,45 @@ void bli_dgemmsup_rv_haswell_asm_6x4m jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. @@ -2278,15 +2278,15 @@ void bli_dgemmsup_rv_haswell_asm_6x4m vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - + lea(mem(r12, rdi, 4), r12) // lea(mem(r12, rdi, 2), r12) // c_ii = r12 += 6*rs_c @@ -2302,8 +2302,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4m label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -2431,8 +2431,8 @@ void bli_dgemmsup_rv_haswell_asm_6x2m double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2462,9 +2462,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2479,7 +2479,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2558,19 +2558,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2578,7 +2578,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2586,19 +2586,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -2614,18 +2614,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 2 @@ -2634,7 +2634,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2642,18 +2642,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) 
vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -2671,43 +2671,43 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2715,58 +2715,58 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2775,42 +2775,42 @@ void bli_dgemmsup_rv_haswell_asm_6x2m cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14) vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2846,40 +2846,40 @@ void bli_dgemmsup_rv_haswell_asm_6x2m jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2890,7 +2890,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -2911,10 +2911,10 @@ void bli_dgemmsup_rv_haswell_asm_6x2m vmovupd(xmm1, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) @@ -2936,7 +2936,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2m label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c index 5ecef06e8b..2533a7825d 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_d6x8n.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... 
-------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 6; @@ -154,14 +154,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n } #endif - dgemmsup_ker_ft ker_fps[6] = + dgemmsup_ker_ft ker_fps[6] = { NULL, bli_dgemmsup_rv_haswell_asm_1x8n, bli_dgemmsup_rv_haswell_asm_2x8n, bli_dgemmsup_rv_haswell_asm_3x8n, bli_dgemmsup_rv_haswell_asm_4x8n, - bli_dgemmsup_rv_haswell_asm_5x8n + bli_dgemmsup_rv_haswell_asm_5x8n }; dgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; @@ -203,15 +203,15 @@ void bli_dgemmsup_rv_haswell_asm_6x8n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -313,19 +313,19 @@ void bli_dgemmsup_rv_haswell_asm_6x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -344,14 +344,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -360,7 +360,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -379,14 +379,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -394,8 +394,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -414,14 +414,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -429,7 +429,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, 
ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -450,14 +450,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -465,25 +465,25 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -494,21 +494,21 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -516,23 +516,23 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -545,24 +545,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8n vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -571,60 +571,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -711,51 +711,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -820,9 +820,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8n //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -841,8 +841,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -959,8 +959,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -990,15 +990,15 @@ void bli_dgemmsup_rv_haswell_asm_5x8n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1097,19 +1097,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1117,7 +1117,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1128,20 +1128,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -1160,20 +1160,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1181,7 +1181,7 @@ void bli_dgemmsup_rv_haswell_asm_5x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1192,19 +1192,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; 
vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -1225,37 +1225,37 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -1266,42 +1266,42 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1312,24 +1312,24 @@ void bli_dgemmsup_rv_haswell_asm_5x8n vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1338,52 +1338,52 @@ void bli_dgemmsup_rv_haswell_asm_5x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -1468,46 +1468,46 @@ void bli_dgemmsup_rv_haswell_asm_5x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1568,9 +1568,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8n //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -1589,8 +1589,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -1707,8 +1707,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1740,13 +1740,13 @@ void bli_dgemmsup_rv_haswell_asm_4x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1842,19 +1842,19 @@ void bli_dgemmsup_rv_haswell_asm_4x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1862,7 +1862,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1873,7 +1873,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1882,7 +1882,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1901,7 +1901,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1909,8 +1909,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1918,7 +1918,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1929,7 +1929,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1937,7 +1937,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1958,7 +1958,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, 
ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1966,25 +1966,25 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -1995,14 +1995,14 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2010,23 +2010,23 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -2035,24 +2035,24 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2061,44 +2061,44 @@ void bli_dgemmsup_rv_haswell_asm_4x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2153,21 +2153,21 @@ void bli_dgemmsup_rv_haswell_asm_4x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2175,16 +2175,16 @@ void bli_dgemmsup_rv_haswell_asm_4x8n vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2225,9 +2225,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8n //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -2246,8 +2246,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -2355,8 +2355,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2388,13 +2388,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2493,19 +2493,19 @@ void bli_dgemmsup_rv_haswell_asm_3x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2513,7 +2513,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2524,13 +2524,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 1 #if 0 @@ -2549,13 +2549,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 2 #if 0 @@ -2563,7 +2563,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2574,12 +2574,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2600,30 +2600,30 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -2634,59 +2634,59 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2695,36 +2695,36 @@ void bli_dgemmsup_rv_haswell_asm_3x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2811,21 +2811,21 @@ void bli_dgemmsup_rv_haswell_asm_3x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2833,12 +2833,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8n vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2903,9 +2903,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8n //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -2924,8 +2924,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -3033,8 +3033,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3066,13 +3066,13 @@ void bli_dgemmsup_rv_haswell_asm_2x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -3162,19 +3162,19 @@ void bli_dgemmsup_rv_haswell_asm_2x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -3182,7 +3182,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3195,7 +3195,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -3215,8 +3215,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 0 @@ -3224,7 +3224,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3236,7 +3236,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -3258,25 +3258,25 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -3287,7 +3287,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; @@ -3295,45 +3295,45 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3342,28 +3342,28 @@ void bli_dgemmsup_rv_haswell_asm_2x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3406,21 +3406,21 @@ void bli_dgemmsup_rv_haswell_asm_2x8n jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -3428,8 +3428,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3462,9 +3462,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8n //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -3483,8 +3483,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8n label(.DRETURN) - - + + end_asm( : // output operands (none) @@ -3592,8 +3592,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3625,13 +3625,13 @@ void bli_dgemmsup_rv_haswell_asm_1x8n begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -3718,19 +3718,19 @@ void bli_dgemmsup_rv_haswell_asm_1x8n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -3738,7 +3738,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3748,7 +3748,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 1 @@ -3765,8 +3765,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 1 @@ -3774,7 +3774,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n #else prefetch(0, mem(rdx, r10, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3783,7 +3783,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8n add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -3802,25 +3802,25 @@ void bli_dgemmsup_rv_haswell_asm_1x8n add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 1 @@ -3831,48 +3831,48 @@ void bli_dgemmsup_rv_haswell_asm_1x8n vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3881,20 +3881,20 @@ void bli_dgemmsup_rv_haswell_asm_1x8n cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3937,26 +3937,26 @@ void bli_dgemmsup_rv_haswell_asm_1x8n jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3985,9 +3985,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8n //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -4006,8 +4006,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8n label(.DRETURN) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c index 426e5157e1..aacfd8d1fe 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16m.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 16; @@ -249,15 +249,15 @@ void bli_sgemmsup_rv_haswell_asm_6x16m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -369,19 +369,19 @@ void bli_sgemmsup_rv_haswell_asm_6x16m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -389,7 +389,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -400,14 +400,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -416,7 +416,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -435,14 +435,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -450,8 +450,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -459,7 +459,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -470,14 +470,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) 
vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -485,7 +485,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -506,14 +506,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -521,50 +521,50 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -572,23 +572,23 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -601,26 +601,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16m vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -629,60 +629,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -828,51 +828,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -977,9 +977,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -1000,8 +1000,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -1129,8 +1129,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1160,15 +1160,15 @@ void bli_sgemmsup_rv_haswell_asm_6x12m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1275,19 +1275,19 @@ void bli_sgemmsup_rv_haswell_asm_6x12m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1295,7 +1295,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1306,14 +1306,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1322,7 +1322,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -1341,14 +1341,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1356,8 +1356,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1365,7 +1365,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1376,14 +1376,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), 
ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1391,7 +1391,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -1412,14 +1412,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1427,50 +1427,50 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1478,23 +1478,23 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) @@ -1507,26 +1507,26 @@ void bli_sgemmsup_rv_haswell_asm_6x12m vmulps(xmm0, xmm13, xmm13) vmulps(ymm0, ymm14, ymm14) vmulps(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1535,60 +1535,60 @@ void bli_sgemmsup_rv_haswell_asm_6x12m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm15) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1711,51 +1711,51 @@ void bli_sgemmsup_rv_haswell_asm_6x12m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1844,9 +1844,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -1867,8 +1867,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -1996,8 +1996,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2027,15 +2027,15 @@ void bli_sgemmsup_rv_haswell_asm_6x8m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2131,19 +2131,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2151,7 +2151,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2159,19 +2159,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -2187,19 +2187,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + // ---------------------------------- iteration 2 #if 0 @@ -2207,7 +2207,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2215,18 +2215,18 @@ void bli_sgemmsup_rv_haswell_asm_6x8m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 
@@ -2244,104 +2244,104 @@ void bli_sgemmsup_rv_haswell_asm_6x8m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2350,42 +2350,42 @@ void bli_sgemmsup_rv_haswell_asm_6x8m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2463,45 +2463,45 @@ void bli_sgemmsup_rv_haswell_asm_6x8m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2557,9 +2557,9 @@ void bli_sgemmsup_rv_haswell_asm_6x8m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -2580,8 +2580,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -2709,8 +2709,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2740,15 +2740,15 @@ void bli_sgemmsup_rv_haswell_asm_6x6m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2842,19 +2842,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2862,7 +2862,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2872,19 +2872,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -2902,19 +2902,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + // ---------------------------------- iteration 2 #if 0 @@ -2922,7 +2922,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2932,18 +2932,18 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) 
vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -2963,43 +2963,43 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -3009,60 +3009,60 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3071,12 +3071,12 @@ void bli_sgemmsup_rv_haswell_asm_6x6m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -3086,8 +3086,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -3097,8 +3097,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -3108,8 +3108,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) @@ -3119,8 +3119,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*4)) @@ -3130,8 +3130,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm14, xmm15) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*4)) @@ -3141,8 +3141,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3207,57 +3207,57 @@ void bli_sgemmsup_rv_haswell_asm_6x6m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm14, xmm15) vmovups(xmm14, mem(rcx, 0*4)) vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3305,7 +3305,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - + label(.SDONE) @@ -3326,8 +3326,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -3455,8 +3455,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3486,15 +3486,15 @@ void bli_sgemmsup_rv_haswell_asm_6x4m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -3585,19 +3585,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. 
lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -3605,7 +3605,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -3613,19 +3613,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -3641,19 +3641,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + // ---------------------------------- iteration 2 #if 0 @@ -3661,7 +3661,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -3669,18 +3669,18 @@ void bli_sgemmsup_rv_haswell_asm_6x4m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, 
xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -3698,104 +3698,104 @@ void bli_sgemmsup_rv_haswell_asm_6x4m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3804,42 +3804,42 @@ void bli_sgemmsup_rv_haswell_asm_6x4m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3893,45 +3893,45 @@ void bli_sgemmsup_rv_haswell_asm_6x4m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3972,9 +3972,9 @@ void bli_sgemmsup_rv_haswell_asm_6x4m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -3995,8 +3995,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4m label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -4124,8 +4124,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2m float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -4155,15 +4155,15 @@ void bli_sgemmsup_rv_haswell_asm_6x2m // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -4252,19 +4252,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m lea(mem(rdx, r8, 2), rdx) // from next upanel of a. lea(mem(r9, r9, 2), rcx) // rcx = 3*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -4272,7 +4272,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m #else prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -4280,19 +4280,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -4308,19 +4308,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + // ---------------------------------- iteration 2 #if 0 @@ -4328,7 +4328,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2m #else prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -4336,18 +4336,18 @@ void bli_sgemmsup_rv_haswell_asm_6x2m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ 
-4365,104 +4365,104 @@ void bli_sgemmsup_rv_haswell_asm_6x2m vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -4471,42 +4471,42 @@ void bli_sgemmsup_rv_haswell_asm_6x2m cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4541,45 +4541,45 @@ void bli_sgemmsup_rv_haswell_asm_6x2m jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4606,9 +4606,9 @@ void bli_sgemmsup_rv_haswell_asm_6x2m //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -4629,8 +4629,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2m label(.SRETURN) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c index 7463707cc9..da768ebf19 100644 --- a/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c +++ b/kernels/haswell/3/sup/bli_gemmsup_rv_haswell_asm_s6x16n.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... 
-------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 6; @@ -154,14 +154,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n } #endif - sgemmsup_ker_ft ker_fps[6] = + sgemmsup_ker_ft ker_fps[6] = { NULL, bli_sgemmsup_rv_haswell_asm_1x16n, bli_sgemmsup_rv_haswell_asm_2x16n, bli_sgemmsup_rv_haswell_asm_3x16n, bli_sgemmsup_rv_haswell_asm_4x16n, - bli_sgemmsup_rv_haswell_asm_5x16n + bli_sgemmsup_rv_haswell_asm_5x16n }; sgemmsup_ker_ft ker_fp = ker_fps[ m_left ]; @@ -203,15 +203,15 @@ void bli_sgemmsup_rv_haswell_asm_6x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -322,19 +322,19 @@ void bli_sgemmsup_rv_haswell_asm_6x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -353,14 +353,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -369,7 +369,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -388,14 +388,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -403,8 +403,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 0 @@ -423,14 +423,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -438,7 +438,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) 
vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -459,14 +459,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -474,25 +474,25 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -503,21 +503,21 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -525,23 +525,23 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -554,26 +554,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16n vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -582,60 +582,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -781,51 +781,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -930,9 +930,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16n //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -952,8 +952,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -1094,8 +1094,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1125,15 +1125,15 @@ void bli_sgemmsup_rv_haswell_asm_5x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1241,19 +1241,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1272,20 +1272,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -1304,20 +1304,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 0 @@ -1336,19 +1336,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -1369,37 +1369,37 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) 
vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -1410,42 +1410,42 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. 
mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -1456,26 +1456,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16n vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1484,52 +1484,52 @@ void bli_sgemmsup_rv_haswell_asm_5x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1701,46 +1701,46 @@ void bli_sgemmsup_rv_haswell_asm_5x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1862,9 +1862,9 @@ void bli_sgemmsup_rv_haswell_asm_5x16n //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) @@ -1884,8 +1884,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -2026,8 +2026,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2057,15 +2057,15 @@ void bli_sgemmsup_rv_haswell_asm_4x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2170,19 +2170,19 @@ void bli_sgemmsup_rv_haswell_asm_4x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2201,7 +2201,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2210,7 +2210,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -2229,7 +2229,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2237,8 +2237,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 0 @@ -2257,7 +2257,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2265,7 +2265,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -2286,7 +2286,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2294,25 +2294,25 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -2323,14 +2323,14 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2338,23 +2338,23 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -2363,26 +2363,26 @@ void bli_sgemmsup_rv_haswell_asm_4x16n vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2391,44 +2391,44 @@ void bli_sgemmsup_rv_haswell_asm_4x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2516,41 +2516,41 @@ void bli_sgemmsup_rv_haswell_asm_4x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2620,9 +2620,9 @@ void bli_sgemmsup_rv_haswell_asm_4x16n //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) @@ -2642,8 +2642,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -2784,8 +2784,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2815,15 +2815,15 @@ void bli_sgemmsup_rv_haswell_asm_3x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2931,19 +2931,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -2962,13 +2962,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 1 #if 0 @@ -2987,13 +2987,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 2 #if 0 @@ -3012,12 +3012,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -3038,30 +3038,30 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -3072,61 +3072,61 @@ void bli_sgemmsup_rv_haswell_asm_3x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3135,36 +3135,36 @@ void bli_sgemmsup_rv_haswell_asm_3x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3316,36 +3316,36 @@ void bli_sgemmsup_rv_haswell_asm_3x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -3439,9 +3439,9 @@ void bli_sgemmsup_rv_haswell_asm_3x16n //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) @@ -3461,8 +3461,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -3603,8 +3603,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3634,15 +3634,15 @@ void bli_sgemmsup_rv_haswell_asm_2x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -3741,19 +3741,19 @@ void bli_sgemmsup_rv_haswell_asm_2x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -3774,7 +3774,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -3794,8 +3794,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 0 @@ -3815,7 +3815,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -3837,25 +3837,25 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -3866,7 +3866,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; @@ -3874,47 +3874,47 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3923,28 +3923,28 @@ void bli_sgemmsup_rv_haswell_asm_2x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4012,21 +4012,21 @@ void bli_sgemmsup_rv_haswell_asm_2x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -4034,8 +4034,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4077,9 +4077,9 @@ void bli_sgemmsup_rv_haswell_asm_2x16n //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) @@ -4099,8 +4099,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16n label(.SRETURN) - - + + end_asm( : // output operands (none) @@ -4241,8 +4241,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -4272,15 +4272,15 @@ void bli_sgemmsup_rv_haswell_asm_1x16n // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 4), r8) // rs_a *= sizeof(float) lea(mem(, r9, 4), r9) // cs_a *= sizeof(float) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -4376,19 +4376,19 @@ void bli_sgemmsup_rv_haswell_asm_1x16n lea(mem(rdx, r8, 8), rdx) // from next upanel of b. lea(mem(r10, r10, 2), rcx) // rcx = 3*rs_b; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -4406,7 +4406,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -4423,8 +4423,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 0 @@ -4441,7 +4441,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16n add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -4460,25 +4460,25 @@ void bli_sgemmsup_rv_haswell_asm_1x16n add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 1 @@ -4489,50 +4489,50 @@ void bli_sgemmsup_rv_haswell_asm_1x16n vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(r12, rcx) // reset rcx to current utile of c. mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -4541,20 +4541,20 @@ void bli_sgemmsup_rv_haswell_asm_1x16n cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4648,26 +4648,26 @@ void bli_sgemmsup_rv_haswell_asm_1x16n jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -4728,9 +4728,9 @@ void bli_sgemmsup_rv_haswell_asm_1x16n //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) @@ -4750,8 +4750,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16n label(.SRETURN) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c index 69d543a99d..67b3ec8bf6 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_r_haswell_ref_dMx1.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -104,8 +104,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ for ( dim_t i = 0; i < mdim; ++i ) \ diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c index 457ef9f22d..929f9ea476 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx1.c @@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x1 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -586,8 +586,8 @@ void bli_dgemmsup_rd_haswell_asm_3x1 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -995,8 +995,8 @@ void bli_dgemmsup_rd_haswell_asm_2x1 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1369,8 +1369,8 @@ void bli_dgemmsup_rd_haswell_asm_1x1 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c index 
af498eb0ee..397d932e44 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx2.c @@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -99,9 +99,9 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -169,19 +169,19 @@ void bli_dgemmsup_rd_haswell_asm_6x2 prefetch(0, mem(r10, rdi, 1, 1*8)) // prefetch c + 4*rs_c prefetch(0, mem(r10, rdi, 2, 1*8)) // prefetch c + 5*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -219,7 +219,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -253,7 +253,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -321,27 +321,27 @@ void bli_dgemmsup_rd_haswell_asm_6x2 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -377,21 +377,21 @@ void bli_dgemmsup_rd_haswell_asm_6x2 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -399,7 +399,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -429,12 +429,12 @@ void bli_dgemmsup_rd_haswell_asm_6x2 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -447,7 +447,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -480,7 +480,7 @@ void bli_dgemmsup_rd_haswell_asm_6x2 // xmm14[0:1] = sum(ymm14) sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -488,103 +488,103 @@ void bli_dgemmsup_rd_haswell_asm_6x2 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm10) vmovupd(xmm10, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm12) vmovupd(xmm12, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm14) vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -628,8 +628,8 @@ void bli_dgemmsup_rd_haswell_asm_3x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -655,9 +655,9 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -675,7 +675,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -716,19 +716,19 @@ void bli_dgemmsup_rd_haswell_asm_3x2 prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c prefetch(0, mem(rcx, rdi, 2, 1*8)) // prefetch c + 2*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -754,7 +754,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -776,7 +776,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -820,27 +820,27 @@ void bli_dgemmsup_rd_haswell_asm_3x2 vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -864,21 +864,21 @@ void bli_dgemmsup_rd_haswell_asm_3x2 vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -886,7 +886,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -904,12 +904,12 @@ void bli_dgemmsup_rd_haswell_asm_3x2 vfmadd231pd(ymm0, ymm3, ymm8) vfmadd231pd(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -919,7 +919,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -937,7 +937,7 @@ void bli_dgemmsup_rd_haswell_asm_3x2 // xmm8[0:1] = sum(ymm8) sum(ymm9) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -945,79 +945,79 @@ void bli_dgemmsup_rd_haswell_asm_3x2 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm8) vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm8, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -1061,8 +1061,8 @@ void bli_dgemmsup_rd_haswell_asm_2x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1088,9 +1088,9 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1108,7 +1108,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1146,19 +1146,19 @@ void bli_dgemmsup_rd_haswell_asm_2x2 prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c prefetch(0, mem(rcx, rdi, 1, 1*8)) // prefetch c + 1*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1180,7 +1180,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -1198,7 +1198,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1234,27 +1234,27 @@ void bli_dgemmsup_rd_haswell_asm_2x2 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1274,21 +1274,21 @@ void bli_dgemmsup_rd_haswell_asm_2x2 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1296,7 +1296,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -1310,12 +1310,12 @@ void bli_dgemmsup_rd_haswell_asm_2x2 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1324,7 +1324,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // ymm4 ymm5 // ymm6 ymm7 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -1337,7 +1337,7 @@ void bli_dgemmsup_rd_haswell_asm_2x2 // xmm6[0:1] = sum(ymm6) sum(ymm7) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1345,71 +1345,71 @@ void bli_dgemmsup_rd_haswell_asm_2x2 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), xmm3, xmm6) vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) @@ -1453,8 +1453,8 @@ void bli_dgemmsup_rd_haswell_asm_1x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1480,9 +1480,9 @@ void bli_dgemmsup_rd_haswell_asm_1x2 // ------------------------------------------------------------------------- begin_asm() - + //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1500,7 +1500,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1535,19 +1535,19 @@ void bli_dgemmsup_rd_haswell_asm_1x2 //lea(mem(r10, rdi, 1), r10) // rdx = c + 3*rs_c; prefetch(0, mem(rcx, 1*8)) // prefetch c + 0*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1565,7 +1565,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) - + // ---------------------------------- iteration 1 vmovupd(mem(rbx ), ymm0) @@ -1579,7 +1579,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1607,27 +1607,27 @@ void bli_dgemmsup_rd_haswell_asm_1x2 vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1643,21 +1643,21 @@ void bli_dgemmsup_rd_haswell_asm_1x2 vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1665,7 +1665,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovsd(mem(rbx ), xmm0) vmovsd(mem(rbx, r11, 1), xmm1) add(imm(1*8), rbx) // b += 1*rs_b = 1*8; @@ -1675,12 +1675,12 @@ void bli_dgemmsup_rd_haswell_asm_1x2 vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1688,7 +1688,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 label(.DPOSTACCUM) // ymm4 ymm5 - + vhaddpd( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm4 ) @@ -1696,7 +1696,7 @@ void bli_dgemmsup_rd_haswell_asm_1x2 // xmm4[0:1] = sum(ymm4) sum(ymm5) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -1704,63 +1704,63 @@ void bli_dgemmsup_rd_haswell_asm_1x2 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), xmm3, xmm4) vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - + label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c index 516bfced54..75e84650c7 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx4.c @@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -605,8 +605,8 @@ void bli_dgemmsup_rd_haswell_asm_2x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1055,8 +1055,8 @@ void bli_dgemmsup_rd_haswell_asm_1x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c index 571444bed3..b2e3d83af2 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rd_haswell_asm_dMx8.c @@ -72,8 +72,8 @@ void bli_dgemmsup_rd_haswell_asm_6x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { 
uint64_t n_left = n0 % 8; @@ -163,7 +163,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -181,7 +181,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -256,18 +256,18 @@ void bli_dgemmsup_rd_haswell_asm_6x8 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. - - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -302,7 +302,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -333,7 +333,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -395,27 +395,27 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. 
- - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -426,7 +426,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vmovupd(mem(rax, r8, 1), ymm1) vmovupd(mem(rax, r8, 2), ymm2) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -448,21 +448,21 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -470,12 +470,12 @@ void bli_dgemmsup_rd_haswell_asm_6x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) vmovsd(mem(rax, r8, 2), xmm2) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -497,23 +497,23 @@ void bli_dgemmsup_rd_haswell_asm_6x8 vfmadd231pd(ymm1, ymm3, ymm14) vfmadd231pd(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -553,7 +553,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) @@ -561,73 +561,73 @@ void bli_dgemmsup_rd_haswell_asm_6x8 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm6) vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -651,7 +651,7 @@ void bli_dgemmsup_rd_haswell_asm_6x8 label(.DRETURN) - + end_asm( : // output operands (none) @@ -735,8 +735,8 @@ void bli_dgemmsup_rd_haswell_asm_2x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -761,7 +761,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -779,7 +779,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -833,18 +833,18 @@ void bli_dgemmsup_rd_haswell_asm_2x8 prefetch(0, mem(rcx, rdi, 1, 3*8)) // prefetch c + 1*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -874,7 +874,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -900,7 +900,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -952,27 +952,27 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -982,7 +982,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vmovupd(mem(rax ), ymm0) vmovupd(mem(rax, r8, 1), ymm1) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1000,21 +1000,21 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1022,11 +1022,11 @@ void bli_dgemmsup_rd_haswell_asm_2x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) vmovsd(mem(rax, r8, 1), xmm1) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) vfmadd231pd(ymm1, ymm3, ymm5) @@ -1044,22 +1044,22 @@ void bli_dgemmsup_rd_haswell_asm_2x8 vfmadd231pd(ymm0, ymm3, ymm13) vfmadd231pd(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 + + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1090,70 +1090,70 @@ void bli_dgemmsup_rd_haswell_asm_2x8 //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 8), rdi) // rs_c *= sizeof(double) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vfmadd231pd(mem(rcx), ymm3, ymm5) vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1165,7 +1165,7 @@ void bli_dgemmsup_rd_haswell_asm_2x8 label(.DRETURN) - + end_asm( : // output operands (none) @@ -1209,8 +1209,8 @@ void bli_dgemmsup_rd_haswell_asm_1x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1235,7 +1235,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1253,7 +1253,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c @@ -1302,18 +1302,18 @@ void bli_dgemmsup_rd_haswell_asm_1x8 prefetch(0, mem(rcx, 3*8)) // prefetch c + 0*rs_c #endif - - - + + + mov(var(k_iter16), rsi) // i = k_iter16; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKITER4) // if i == 0, jump to code that // contains the k_iter4 loop. 
- - + + label(.DLOOPKITER16) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1338,7 +1338,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovupd(mem(rax ), ymm0) @@ -1359,7 +1359,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1401,27 +1401,27 @@ void bli_dgemmsup_rd_haswell_asm_1x8 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER16) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKITER4) - + mov(var(k_iter4), rsi) // i = k_iter4; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter4 loop. - - + + label(.DLOOPKITER4) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1430,7 +1430,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 vmovupd(mem(rax ), ymm0) add(imm(4*8), rax) // a += 4*cs_a = 4*8; - + vmovupd(mem(rbx ), ymm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -1444,21 +1444,21 @@ void bli_dgemmsup_rd_haswell_asm_1x8 add(imm(4*8), rbx) // b += 4*rs_b = 4*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKITER4) // iterate again if i != 0. - - - + + + label(.DCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.DLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1466,10 +1466,10 @@ void bli_dgemmsup_rd_haswell_asm_1x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovsd(mem(rax ), xmm0) add(imm(1*8), rax) // a += 1*cs_a = 1*8; - + vmovsd(mem(rbx ), xmm3) vfmadd231pd(ymm0, ymm3, ymm4) @@ -1483,12 +1483,12 @@ void bli_dgemmsup_rd_haswell_asm_1x8 add(imm(1*8), rbx) // b += 1*rs_b = 1*8; vfmadd231pd(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.DLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1496,9 +1496,9 @@ void bli_dgemmsup_rd_haswell_asm_1x8 label(.DPOSTACCUM) - - // ymm4 ymm7 ymm10 ymm13 - + + // ymm4 ymm7 ymm10 ymm13 + vhaddpd( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vaddpd( xmm0, xmm1, xmm0 ) @@ -1513,7 +1513,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 - + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1521,57 +1521,57 @@ void bli_dgemmsup_rd_haswell_asm_1x8 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx), ymm3, ymm4) vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.DDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1583,7 +1583,7 @@ void bli_dgemmsup_rd_haswell_asm_1x8 label(.DRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c index eb1118196b..5843d5e40f 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx2.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_dgemmsup_rv_haswell_asm_6x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_dgemmsup_rv_haswell_asm_6x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -168,31 +168,31 @@ void bli_dgemmsup_rv_haswell_asm_6x2 prefetch(0, mem(rcx, rsi, 1, 5*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -200,19 +200,19 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -226,25 +226,25 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -252,18 +252,18 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -278,43 +278,43 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, 
xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -322,57 +322,57 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) vfmadd231pd(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) vmulpd(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -381,42 +381,42 @@ void bli_dgemmsup_rv_haswell_asm_6x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm14) vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -452,40 +452,40 @@ void bli_dgemmsup_rv_haswell_asm_6x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) @@ -517,13 +517,13 @@ void bli_dgemmsup_rv_haswell_asm_6x2 vmovupd(xmm1, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -566,8 +566,8 @@ void bli_dgemmsup_rv_haswell_asm_5x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -588,9 +588,9 @@ void bli_dgemmsup_rv_haswell_asm_5x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -605,7 +605,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -646,21 +646,21 @@ void bli_dgemmsup_rv_haswell_asm_5x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 5*8)) #endif @@ -672,17 +672,17 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 1 #if 0 @@ -696,23 +696,23 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -720,16 +720,16 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 3 @@ -744,41 +744,41 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - - - + + + 
dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -786,54 +786,54 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm12) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) vmulpd(xmm0, xmm12, xmm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // r13 = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -842,37 +842,37 @@ void bli_dgemmsup_rv_haswell_asm_5x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm12) vmovupd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -907,37 +907,37 @@ void bli_dgemmsup_rv_haswell_asm_5x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) @@ -947,7 +947,7 @@ void bli_dgemmsup_rv_haswell_asm_5x2 label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -967,13 +967,13 @@ void bli_dgemmsup_rv_haswell_asm_5x2 vmovhpd(xmm0, mem(rdx, rsi, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1016,8 +1016,8 @@ void bli_dgemmsup_rv_haswell_asm_4x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1038,9 +1038,9 @@ void bli_dgemmsup_rv_haswell_asm_4x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1055,7 +1055,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1089,31 +1089,31 @@ void bli_dgemmsup_rv_haswell_asm_4x2 prefetch(0, mem(rcx, rsi, 1, 3*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1121,14 +1121,14 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1142,20 +1142,20 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1163,13 +1163,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) 
vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 3 @@ -1184,89 +1184,89 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) vfmadd231pd(xmm0, xmm3, xmm10) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) vmulpd(xmm0, xmm10, xmm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1275,32 +1275,32 @@ void bli_dgemmsup_rv_haswell_asm_4x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm10) vmovupd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1326,32 +1326,32 @@ void bli_dgemmsup_rv_haswell_asm_4x2 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) @@ -1362,7 +1362,7 @@ void bli_dgemmsup_rv_haswell_asm_4x2 label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -1375,13 +1375,13 @@ void bli_dgemmsup_rv_haswell_asm_4x2 vmovupd(ymm6, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1424,8 +1424,8 @@ void bli_dgemmsup_rv_haswell_asm_3x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1446,9 +1446,9 @@ void bli_dgemmsup_rv_haswell_asm_3x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1463,7 +1463,7 @@ void bli_dgemmsup_rv_haswell_asm_3x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1496,31 +1496,31 @@ void bli_dgemmsup_rv_haswell_asm_3x2 prefetch(0, mem(rcx, rsi, 1, 2*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1528,12 +1528,12 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1547,18 +1547,18 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1566,11 +1566,11 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 3 @@ -1585,36 +1585,36 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1622,61 +1622,61 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm8) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) vmulpd(xmm0, xmm8, xmm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1690,10 +1690,10 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vfmadd231pd(mem(rcx, 0*32), xmm3, xmm8) vmovupd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1722,26 +1722,26 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. 
- - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1752,8 +1752,8 @@ void bli_dgemmsup_rv_haswell_asm_3x2 vmovupd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1781,12 +1781,12 @@ void bli_dgemmsup_rv_haswell_asm_3x2 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1829,8 +1829,8 @@ void bli_dgemmsup_rv_haswell_asm_2x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1851,9 +1851,9 @@ void bli_dgemmsup_rv_haswell_asm_2x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1868,7 +1868,7 @@ void bli_dgemmsup_rv_haswell_asm_2x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1900,41 +1900,41 @@ void bli_dgemmsup_rv_haswell_asm_2x2 prefetch(0, mem(rcx, rsi, 1, 1*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 1 #if 0 @@ -1943,29 +1943,29 @@ void bli_dgemmsup_rv_haswell_asm_2x2 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 3 @@ -1975,82 +1975,82 @@ void bli_dgemmsup_rv_haswell_asm_2x2 vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) vfmadd231pd(xmm0, xmm3, xmm6) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha vmulpd(xmm0, xmm6, xmm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2059,22 +2059,22 @@ void bli_dgemmsup_rv_haswell_asm_2x2 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm6) vmovupd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2094,34 +2094,34 @@ void bli_dgemmsup_rv_haswell_asm_2x2 jmp(.DDONE) // jump to end. - - - + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. 
label(.DCOLSTORBZ) - + vunpcklpd(xmm6, xmm4, xmm0) vunpckhpd(xmm6, xmm4, xmm1) @@ -2130,13 +2130,13 @@ void bli_dgemmsup_rv_haswell_asm_2x2 vmovupd(xmm1, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2179,8 +2179,8 @@ void bli_dgemmsup_rv_haswell_asm_1x2 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2201,9 +2201,9 @@ void bli_dgemmsup_rv_haswell_asm_1x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2218,7 +2218,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2249,31 +2249,31 @@ void bli_dgemmsup_rv_haswell_asm_1x2 prefetch(0, mem(rcx, rsi, 1, 0*8)) // prefetch c + 1*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2281,7 +2281,7 @@ void bli_dgemmsup_rv_haswell_asm_1x2 add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2294,21 +2294,21 @@ void bli_dgemmsup_rv_haswell_asm_1x2 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 3 @@ -2322,92 +2322,92 @@ void bli_dgemmsup_rv_haswell_asm_1x2 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(xmm0, xmm2, xmm4) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), xmm3, xmm4) vmovupd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2422,48 +2422,48 @@ void bli_dgemmsup_rv_haswell_asm_1x2 vmovlpd(xmm0, mem(rcx )) vmovhpd(xmm0, mem(rcx, rsi, 1)) - + //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) jmp(.DDONE) // jump to end. 
- + label(.DCOLSTORBZ) - + // begin I/O on columns 0-1 vmovlpd(xmm4, mem(rcx )) vmovhpd(xmm4, mem(rcx, rsi, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c index bdcf833e3d..6fb5eaf8ae 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx4.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_dgemmsup_rv_haswell_asm_6x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_dgemmsup_rv_haswell_asm_6x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -170,31 +170,31 @@ void bli_dgemmsup_rv_haswell_asm_6x4 prefetch(0, mem(rcx, rbp, 1, 5*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -202,19 +202,19 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -228,25 +228,25 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -254,18 +254,18 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -280,43 +280,43 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, 
ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -324,57 +324,57 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -383,42 +383,42 @@ void bli_dgemmsup_rv_haswell_asm_6x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -466,45 +466,45 @@ void bli_dgemmsup_rv_haswell_asm_6x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. @@ -539,13 +539,13 @@ void bli_dgemmsup_rv_haswell_asm_6x4 vmovupd(xmm4, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -588,8 +588,8 @@ void bli_dgemmsup_rv_haswell_asm_5x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -610,9 +610,9 @@ void bli_dgemmsup_rv_haswell_asm_5x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -627,7 +627,7 @@ void bli_dgemmsup_rv_haswell_asm_5x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -670,19 +670,19 @@ void bli_dgemmsup_rv_haswell_asm_5x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -696,17 +696,17 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 1 #if 0 @@ -720,16 +720,16 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 2 @@ -744,16 +744,16 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 3 @@ -768,41 +768,41 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -810,54 +810,54 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm12, ymm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -866,37 +866,37 @@ void bli_dgemmsup_rv_haswell_asm_5x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -943,41 +943,41 @@ void bli_dgemmsup_rv_haswell_asm_5x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - + jmp(.DDONE) // jump to end. @@ -1010,13 +1010,13 @@ void bli_dgemmsup_rv_haswell_asm_5x4 vmovhpd(xmm1, mem(rdx, rax, 1)) //lea(mem(rdx, rsi, 4), rdx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1059,8 +1059,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1081,9 +1081,9 @@ void bli_dgemmsup_rv_haswell_asm_4x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1098,7 +1098,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1134,8 +1134,8 @@ void bli_dgemmsup_rv_haswell_asm_4x4 prefetch(0, mem(rcx, rbp, 1, 3*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; @@ -1143,22 +1143,22 @@ void bli_dgemmsup_rv_haswell_asm_4x4 - + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1166,14 +1166,14 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1182,39 +1182,39 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 
0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 3 @@ -1224,128 +1224,128 @@ void bli_dgemmsup_rv_haswell_asm_4x4 vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm0, ymm3, ymm10) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm10, ymm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1377,33 +1377,33 @@ void bli_dgemmsup_rv_haswell_asm_4x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) @@ -1413,7 +1413,7 @@ void bli_dgemmsup_rv_haswell_asm_4x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) @@ -1431,12 +1431,12 @@ void bli_dgemmsup_rv_haswell_asm_4x4 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1479,8 +1479,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1501,9 +1501,9 @@ void bli_dgemmsup_rv_haswell_asm_3x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1518,7 +1518,7 @@ void bli_dgemmsup_rv_haswell_asm_3x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1553,31 +1553,31 @@ void bli_dgemmsup_rv_haswell_asm_3x4 prefetch(0, mem(rcx, rbp, 1, 2*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1585,12 +1585,12 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1604,18 +1604,18 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1623,11 +1623,11 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 3 @@ -1642,36 +1642,36 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1679,61 +1679,61 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm8, ymm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1747,10 +1747,10 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1791,26 +1791,26 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1821,8 +1821,8 @@ void bli_dgemmsup_rv_haswell_asm_3x4 vmovupd(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1858,12 +1858,12 @@ void bli_dgemmsup_rv_haswell_asm_3x4 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1906,8 +1906,8 @@ void bli_dgemmsup_rv_haswell_asm_2x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1928,9 +1928,9 @@ void bli_dgemmsup_rv_haswell_asm_2x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1945,7 +1945,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1979,31 +1979,31 @@ void bli_dgemmsup_rv_haswell_asm_2x4 prefetch(0, mem(rcx, rbp, 1, 1*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2012,8 +2012,8 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2028,10 +2028,10 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2044,7 +2044,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 3 @@ -2060,32 +2060,32 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2094,42 +2094,42 @@ void bli_dgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm0, ymm3, ymm6) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm6, ymm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2138,22 +2138,22 @@ void bli_dgemmsup_rv_haswell_asm_2x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2179,24 +2179,24 @@ void bli_dgemmsup_rv_haswell_asm_2x4 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2207,7 +2207,7 @@ void bli_dgemmsup_rv_haswell_asm_2x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vunpcklpd(ymm6, ymm4, ymm0) vunpckhpd(ymm6, ymm4, ymm1) @@ -2220,13 +2220,13 @@ void bli_dgemmsup_rv_haswell_asm_2x4 vmovupd(xmm4, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2269,8 +2269,8 @@ void bli_dgemmsup_rv_haswell_asm_1x4 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2291,9 +2291,9 @@ void bli_dgemmsup_rv_haswell_asm_1x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2308,7 +2308,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2341,27 +2341,27 @@ void bli_dgemmsup_rv_haswell_asm_1x4 prefetch(0, mem(rcx, rbp, 1, 0*8)) // prefetch c + 3*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2372,8 +2372,8 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2386,21 +2386,21 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 3 @@ -2414,27 +2414,27 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2446,41 +2446,41 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), r14) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2488,17 +2488,17 @@ void bli_dgemmsup_rv_haswell_asm_1x4 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2520,15 +2520,15 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) - + //lea(mem(rcx, rsi, 4), rcx) jmp(.DDONE) // jump to end. 
- - - - + + + + label(.DBETAZERO) @@ -2536,10 +2536,10 @@ void bli_dgemmsup_rv_haswell_asm_1x4 jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) @@ -2549,7 +2549,7 @@ void bli_dgemmsup_rv_haswell_asm_1x4 label(.DCOLSTORBZ) - + // begin I/O on columns 0-3 vmovupd(ymm4, ymm0) @@ -2560,14 +2560,14 @@ void bli_dgemmsup_rv_haswell_asm_1x4 vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - - + + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c index 9da1e7b838..2b7222a344 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx6.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... 
-------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. @@ -93,8 +93,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,15 +115,15 @@ void bli_dgemmsup_rv_haswell_asm_6x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -180,18 +180,18 @@ void bli_dgemmsup_rv_haswell_asm_6x6 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -208,14 +208,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -224,7 +224,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -241,14 +241,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -256,8 +256,8 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 @@ -274,14 +274,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -289,7 +289,7 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, 
ymm15) - + // ---------------------------------- iteration 3 @@ -307,14 +307,14 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -322,50 +322,50 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -373,22 +373,22 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -401,24 +401,24 @@ void bli_dgemmsup_rv_haswell_asm_6x6 vmulpd(xmm0, xmm13, xmm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -427,60 +427,60 @@ void bli_dgemmsup_rv_haswell_asm_6x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm15) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -555,51 +555,51 @@ void bli_dgemmsup_rv_haswell_asm_6x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -656,12 +656,12 @@ void bli_dgemmsup_rv_haswell_asm_6x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -704,8 +704,8 @@ void bli_dgemmsup_rv_haswell_asm_5x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -726,15 +726,15 @@ void bli_dgemmsup_rv_haswell_asm_5x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -790,18 +790,18 @@ void bli_dgemmsup_rv_haswell_asm_5x6 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -818,20 +818,20 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -848,20 +848,20 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 @@ -878,19 +878,19 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -908,82 +908,82 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) 
vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -994,24 +994,24 @@ void bli_dgemmsup_rv_haswell_asm_5x6 vmulpd(xmm0, xmm11, xmm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(xmm0, xmm13, xmm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1020,52 +1020,52 @@ void bli_dgemmsup_rv_haswell_asm_5x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm13) vmovupd(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -1138,46 +1138,46 @@ void bli_dgemmsup_rv_haswell_asm_5x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1231,12 +1231,12 @@ void bli_dgemmsup_rv_haswell_asm_5x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1279,8 +1279,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1301,15 +1301,15 @@ void bli_dgemmsup_rv_haswell_asm_4x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -1365,17 +1365,17 @@ void bli_dgemmsup_rv_haswell_asm_4x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -1392,7 +1392,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1401,7 +1401,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1418,7 +1418,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1426,8 +1426,8 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 1 @@ -1444,7 +1444,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1452,7 +1452,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1470,7 +1470,7 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1478,43 +1478,43 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. 
- - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1522,22 +1522,22 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) @@ -1546,24 +1546,24 @@ void bli_dgemmsup_rv_haswell_asm_4x6 vmulpd(xmm0, xmm9, xmm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(xmm0, xmm11, xmm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1572,44 +1572,44 @@ void bli_dgemmsup_rv_haswell_asm_4x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm11) vmovupd(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1658,41 +1658,41 @@ void bli_dgemmsup_rv_haswell_asm_4x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1729,9 +1729,9 @@ void bli_dgemmsup_rv_haswell_asm_4x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -1777,8 +1777,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1799,9 +1799,9 @@ void bli_dgemmsup_rv_haswell_asm_3x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. 
- + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1816,7 +1816,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1854,31 +1854,31 @@ void bli_dgemmsup_rv_haswell_asm_3x6 prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 5*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1889,13 +1889,13 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1912,15 +1912,15 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -1935,12 +1935,12 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + 
vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -1958,37 +1958,37 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1999,65 +1999,65 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(xmm0, xmm9, xmm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -2080,10 +2080,10 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231pd(mem(rcx, 1*32), xmm3, xmm9) vmovupd(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -2124,7 +2124,7 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-5 @@ -2155,26 +2155,26 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vfmadd231sd(mem(rdx, rsi, 1), xmm3, xmm13) vmovsd(xmm12, mem(rdx )) vmovsd(xmm13, mem(rdx, rsi, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2186,8 +2186,8 @@ void bli_dgemmsup_rv_haswell_asm_3x6 vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2244,12 +2244,12 @@ void bli_dgemmsup_rv_haswell_asm_3x6 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2292,8 +2292,8 @@ void bli_dgemmsup_rv_haswell_asm_2x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2314,15 +2314,15 @@ void bli_dgemmsup_rv_haswell_asm_2x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2376,17 +2376,17 @@ void bli_dgemmsup_rv_haswell_asm_2x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2405,7 +2405,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -2423,8 +2423,8 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 1 @@ -2442,7 +2442,7 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -2461,36 +2461,36 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; @@ -2498,44 +2498,44 @@ void bli_dgemmsup_rv_haswell_asm_2x6 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(xmm0, xmm7, xmm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2544,28 +2544,28 @@ void bli_dgemmsup_rv_haswell_asm_2x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm7) vmovupd(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2614,31 +2614,31 @@ void bli_dgemmsup_rv_haswell_asm_2x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -2675,9 +2675,9 @@ void bli_dgemmsup_rv_haswell_asm_2x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) @@ -2723,8 +2723,8 @@ void bli_dgemmsup_rv_haswell_asm_1x6 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2745,15 +2745,15 @@ void bli_dgemmsup_rv_haswell_asm_1x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + //lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -2806,17 +2806,17 @@ void bli_dgemmsup_rv_haswell_asm_1x6 #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -2832,7 +2832,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -2847,8 +2847,8 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 1 @@ -2863,7 +2863,7 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -2879,76 +2879,76 @@ void bli_dgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(xmm0, xmm5, xmm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2957,20 +2957,20 @@ void bli_dgemmsup_rv_haswell_asm_1x6 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), xmm3, xmm5) vmovupd(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3007,26 +3007,26 @@ void bli_dgemmsup_rv_haswell_asm_1x6 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -3052,9 +3052,9 @@ void bli_dgemmsup_rv_haswell_asm_1x6 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) diff --git a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c index a6c8f0e43d..b3a7c17ca4 100644 --- a/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c +++ b/kernels/haswell/3/sup/d6x8/bli_gemmsup_rv_haswell_asm_dMx8.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -108,8 +108,8 @@ void bli_dgemmsup_rv_haswell_asm_6x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 0 @@ -178,7 +178,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 // Advance C and A pointers by the mrs and nrs we just // used, and decrement m_left. cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; - } + } } // Advance C and B pointers by the mrs and nrs we just used, and @@ -208,9 +208,9 @@ void bli_dgemmsup_rv_haswell_asm_6x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -225,7 +225,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -275,25 +275,25 @@ void bli_dgemmsup_rv_haswell_asm_6x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -304,14 +304,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -320,7 +320,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -337,14 +337,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -352,14 +352,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -370,14 +370,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, 
ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -385,7 +385,7 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -403,14 +403,14 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -418,50 +418,50 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) vbroadcastsd(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -469,22 +469,22 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vfmadd231pd(ymm1, ymm2, ymm13) vfmadd231pd(ymm0, ymm3, ymm14) vfmadd231pd(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -497,24 +497,24 @@ void bli_dgemmsup_rv_haswell_asm_6x8 vmulpd(ymm0, ymm13, ymm13) vmulpd(ymm0, ymm14, ymm14) vmulpd(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -523,60 +523,60 @@ void bli_dgemmsup_rv_haswell_asm_6x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm14) vmovupd(ymm14, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm15) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -663,51 +663,51 @@ void bli_dgemmsup_rv_haswell_asm_6x8 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm14, mem(rcx, 0*32)) vmovupd(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -772,12 +772,12 @@ void bli_dgemmsup_rv_haswell_asm_6x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -820,8 +820,8 @@ void bli_dgemmsup_rv_haswell_asm_5x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -842,15 +842,15 @@ void bli_dgemmsup_rv_haswell_asm_5x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a lea(mem(, r8, 8), r8) // rs_a *= sizeof(double) lea(mem(, r9, 8), r9) // cs_a *= sizeof(double) - + lea(mem(r8, r8, 2), r13) // r13 = 3*rs_a //lea(mem(r8, r8, 4), r15) // r15 = 5*rs_a @@ -909,18 +909,18 @@ void bli_dgemmsup_rv_haswell_asm_5x8 lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 @@ -937,20 +937,20 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -967,26 +967,26 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -997,19 +997,19 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 3 @@ -1027,37 +1027,37 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - 
+ vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP #if 0 @@ -1068,41 +1068,41 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) vbroadcastsd(mem(rax, r8, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + vbroadcastsd(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm12) vfmadd231pd(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1113,24 +1113,24 @@ void bli_dgemmsup_rv_haswell_asm_5x8 vmulpd(ymm0, ymm11, ymm11) vmulpd(ymm0, ymm12, ymm12) vmulpd(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1139,52 +1139,52 @@ void bli_dgemmsup_rv_haswell_asm_5x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm6) vmovupd(ymm6, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm8) vmovupd(ymm8, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm10) vmovupd(ymm10, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm12) vmovupd(ymm12, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm13) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -1269,46 +1269,46 @@ void bli_dgemmsup_rv_haswell_asm_5x8 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovupd(ymm12, mem(rcx, 0*32)) vmovupd(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -1367,9 +1367,9 @@ void bli_dgemmsup_rv_haswell_asm_5x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) @@ -1415,8 +1415,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1437,9 +1437,9 @@ void bli_dgemmsup_rv_haswell_asm_4x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1454,7 +1454,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1496,31 +1496,31 @@ void bli_dgemmsup_rv_haswell_asm_4x8 prefetch(0, mem(rdx, rsi, 2, 3*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1531,7 +1531,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1539,8 +1539,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 1 #if 0 @@ -1557,7 +1557,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1565,10 +1565,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 2 - + #if 1 
prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -1583,7 +1583,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1591,7 +1591,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1609,7 +1609,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1617,27 +1617,27 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -1653,7 +1653,7 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) vbroadcastsd(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1661,22 +1661,22 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(ymm1, ymm2, ymm9) vfmadd231pd(ymm0, ymm3, ymm10) vfmadd231pd(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) @@ -1685,38 +1685,38 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vmulpd(ymm0, ymm9, ymm9) vmulpd(ymm0, ymm10, ymm10) vmulpd(ymm0, ymm11, ymm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -1747,10 +1747,10 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vfmadd231pd(mem(rcx, 1*32), ymm3, ymm11) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. - + label(.DCOLSTORED) @@ -1805,19 +1805,19 @@ void bli_dgemmsup_rv_haswell_asm_4x8 jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -1833,8 +1833,8 @@ void bli_dgemmsup_rv_haswell_asm_4x8 vmovupd(ymm10, mem(rcx, 0*32)) vmovupd(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -1875,12 +1875,12 @@ void bli_dgemmsup_rv_haswell_asm_4x8 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -1923,8 +1923,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1945,9 +1945,9 @@ void bli_dgemmsup_rv_haswell_asm_3x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1962,7 +1962,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2003,27 +2003,27 @@ void bli_dgemmsup_rv_haswell_asm_3x8 prefetch(0, mem(rdx, rsi, 2, 2*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2038,13 +2038,13 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2061,15 +2061,15 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2084,12 +2084,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2107,32 +2107,32 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2148,65 +2148,65 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + vbroadcastsd(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm8) vfmadd231pd(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) vmulpd(ymm0, ymm8, ymm8) vmulpd(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -2229,10 +2229,10 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vfmadd231pd(mem(rcx, 1*32), ymm3, ymm9) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
- + label(.DCOLSTORED) @@ -2273,7 +2273,7 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + lea(mem(rdx, rsi, 4), rdx) // begin I/O on columns 4-7 @@ -2312,26 +2312,26 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovsd(xmm13, mem(rdx, rsi, 1)) vmovsd(xmm14, mem(rdx, rsi, 2)) vmovsd(xmm15, mem(rdx, rax, 1)) - + //lea(mem(rdx, rsi, 4), rdx) jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) - + cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2343,8 +2343,8 @@ void bli_dgemmsup_rv_haswell_asm_3x8 vmovupd(ymm8, mem(rcx, 0*32)) vmovupd(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2409,12 +2409,12 @@ void bli_dgemmsup_rv_haswell_asm_3x8 //lea(mem(rdx, rsi, 4), rdx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2457,8 +2457,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2479,9 +2479,9 @@ void bli_dgemmsup_rv_haswell_asm_2x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2496,7 +2496,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2536,27 +2536,27 @@ void bli_dgemmsup_rv_haswell_asm_2x8 prefetch(0, mem(rdx, rsi, 2, 1*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - - + + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2572,8 +2572,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 1 #if 0 @@ -2591,10 +2591,10 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -2610,7 +2610,7 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -2629,27 +2629,27 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. 
je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -2666,44 +2666,44 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(ymm1, ymm2, ymm5) vfmadd231pd(ymm0, ymm3, ymm6) vfmadd231pd(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.DPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) vmulpd(ymm0, ymm6, ymm6) vmulpd(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2712,12 +2712,12 @@ void bli_dgemmsup_rv_haswell_asm_2x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) @@ -2732,8 +2732,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vfmadd231pd(mem(rcx, 1*32), ymm3, ymm7) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2778,19 +2778,19 @@ void bli_dgemmsup_rv_haswell_asm_2x8 jmp(.DDONE) // jump to end. - - + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. 
jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2798,8 +2798,8 @@ void bli_dgemmsup_rv_haswell_asm_2x8 vmovupd(ymm6, mem(rcx, 0*32)) vmovupd(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -2832,12 +2832,12 @@ void bli_dgemmsup_rv_haswell_asm_2x8 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.DDONE) - - + + end_asm( : // output operands (none) @@ -2880,8 +2880,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8 double* restrict b, inc_t rs_b0, inc_t cs_b0, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2902,9 +2902,9 @@ void bli_dgemmsup_rv_haswell_asm_1x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2919,7 +2919,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 8), r10) // rs_b *= sizeof(double) //lea(mem(, r11, 8), r11) // cs_b *= sizeof(double) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2958,27 +2958,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8 prefetch(0, mem(rdx, rsi, 2, 0*8)) // prefetch c + 7*cs_c label(.DPOSTPFETCH) // done prefetching c - + #if 1 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.DCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.DLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 - + #if 1 prefetch(0, mem(rdx, 4*8)) #endif @@ -2991,8 +2991,8 @@ void bli_dgemmsup_rv_haswell_asm_1x8 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 1 #if 0 @@ -3002,15 +3002,15 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 2 - + #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif @@ -3023,7 +3023,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -3039,27 +3039,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8 add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.DLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.DCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.DPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.DLOOPKLEFT) // EDGE LOOP - + #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) @@ -3068,18 +3068,18 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovupd(mem(rbx, 0*32), ymm0) vmovupd(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastsd(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231pd(ymm0, ymm2, ymm4) vfmadd231pd(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.DLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.DPOSTACCUM) @@ -3088,27 +3088,27 @@ void bli_dgemmsup_rv_haswell_asm_1x8 mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rax), ymm0) // load alpha and duplicate vbroadcastsd(mem(rbx), ymm3) // load beta and duplicate - + vmulpd(ymm0, ymm4, ymm4) // scale by alpha vmulpd(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(double) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorpd(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomisd(xmm0, xmm3) // set ZF if beta == 0. je(.DBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -3116,20 +3116,20 @@ void bli_dgemmsup_rv_haswell_asm_1x8 cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORED) // jump to column storage case - - + + label(.DROWSTORED) - - + + vfmadd231pd(mem(rcx, 0*32), ymm3, ymm4) vmovupd(ymm4, mem(rcx, 0*32)) vfmadd231pd(mem(rcx, 1*32), ymm3, ymm5) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. @@ -3150,7 +3150,7 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovhpd(xmm0, mem(rcx, rsi, 1)) vmovlpd(xmm1, mem(rcx, rsi, 2)) vmovhpd(xmm1, mem(rcx, rax, 1)) - + lea(mem(rcx, rsi, 4), rcx) // begin I/O on columns 4-7 @@ -3173,26 +3173,26 @@ void bli_dgemmsup_rv_haswell_asm_1x8 jmp(.DDONE) // jump to end. - - - - + + + + label(.DBETAZERO) cmp(imm(8), rdi) // set ZF if (8*rs_c) == 8. jz(.DCOLSTORBZ) // jump to column storage case - - + + label(.DROWSTORBZ) - - + + vmovupd(ymm4, mem(rcx, 0*32)) vmovupd(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.DDONE) // jump to end. 
@@ -3220,14 +3220,14 @@ void bli_dgemmsup_rv_haswell_asm_1x8 vmovhpd(xmm1, mem(rcx, rax, 1)) //lea(mem(rcx, rsi, 4), rcx) - - - - + + + + label(.DDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c index dad5458b9a..98b557fae1 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_r_haswell_ref_sMx1.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -103,8 +103,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ for ( dim_t i = 0; i < mdim; ++i ) \ @@ -175,8 +175,8 @@ void PASTEMAC(ch,opname) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ for ( dim_t i = 0; i < m; ++i ) \ diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c index 1eb8d926c9..c17b0b2754 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx1.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x1 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -164,18 +164,18 @@ void bli_sgemmsup_rd_haswell_asm_6x1 prefetch(0, mem(r10, rdi, 2, 0*4)) // prefetch c + 5*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -206,7 +206,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -233,7 +233,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -287,27 +287,27 @@ void bli_sgemmsup_rd_haswell_asm_6x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -336,21 +336,21 @@ void bli_sgemmsup_rd_haswell_asm_6x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -358,7 +358,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -381,12 +381,12 @@ void bli_sgemmsup_rd_haswell_asm_6x1 add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -399,7 +399,7 @@ void bli_sgemmsup_rd_haswell_asm_6x1 // ymm10 // ymm12 // ymm14 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -443,8 +443,8 @@ void bli_sgemmsup_rd_haswell_asm_6x1 // xmm12[0] = sum(ymm12) // xmm14[0] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -452,109 +452,109 @@ void bli_sgemmsup_rd_haswell_asm_6x1 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovss(xmm6, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovss(xmm8, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovss(xmm10, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovss(xmm12, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovss(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(xmm6, mem(rcx)) add(rdi, rcx) - + vmovss(xmm8, mem(rcx)) add(rdi, rcx) - + vmovss(xmm10, mem(rcx)) add(rdi, rcx) - + vmovss(xmm12, mem(rcx)) add(rdi, rcx) - + vmovss(xmm14, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -598,8 +598,8 @@ void bli_sgemmsup_rd_haswell_asm_3x1 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -627,7 +627,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -645,7 +645,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -684,18 +684,18 @@ void bli_sgemmsup_rd_haswell_asm_3x1 prefetch(0, mem(rcx, rdi, 2, 0*4)) // prefetch c + 2*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. - - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -717,7 +717,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -735,7 +735,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -771,27 +771,27 @@ void bli_sgemmsup_rd_haswell_asm_3x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -811,21 +811,21 @@ void bli_sgemmsup_rd_haswell_asm_3x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. 
- - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -833,7 +833,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -847,12 +847,12 @@ void bli_sgemmsup_rd_haswell_asm_3x1 add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -862,7 +862,7 @@ void bli_sgemmsup_rd_haswell_asm_3x1 // ymm4 // ymm6 // ymm8 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -885,8 +885,8 @@ void bli_sgemmsup_rd_haswell_asm_3x1 // xmm6[0] = sum(ymm6) // xmm8[0] = sum(ymm8) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -894,82 +894,82 @@ void bli_sgemmsup_rd_haswell_asm_3x1 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovss(xmm6, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovss(xmm8, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(xmm6, mem(rcx)) add(rdi, rcx) - + vmovss(xmm8, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1013,8 +1013,8 @@ void bli_sgemmsup_rd_haswell_asm_2x1 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1042,7 +1042,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1060,7 +1060,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1097,18 +1097,18 @@ void bli_sgemmsup_rd_haswell_asm_2x1 prefetch(0, mem(rcx, rdi, 1, 0*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1127,7 +1127,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -1142,7 +1142,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1172,27 +1172,27 @@ void bli_sgemmsup_rd_haswell_asm_2x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1209,21 +1209,21 @@ void bli_sgemmsup_rd_haswell_asm_2x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1231,7 +1231,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -1242,12 +1242,12 @@ void bli_sgemmsup_rd_haswell_asm_2x1 add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1256,7 +1256,7 @@ void bli_sgemmsup_rd_haswell_asm_2x1 // ymm4 // ymm6 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1272,8 +1272,8 @@ void bli_sgemmsup_rd_haswell_asm_2x1 // xmm4[0] = sum(ymm4) // xmm6[0] = sum(ymm6) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1281,73 +1281,73 @@ void bli_sgemmsup_rd_haswell_asm_2x1 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovss(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovss(xmm4, mem(rcx)) add(rdi, rcx) - + vmovss(xmm6, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1391,8 +1391,8 @@ void bli_sgemmsup_rd_haswell_asm_1x1 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1420,7 +1420,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1438,7 +1438,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1473,18 +1473,18 @@ void bli_sgemmsup_rd_haswell_asm_1x1 prefetch(0, mem(rcx, 0*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1500,7 +1500,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -1512,7 +1512,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1536,27 +1536,27 @@ void bli_sgemmsup_rd_haswell_asm_1x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1570,21 +1570,21 @@ void bli_sgemmsup_rd_haswell_asm_1x1 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1592,7 +1592,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -1600,12 +1600,12 @@ void bli_sgemmsup_rd_haswell_asm_1x1 add(imm(1*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1614,7 +1614,7 @@ void bli_sgemmsup_rd_haswell_asm_1x1 // ymm4 // ymm6 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1624,8 +1624,8 @@ void bli_sgemmsup_rd_haswell_asm_1x1 // xmm4[0] = sum(ymm4) // xmm6[0] = sum(ymm6) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1633,65 +1633,65 @@ void bli_sgemmsup_rd_haswell_asm_1x1 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovss(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovss(xmm4, mem(rcx)) add(rdi, rcx) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovss(xmm4, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c index 1d3d88309f..5fb91e6343 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx12.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -194,18 +194,18 @@ void bli_sgemmsup_rd_haswell_asm_6x12 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -271,7 +271,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -333,27 +333,27 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -364,7 +364,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -386,21 +386,21 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -408,12 +408,12 @@ void bli_sgemmsup_rd_haswell_asm_6x12 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -435,22 +435,22 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -466,7 +466,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -482,7 +482,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -500,8 +500,8 @@ void bli_sgemmsup_rd_haswell_asm_6x12 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -509,73 +509,73 @@ void bli_sgemmsup_rd_haswell_asm_6x12 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) 
// load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -599,7 +599,7 @@ void bli_sgemmsup_rd_haswell_asm_6x12 label(.SRETURN) - + end_asm( : // output operands (none) @@ -644,8 +644,8 @@ void bli_sgemmsup_rd_haswell_asm_2x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -670,7 +670,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -742,18 +742,18 @@ void bli_sgemmsup_rd_haswell_asm_2x12 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -782,7 +782,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -808,7 +808,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -859,27 +859,27 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -888,7 +888,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -906,21 +906,21 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -928,11 +928,11 @@ void bli_sgemmsup_rd_haswell_asm_2x12 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -950,21 +950,21 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -980,7 +980,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -998,8 +998,8 @@ void bli_sgemmsup_rd_haswell_asm_2x12 // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1007,65 +1007,65 @@ void bli_sgemmsup_rd_haswell_asm_2x12 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1077,7 +1077,7 @@ void bli_sgemmsup_rd_haswell_asm_2x12 label(.SRETURN) - + end_asm( : // output operands (none) @@ -1121,8 +1121,8 @@ void bli_sgemmsup_rd_haswell_asm_1x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1147,7 +1147,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1165,7 +1165,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1214,18 +1214,18 @@ void bli_sgemmsup_rd_haswell_asm_1x12 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1248,7 +1248,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1269,7 +1269,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif @@ -1309,34 +1309,34 @@ void bli_sgemmsup_rd_haswell_asm_1x12 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1350,21 +1350,21 @@ void bli_sgemmsup_rd_haswell_asm_1x12 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1372,10 +1372,10 @@ void bli_sgemmsup_rd_haswell_asm_1x12 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1389,21 +1389,21 @@ void bli_sgemmsup_rd_haswell_asm_1x12 add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1421,8 +1421,8 @@ void bli_sgemmsup_rd_haswell_asm_1x12 // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1430,57 +1430,57 @@ void bli_sgemmsup_rd_haswell_asm_1x12 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1492,7 +1492,7 @@ void bli_sgemmsup_rd_haswell_asm_1x12 label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c index bbb75a6fcd..1398c3da79 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx16.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 16; @@ -176,7 +176,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -194,7 +194,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -269,18 +269,18 @@ void bli_sgemmsup_rd_haswell_asm_6x16 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -315,7 +315,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -346,7 +346,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -408,27 +408,27 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -439,7 +439,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -461,21 +461,21 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -483,12 +483,12 @@ void bli_sgemmsup_rd_haswell_asm_6x16 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -510,22 +510,22 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -541,7 +541,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -557,7 +557,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -575,8 +575,8 @@ void bli_sgemmsup_rd_haswell_asm_6x16 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -584,73 +584,73 @@ void bli_sgemmsup_rd_haswell_asm_6x16 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) 
// load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -674,7 +674,7 @@ void bli_sgemmsup_rd_haswell_asm_6x16 label(.SRETURN) - + end_asm( : // output operands (none) @@ -758,8 +758,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -784,7 +784,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -802,7 +802,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -856,18 +856,18 @@ void bli_sgemmsup_rd_haswell_asm_2x16 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -897,7 +897,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -923,7 +923,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -975,27 +975,27 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1005,7 +1005,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1023,21 +1023,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1045,11 +1045,11 @@ void bli_sgemmsup_rd_haswell_asm_2x16 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -1067,21 +1067,21 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1097,7 +1097,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1115,8 +1115,8 @@ void bli_sgemmsup_rd_haswell_asm_2x16 // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1124,65 +1124,65 @@ void bli_sgemmsup_rd_haswell_asm_2x16 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1194,7 +1194,7 @@ void bli_sgemmsup_rd_haswell_asm_2x16 label(.SRETURN) - + end_asm( : // output operands (none) @@ -1238,8 +1238,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1264,7 +1264,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. //mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1282,7 +1282,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c //mov(var(rs_c), rdi) // load rs_c @@ -1331,18 +1331,18 @@ void bli_sgemmsup_rd_haswell_asm_1x16 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1367,7 +1367,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1388,7 +1388,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1430,27 +1430,27 @@ void bli_sgemmsup_rd_haswell_asm_1x16 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1459,7 +1459,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1473,21 +1473,21 @@ void bli_sgemmsup_rd_haswell_asm_1x16 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1495,10 +1495,10 @@ void bli_sgemmsup_rd_haswell_asm_1x16 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1512,20 +1512,20 @@ void bli_sgemmsup_rd_haswell_asm_1x16 add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 - + // ymm4 ymm7 ymm10 ymm13 + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1543,8 +1543,8 @@ void bli_sgemmsup_rd_haswell_asm_1x16 // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1552,57 +1552,57 @@ void bli_sgemmsup_rd_haswell_asm_1x16 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1614,7 +1614,7 @@ void bli_sgemmsup_rd_haswell_asm_1x16 label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c index 1e3240350b..75c6872674 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx2.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -170,18 +170,18 @@ void bli_sgemmsup_rd_haswell_asm_6x2 prefetch(0, mem(r10, rdi, 2, 1*4)) // prefetch c + 5*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -219,7 +219,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -253,7 +253,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -321,27 +321,27 @@ void bli_sgemmsup_rd_haswell_asm_6x2 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -377,21 +377,21 @@ void bli_sgemmsup_rd_haswell_asm_6x2 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -399,7 +399,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -429,12 +429,12 @@ void bli_sgemmsup_rd_haswell_asm_6x2 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -447,7 +447,7 @@ void bli_sgemmsup_rd_haswell_asm_6x2 // ymm10 ymm11 // ymm12 ymm13 // ymm14 ymm15 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -491,8 +491,8 @@ void bli_sgemmsup_rd_haswell_asm_6x2 // xmm12[0:1] = sum(ymm12) sum(ymm13) // xmm14[0:1] = sum(ymm14) sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -500,109 +500,109 @@ void bli_sgemmsup_rd_haswell_asm_6x2 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm10) vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm12) vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm14) vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm8, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm10, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm12, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm14, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -646,8 +646,8 @@ void bli_sgemmsup_rd_haswell_asm_3x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -675,7 +675,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -693,7 +693,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -735,18 +735,18 @@ void bli_sgemmsup_rd_haswell_asm_3x2 prefetch(0, mem(rcx, rdi, 2, 1*4)) // prefetch c + 2*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. - - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -772,7 +772,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -794,7 +794,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -837,27 +837,27 @@ void bli_sgemmsup_rd_haswell_asm_3x2 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -881,21 +881,21 @@ void bli_sgemmsup_rd_haswell_asm_3x2 vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. 
- - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -903,7 +903,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -921,12 +921,12 @@ void bli_sgemmsup_rd_haswell_asm_3x2 vfmadd231ps(ymm0, ymm3, ymm8) vfmadd231ps(ymm1, ymm3, ymm9) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -936,7 +936,7 @@ void bli_sgemmsup_rd_haswell_asm_3x2 // ymm4 ymm5 // ymm6 ymm7 // ymm8 ymm9 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -959,8 +959,8 @@ void bli_sgemmsup_rd_haswell_asm_3x2 // xmm6[0:1] = sum(ymm6) sum(ymm7) // xmm8[0:1] = sum(ymm8) sum(ymm9) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -968,83 +968,83 @@ void bli_sgemmsup_rd_haswell_asm_3x2 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm8) vmovsd(xmm8, mem(rcx)) add(rdi, rcx) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm8, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1088,8 +1088,8 @@ void bli_sgemmsup_rd_haswell_asm_2x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1117,7 +1117,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1135,7 +1135,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1174,18 +1174,18 @@ void bli_sgemmsup_rd_haswell_asm_2x2 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1207,7 +1207,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -1225,7 +1225,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1260,27 +1260,27 @@ void bli_sgemmsup_rd_haswell_asm_2x2 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1300,21 +1300,21 @@ void bli_sgemmsup_rd_haswell_asm_2x2 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1322,7 +1322,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -1336,12 +1336,12 @@ void bli_sgemmsup_rd_haswell_asm_2x2 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1350,7 +1350,7 @@ void bli_sgemmsup_rd_haswell_asm_2x2 // ymm4 ymm5 // ymm6 ymm7 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1366,8 +1366,8 @@ void bli_sgemmsup_rd_haswell_asm_2x2 // xmm4[0:1] = sum(ymm4) sum(ymm5) // xmm6[0:1] = sum(ymm6) sum(ymm7) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1375,73 +1375,73 @@ void bli_sgemmsup_rd_haswell_asm_2x2 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm6) vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1485,8 +1485,8 @@ void bli_sgemmsup_rd_haswell_asm_1x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1514,7 +1514,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1532,7 +1532,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 //lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1568,18 +1568,18 @@ void bli_sgemmsup_rd_haswell_asm_1x2 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1597,7 +1597,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) - + // ---------------------------------- iteration 1 vmovups(mem(rbx ), ymm0) @@ -1611,7 +1611,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1638,27 +1638,27 @@ void bli_sgemmsup_rd_haswell_asm_1x2 add(imm(8*4), rax) // a += 8*cs_a = 8*4; vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -1674,21 +1674,21 @@ void bli_sgemmsup_rd_haswell_asm_1x2 vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1696,7 +1696,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rbx ), xmm0) vmovss(mem(rbx, r11, 1), xmm1) add(imm(1*4), rbx) // b += 8*rs_b = 8*4; @@ -1706,12 +1706,12 @@ void bli_sgemmsup_rd_haswell_asm_1x2 vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + @@ -1719,7 +1719,7 @@ void bli_sgemmsup_rd_haswell_asm_1x2 label(.SPOSTACCUM) // ymm4 ymm5 - + vhaddps( ymm5, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1728,8 +1728,8 @@ void bli_sgemmsup_rd_haswell_asm_1x2 // xmm4[0:1] = sum(ymm4) sum(ymm5) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1737,64 +1737,64 @@ void bli_sgemmsup_rd_haswell_asm_1x2 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vmovsd(mem(rcx), xmm0) vfmadd231ps(xmm0, xmm3, xmm4) vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx)) //add(rdi, rcx) - - - - - + + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c index 9d4e9d51d2..80be4e9324 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx4.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -173,18 +173,18 @@ void bli_sgemmsup_rd_haswell_asm_6x4 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -219,7 +219,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -250,7 +250,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -312,27 +312,27 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -343,7 +343,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -365,21 +365,21 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -387,12 +387,12 @@ void bli_sgemmsup_rd_haswell_asm_6x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -414,22 +414,22 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -445,7 +445,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -461,7 +461,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -479,8 +479,8 @@ void bli_sgemmsup_rd_haswell_asm_6x4 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -488,73 +488,73 @@ void bli_sgemmsup_rd_haswell_asm_6x4 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load 
cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -571,7 +571,7 @@ void bli_sgemmsup_rd_haswell_asm_6x4 label(.SRETURN) - + end_asm( : // output operands (none) @@ -616,8 +616,8 @@ void bli_sgemmsup_rd_haswell_asm_2x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -642,7 +642,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -660,7 +660,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -701,18 +701,18 @@ void bli_sgemmsup_rd_haswell_asm_2x4 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -741,7 +741,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -767,7 +767,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -818,27 +818,27 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -847,7 +847,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -865,21 +865,21 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -887,11 +887,11 @@ void bli_sgemmsup_rd_haswell_asm_2x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -909,21 +909,21 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -939,7 +939,7 @@ void bli_sgemmsup_rd_haswell_asm_2x4 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -957,8 +957,8 @@ void bli_sgemmsup_rd_haswell_asm_2x4 // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -966,70 +966,70 @@ void bli_sgemmsup_rd_haswell_asm_2x4 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) @@ -1073,8 +1073,8 @@ void bli_sgemmsup_rd_haswell_asm_1x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1099,7 +1099,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1117,7 +1117,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), rcx) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1153,18 +1153,18 @@ void bli_sgemmsup_rd_haswell_asm_1x4 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1187,7 +1187,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1208,7 +1208,7 @@ void bli_sgemmsup_rd_haswell_asm_1x4 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif @@ -1248,34 +1248,34 @@ void bli_sgemmsup_rd_haswell_asm_1x4 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1289,21 +1289,21 @@ void bli_sgemmsup_rd_haswell_asm_1x4 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1311,10 +1311,10 @@ void bli_sgemmsup_rd_haswell_asm_1x4 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1328,20 +1328,20 @@ void bli_sgemmsup_rd_haswell_asm_1x4 add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 - + // ymm4 ymm7 ymm10 ymm13 + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1359,8 +1359,8 @@ void bli_sgemmsup_rd_haswell_asm_1x4 // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1368,62 +1368,62 @@ void bli_sgemmsup_rd_haswell_asm_1x4 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c index 788912ecf6..3a82e9b3e1 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rd_haswell_asm_sMx8.c @@ -72,8 +72,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -101,7 +101,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + //mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -119,7 +119,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + //mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -194,18 +194,18 @@ void bli_sgemmsup_rd_haswell_asm_6x8 #endif lea(mem(r8, r8, 4), rbp) // rbp = 5*rs_a - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -240,7 +240,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -271,7 +271,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -333,27 +333,27 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -364,7 +364,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vmovups(mem(rax, r8, 1), ymm1) vmovups(mem(rax, r8, 2), ymm2) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -386,21 +386,21 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -408,12 +408,12 @@ void bli_sgemmsup_rd_haswell_asm_6x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) vmovss(mem(rax, r8, 2), xmm2) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -435,22 +435,22 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vfmadd231ps(ymm1, ymm3, ymm14) vfmadd231ps(ymm2, ymm3, ymm15) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 // ymm6 ymm9 ymm12 ymm15 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -466,7 +466,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -482,7 +482,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 vshufps(imm(0x44), xmm2, xmm0, xmm5) // xmm5[0] = sum(ymm5); xmm5[1] = sum(ymm8) // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - + vhaddps( ymm9, ymm6, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -500,8 +500,8 @@ void bli_sgemmsup_rd_haswell_asm_6x8 // xmm6[2] = sum(ymm12); xmm6[3] = sum(ymm15) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -509,73 +509,73 @@ void bli_sgemmsup_rd_haswell_asm_6x8 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load 
cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm6) vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + lea(mem(r12, rdi, 2), r12) // @@ -599,7 +599,7 @@ void bli_sgemmsup_rd_haswell_asm_6x8 label(.SRETURN) - + end_asm( : // output operands (none) @@ -644,8 +644,8 @@ void bli_sgemmsup_rd_haswell_asm_2x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -670,7 +670,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -688,7 +688,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -742,18 +742,18 @@ void bli_sgemmsup_rd_haswell_asm_2x8 prefetch(0, mem(rcx, rdi, 1, 1*4)) // prefetch c + 1*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -782,7 +782,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -808,7 +808,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -859,27 +859,27 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a prefetch(0, mem(rax, r8, 4, 0*8)) // prefetch rax + 4*cs_a @@ -888,7 +888,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vmovups(mem(rax ), ymm0) vmovups(mem(rax, r8, 1), ymm1) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -906,21 +906,21 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. 
- - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -928,11 +928,11 @@ void bli_sgemmsup_rd_haswell_asm_2x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. - + vmovss(mem(rax ), xmm0) vmovss(mem(rax, r8, 1), xmm1) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) vfmadd231ps(ymm1, ymm3, ymm5) @@ -950,21 +950,21 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vfmadd231ps(ymm0, ymm3, ymm13) vfmadd231ps(ymm1, ymm3, ymm14) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -980,7 +980,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 vshufps(imm(0x44), xmm2, xmm0, xmm4) // xmm4[0] = sum(ymm4); xmm4[1] = sum(ymm7) // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - + vhaddps( ymm8, ymm5, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -998,8 +998,8 @@ void bli_sgemmsup_rd_haswell_asm_2x8 // xmm5[2] = sum(ymm11); xmm5[3] = sum(ymm14) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1007,65 +1007,65 @@ void bli_sgemmsup_rd_haswell_asm_2x8 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vfmadd231ps(mem(rcx), xmm3, xmm5) vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) add(rdi, rcx) - + vmovups(xmm5, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1077,7 +1077,7 @@ void bli_sgemmsup_rd_haswell_asm_2x8 label(.SRETURN) - + end_asm( : // output operands (none) @@ -1121,8 +1121,8 @@ void bli_sgemmsup_rd_haswell_asm_1x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1147,7 +1147,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 begin_asm() //vzeroall() // zero all xmm/ymm registers. - + mov(var(a), r14) // load address of a. mov(var(rs_a), r8) // load rs_a //mov(var(cs_a), r9) // load cs_a @@ -1165,7 +1165,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 lea(mem(r11, r11, 2), r13) // r13 = 3*cs_b //lea(mem(r8, r8, 2), r10) // r10 = 3*rs_a - + mov(var(c), r12) // load address of c mov(var(rs_c), rdi) // load rs_c @@ -1214,18 +1214,18 @@ void bli_sgemmsup_rd_haswell_asm_1x8 prefetch(0, mem(rcx, 1*4)) // prefetch c + 0*rs_c #endif - - + + mov(var(k_iter32), rsi) // i = k_iter32; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKITER8) // if i == 0, jump to code that // contains the k_iter8 loop. 
- - + + label(.SLOOPKITER32) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 0 @@ -1248,7 +1248,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + // ---------------------------------- iteration 1 vmovups(mem(rax ), ymm0) @@ -1269,7 +1269,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 // ---------------------------------- iteration 2 - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif @@ -1309,34 +1309,34 @@ void bli_sgemmsup_rd_haswell_asm_1x8 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER32) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKITER8) - + mov(var(k_iter8), rsi) // i = k_iter8; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT1) // if i == 0, jump to code that // considers k_left1 loop. // else, we prepare to enter k_iter8 loop. - - + + label(.SLOOPKITER8) // EDGE LOOP (ymm) - + #if 0 prefetch(0, mem(rax, r10, 1, 0*8)) // prefetch rax + 3*cs_a #endif vmovups(mem(rax ), ymm0) add(imm(8*4), rax) // a += 8*cs_a = 8*4; - + vmovups(mem(rbx ), ymm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1350,21 +1350,21 @@ void bli_sgemmsup_rd_haswell_asm_1x8 add(imm(8*4), rbx) // b += 8*rs_b = 8*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKITER8) // iterate again if i != 0. - - - + + + label(.SCONSIDKLEFT1) - + mov(var(k_left1), rsi) // i = k_left1; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left1 loop. - - + + label(.SLOOPKLEFT1) // EDGE LOOP (scalar) @@ -1372,10 +1372,10 @@ void bli_sgemmsup_rd_haswell_asm_1x8 // using the xmm registers would zero out the // high bits of the destination registers, // which would destory intermediate results. 
- + vmovss(mem(rax ), xmm0) add(imm(1*4), rax) // a += 1*cs_a = 1*4; - + vmovss(mem(rbx ), xmm3) vfmadd231ps(ymm0, ymm3, ymm4) @@ -1389,21 +1389,21 @@ void bli_sgemmsup_rd_haswell_asm_1x8 add(imm(1*4), rbx) // b += 1*rs_b = 1*4; vfmadd231ps(ymm0, ymm3, ymm13) - + dec(rsi) // i -= 1; jne(.SLOOPKLEFT1) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - // ymm4 ymm7 ymm10 ymm13 + // ymm4 ymm7 ymm10 ymm13 // ymm5 ymm8 ymm11 ymm14 - + vhaddps( ymm7, ymm4, ymm0 ) vextractf128(imm(1), ymm0, xmm1 ) vhaddps( xmm1, xmm0, xmm0 ) @@ -1421,8 +1421,8 @@ void bli_sgemmsup_rd_haswell_asm_1x8 // xmm4[2] = sum(ymm10); xmm4[3] = sum(ymm13) - - + + //mov(var(rs_c), rdi) // load rs_c //lea(mem(, rdi, 4), rdi) // rs_c *= sizeof(float) @@ -1430,57 +1430,57 @@ void bli_sgemmsup_rd_haswell_asm_1x8 mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + //mov(var(cs_c), rsi) // load cs_c //lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(float) - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx), xmm3, xmm4) vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - + + + jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - - + + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx)) //add(rdi, rcx) - - - - + + + + label(.SDONE) - - + + add(imm(4), r15) // jj += 4; @@ -1492,7 +1492,7 @@ void bli_sgemmsup_rd_haswell_asm_1x8 label(.SRETURN) - + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c index 1bea78ee73..65d8664dae 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx12.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -186,25 +186,25 @@ void bli_sgemmsup_rv_haswell_asm_6x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -215,14 +215,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -231,7 +231,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -248,14 +248,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -263,14 +263,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -281,14 +281,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) 
vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -296,7 +296,7 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -314,14 +314,14 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -329,50 +329,50 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -380,22 +380,22 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) @@ -408,26 +408,26 @@ void bli_sgemmsup_rv_haswell_asm_6x12 vmulps(xmm0, xmm13, xmm13) vmulps(ymm0, ymm14, ymm14) vmulps(xmm0, xmm15, xmm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -436,60 +436,60 @@ void bli_sgemmsup_rv_haswell_asm_6x12 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm15) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -611,51 +611,51 @@ void bli_sgemmsup_rv_haswell_asm_6x12 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(xmm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(xmm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -743,12 +743,12 @@ void bli_sgemmsup_rv_haswell_asm_6x12 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -791,8 +791,8 @@ void bli_sgemmsup_rv_haswell_asm_5x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -813,9 +813,9 @@ void bli_sgemmsup_rv_haswell_asm_5x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -830,7 +830,7 @@ void bli_sgemmsup_rv_haswell_asm_5x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -883,25 +883,25 @@ void bli_sgemmsup_rv_haswell_asm_5x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -912,20 +912,20 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -942,26 +942,26 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -972,19 +972,19 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) 
- + // ---------------------------------- iteration 3 @@ -1002,82 +1002,82 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) @@ -1088,26 +1088,26 @@ void bli_sgemmsup_rv_haswell_asm_5x12 vmulps(xmm0, xmm11, xmm11) vmulps(ymm0, ymm12, ymm12) vmulps(xmm0, xmm13, xmm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1116,52 +1116,52 @@ void bli_sgemmsup_rv_haswell_asm_5x12 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm13) vmovups(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1302,46 +1302,46 @@ void bli_sgemmsup_rv_haswell_asm_5x12 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(xmm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1442,12 +1442,12 @@ void bli_sgemmsup_rv_haswell_asm_5x12 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1490,8 +1490,8 @@ void bli_sgemmsup_rv_haswell_asm_4x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1512,9 +1512,9 @@ void bli_sgemmsup_rv_haswell_asm_4x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1529,7 +1529,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1581,25 +1581,25 @@ void bli_sgemmsup_rv_haswell_asm_4x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1610,7 +1610,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1619,7 +1619,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1636,7 +1636,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1644,14 +1644,14 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1662,7 +1662,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1670,7 +1670,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1688,7 +1688,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1696,32 
+1696,32 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -1732,7 +1732,7 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1740,22 +1740,22 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) @@ -1764,40 +1764,40 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vmulps(xmm0, xmm9, xmm9) vmulps(ymm0, ymm10, ymm10) vmulps(xmm0, xmm11, xmm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -1828,10 +1828,10 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vfmadd231ps(mem(rcx, 1*32), xmm3, xmm11) vmovups(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -1907,19 +1907,19 @@ void bli_sgemmsup_rv_haswell_asm_4x12 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -1938,8 +1938,8 @@ void bli_sgemmsup_rv_haswell_asm_4x12 vmovups(ymm10, mem(rcx, 0*32)) vmovups(xmm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1999,12 +1999,12 @@ void bli_sgemmsup_rv_haswell_asm_4x12 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2047,8 +2047,8 @@ void bli_sgemmsup_rv_haswell_asm_3x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2069,9 +2069,9 @@ void bli_sgemmsup_rv_haswell_asm_3x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2086,7 +2086,7 @@ void bli_sgemmsup_rv_haswell_asm_3x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2137,25 +2137,25 @@ void bli_sgemmsup_rv_haswell_asm_3x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2166,13 +2166,13 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 1 #if 0 @@ -2189,19 +2189,19 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2212,12 +2212,12 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2235,37 +2235,37 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2276,67 +2276,67 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) vmulps(ymm0, ymm8, ymm8) vmulps(xmm0, xmm9, xmm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -2359,10 +2359,10 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vfmadd231ps(mem(rcx, 1*32), xmm3, xmm9) vmovups(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
- + label(.SCOLSTORED) @@ -2483,19 +2483,19 @@ void bli_sgemmsup_rv_haswell_asm_3x12 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2509,8 +2509,8 @@ void bli_sgemmsup_rv_haswell_asm_3x12 vmovups(ymm8, mem(rcx, 0*32)) vmovups(xmm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2587,12 +2587,12 @@ void bli_sgemmsup_rv_haswell_asm_3x12 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2635,8 +2635,8 @@ void bli_sgemmsup_rv_haswell_asm_2x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2657,9 +2657,9 @@ void bli_sgemmsup_rv_haswell_asm_2x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2674,7 +2674,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2724,25 +2724,25 @@ void bli_sgemmsup_rv_haswell_asm_2x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2755,7 +2755,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -2773,14 +2773,14 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2792,7 +2792,7 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -2811,32 +2811,32 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -2848,60 +2848,60 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) vmulps(ymm0, ymm6, ymm6) vmulps(xmm0, xmm7, xmm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -2916,10 +2916,10 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vfmadd231ps(mem(rcx, 1*32), xmm3, xmm7) vmovups(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2975,19 +2975,19 @@ void bli_sgemmsup_rv_haswell_asm_2x12 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2996,8 +2996,8 @@ void bli_sgemmsup_rv_haswell_asm_2x12 vmovups(ymm6, mem(rcx, 0*32)) vmovups(xmm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3033,12 +3033,12 @@ void bli_sgemmsup_rv_haswell_asm_2x12 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -3081,8 +3081,8 @@ void bli_sgemmsup_rv_haswell_asm_1x12 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3103,9 +3103,9 @@ void bli_sgemmsup_rv_haswell_asm_1x12 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -3120,7 +3120,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -3169,25 +3169,25 @@ void bli_sgemmsup_rv_haswell_asm_1x12 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -3197,7 +3197,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12 vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -3212,14 +3212,14 @@ void bli_sgemmsup_rv_haswell_asm_1x12 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -3228,7 +3228,7 @@ void bli_sgemmsup_rv_haswell_asm_1x12 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -3244,32 +3244,32 @@ void bli_sgemmsup_rv_haswell_asm_1x12 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), xmm1) add(r10, rbx) // b += rs_b; @@ -3278,68 +3278,68 @@ void bli_sgemmsup_rv_haswell_asm_1x12 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(xmm0, xmm5, xmm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), xmm3, xmm5) vmovups(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -3414,24 +3414,24 @@ void bli_sgemmsup_rv_haswell_asm_1x12 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(xmm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3480,12 +3480,12 @@ void bli_sgemmsup_rv_haswell_asm_1x12 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c index 6a08cecd43..26eec0c09d 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx16.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -108,8 +108,8 @@ void bli_sgemmsup_rv_haswell_asm_6x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 0 @@ -178,7 +178,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16 // Advance C and A pointers by the mrs and nrs we just // used, and decrement m_left. cij += mr_cur*rs_c0; ai += mr_cur*rs_a0; m_left -= mr_cur; - } + } } // Advance C and B pointers by the mrs and nrs we just used, and @@ -208,9 +208,9 @@ void bli_sgemmsup_rv_haswell_asm_6x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -225,7 +225,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -286,25 +286,25 @@ void bli_sgemmsup_rv_haswell_asm_6x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -315,14 +315,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -331,7 +331,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 1 #if 0 @@ -348,14 +348,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -363,14 +363,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -381,14 +381,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) 
vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -396,7 +396,7 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - + // ---------------------------------- iteration 3 @@ -414,14 +414,14 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -429,50 +429,50 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; @@ -480,22 +480,22 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vfmadd231ps(ymm1, ymm2, ymm13) vfmadd231ps(ymm0, ymm3, ymm14) vfmadd231ps(ymm1, ymm3, ymm15) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -508,26 +508,26 @@ void bli_sgemmsup_rv_haswell_asm_6x16 vmulps(ymm0, ymm13, ymm13) vmulps(ymm0, ymm14, ymm14) vmulps(ymm0, ymm15, ymm15) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -536,60 +536,60 @@ void bli_sgemmsup_rv_haswell_asm_6x16 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm15) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -735,51 +735,51 @@ void bli_sgemmsup_rv_haswell_asm_6x16 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) vmovups(ymm15, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -884,12 +884,12 @@ void bli_sgemmsup_rv_haswell_asm_6x16 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -932,8 +932,8 @@ void bli_sgemmsup_rv_haswell_asm_5x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -954,9 +954,9 @@ void bli_sgemmsup_rv_haswell_asm_5x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -971,7 +971,7 @@ void bli_sgemmsup_rv_haswell_asm_5x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1029,25 +1029,25 @@ void bli_sgemmsup_rv_haswell_asm_5x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. 
je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1058,20 +1058,20 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - + // ---------------------------------- iteration 1 #if 0 @@ -1088,26 +1088,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1118,19 +1118,19 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, 
ymm13) - + // ---------------------------------- iteration 3 @@ -1148,82 +1148,82 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm1, ymm2, ymm13) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -1234,26 +1234,26 @@ void bli_sgemmsup_rv_haswell_asm_5x16 vmulps(ymm0, ymm11, ymm11) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm13, ymm13) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1262,52 +1262,52 @@ void bli_sgemmsup_rv_haswell_asm_5x16 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm13) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1479,46 +1479,46 @@ void bli_sgemmsup_rv_haswell_asm_5x16 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) vmovups(ymm13, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1640,12 +1640,12 @@ void bli_sgemmsup_rv_haswell_asm_5x16 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1688,8 +1688,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1710,9 +1710,9 @@ void bli_sgemmsup_rv_haswell_asm_4x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1727,7 +1727,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1784,25 +1784,25 @@ void bli_sgemmsup_rv_haswell_asm_4x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1813,7 +1813,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1822,7 +1822,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 1 #if 0 @@ -1839,7 +1839,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1847,14 +1847,14 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1865,7 +1865,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1873,7 +1873,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - + // ---------------------------------- iteration 3 @@ -1891,7 +1891,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1899,32 
+1899,32 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -1935,7 +1935,7 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; @@ -1943,22 +1943,22 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(ymm1, ymm2, ymm9) vfmadd231ps(ymm0, ymm3, ymm10) vfmadd231ps(ymm1, ymm3, ymm11) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) @@ -1967,40 +1967,40 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vmulps(ymm0, ymm9, ymm9) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm11, ymm11) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -2031,10 +2031,10 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vfmadd231ps(mem(rcx, 1*32), ymm3, ymm11) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2122,19 +2122,19 @@ void bli_sgemmsup_rv_haswell_asm_4x16 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2153,8 +2153,8 @@ void bli_sgemmsup_rv_haswell_asm_4x16 vmovups(ymm10, mem(rcx, 0*32)) vmovups(ymm11, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2225,12 +2225,12 @@ void bli_sgemmsup_rv_haswell_asm_4x16 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2273,8 +2273,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2295,9 +2295,9 @@ void bli_sgemmsup_rv_haswell_asm_3x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2312,7 +2312,7 @@ void bli_sgemmsup_rv_haswell_asm_3x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2368,25 +2368,25 @@ void bli_sgemmsup_rv_haswell_asm_3x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2397,13 +2397,13 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 1 #if 0 @@ -2420,19 +2420,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2443,12 +2443,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - + // ---------------------------------- iteration 3 @@ -2466,37 +2466,37 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -2507,67 +2507,67 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm1, ymm2, ymm9) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm9, ymm9) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -2590,10 +2590,10 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vfmadd231ps(mem(rcx, 1*32), ymm3, ymm9) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
- + label(.SCOLSTORED) @@ -2745,19 +2745,19 @@ void bli_sgemmsup_rv_haswell_asm_3x16 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -2771,8 +2771,8 @@ void bli_sgemmsup_rv_haswell_asm_3x16 vmovups(ymm8, mem(rcx, 0*32)) vmovups(ymm9, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2866,12 +2866,12 @@ void bli_sgemmsup_rv_haswell_asm_3x16 //lea(mem(rcx, rsi, 4), rcx) - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2914,8 +2914,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2936,9 +2936,9 @@ void bli_sgemmsup_rv_haswell_asm_2x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2953,7 +2953,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -3008,25 +3008,25 @@ void bli_sgemmsup_rv_haswell_asm_2x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3039,7 +3039,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 1 #if 0 @@ -3057,14 +3057,14 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3076,7 +3076,7 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - + // ---------------------------------- iteration 3 @@ -3095,32 +3095,32 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3132,60 +3132,60 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(ymm1, ymm2, ymm5) vfmadd231ps(ymm0, ymm3, ymm6) vfmadd231ps(ymm1, ymm3, ymm7) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm7, ymm7) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) @@ -3200,10 +3200,10 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vfmadd231ps(mem(rcx, 1*32), ymm3, ymm7) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -3271,19 +3271,19 @@ void bli_sgemmsup_rv_haswell_asm_2x16 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) add(rdi, rcx) @@ -3292,8 +3292,8 @@ void bli_sgemmsup_rv_haswell_asm_2x16 vmovups(ymm6, mem(rcx, 0*32)) vmovups(ymm7, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3335,12 +3335,12 @@ void bli_sgemmsup_rv_haswell_asm_2x16 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -3383,8 +3383,8 @@ void bli_sgemmsup_rv_haswell_asm_1x16 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3405,9 +3405,9 @@ void bli_sgemmsup_rv_haswell_asm_1x16 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -3422,7 +3422,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -3476,25 +3476,25 @@ void bli_sgemmsup_rv_haswell_asm_1x16 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3504,7 +3504,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16 vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 1 #if 0 @@ -3519,14 +3519,14 @@ void bli_sgemmsup_rv_haswell_asm_1x16 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3535,7 +3535,7 @@ void bli_sgemmsup_rv_haswell_asm_1x16 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - + // ---------------------------------- iteration 3 @@ -3551,32 +3551,32 @@ void bli_sgemmsup_rv_haswell_asm_1x16 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) vmovups(mem(rbx, 1*32), ymm1) add(r10, rbx) // b += rs_b; @@ -3585,68 +3585,68 @@ void bli_sgemmsup_rv_haswell_asm_1x16 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm1, ymm2, ymm5) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm5, ymm5) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) vfmadd231ps(mem(rcx, 1*32), ymm3, ymm5) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -3740,24 +3740,24 @@ void bli_sgemmsup_rv_haswell_asm_1x16 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) vmovups(ymm5, mem(rcx, 1*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3817,12 +3817,12 @@ void bli_sgemmsup_rv_haswell_asm_1x16 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c index 6090f8b0b9..53a70d15f0 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx2.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -174,25 +174,25 @@ void bli_sgemmsup_rv_haswell_asm_6x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -200,19 +200,19 @@ void bli_sgemmsup_rv_haswell_asm_6x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -226,25 +226,25 @@ void bli_sgemmsup_rv_haswell_asm_6x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -252,18 +252,18 @@ void bli_sgemmsup_rv_haswell_asm_6x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -278,103 +278,103 @@ void bli_sgemmsup_rv_haswell_asm_6x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) 
vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -383,42 +383,42 @@ void bli_sgemmsup_rv_haswell_asm_6x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -452,45 +452,45 @@ void bli_sgemmsup_rv_haswell_asm_6x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -516,12 +516,12 @@ void bli_sgemmsup_rv_haswell_asm_6x2 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -564,8 +564,8 @@ void bli_sgemmsup_rv_haswell_asm_5x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -586,9 +586,9 @@ void bli_sgemmsup_rv_haswell_asm_5x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -603,7 +603,7 @@ void bli_sgemmsup_rv_haswell_asm_5x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -644,25 +644,25 @@ void bli_sgemmsup_rv_haswell_asm_5x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -670,17 +670,17 @@ void bli_sgemmsup_rv_haswell_asm_5x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 1 #if 0 @@ -694,23 +694,23 @@ void bli_sgemmsup_rv_haswell_asm_5x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -718,16 +718,16 @@ void bli_sgemmsup_rv_haswell_asm_5x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 3 @@ -742,98 +742,98 @@ void bli_sgemmsup_rv_haswell_asm_5x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) 
// a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. 
je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -842,37 +842,37 @@ void bli_sgemmsup_rv_haswell_asm_5x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovsd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -910,41 +910,41 @@ void bli_sgemmsup_rv_haswell_asm_5x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -973,12 +973,12 @@ void bli_sgemmsup_rv_haswell_asm_5x2 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1021,8 +1021,8 @@ void bli_sgemmsup_rv_haswell_asm_4x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1043,9 +1043,9 @@ void bli_sgemmsup_rv_haswell_asm_4x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1060,7 +1060,7 @@ void bli_sgemmsup_rv_haswell_asm_4x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1100,25 +1100,25 @@ void bli_sgemmsup_rv_haswell_asm_4x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1126,14 +1126,14 @@ void bli_sgemmsup_rv_haswell_asm_4x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1147,20 +1147,20 @@ void bli_sgemmsup_rv_haswell_asm_4x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1168,13 +1168,13 @@ void bli_sgemmsup_rv_haswell_asm_4x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) 
vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 3 @@ -1189,91 +1189,91 @@ void bli_sgemmsup_rv_haswell_asm_4x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1282,32 +1282,32 @@ void bli_sgemmsup_rv_haswell_asm_4x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovsd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1331,37 +1331,37 @@ void bli_sgemmsup_rv_haswell_asm_4x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1380,12 +1380,12 @@ void bli_sgemmsup_rv_haswell_asm_4x2 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1428,8 +1428,8 @@ void bli_sgemmsup_rv_haswell_asm_3x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1450,9 +1450,9 @@ void bli_sgemmsup_rv_haswell_asm_3x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1467,7 +1467,7 @@ void bli_sgemmsup_rv_haswell_asm_3x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1506,25 +1506,25 @@ void bli_sgemmsup_rv_haswell_asm_3x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1532,12 +1532,12 @@ void bli_sgemmsup_rv_haswell_asm_3x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 1 #if 0 @@ -1551,18 +1551,18 @@ void bli_sgemmsup_rv_haswell_asm_3x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1570,11 +1570,11 @@ void bli_sgemmsup_rv_haswell_asm_3x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 3 @@ -1589,86 +1589,86 @@ void bli_sgemmsup_rv_haswell_asm_3x2 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1677,27 +1677,27 @@ void bli_sgemmsup_rv_haswell_asm_3x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovsd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1730,33 +1730,33 @@ void bli_sgemmsup_rv_haswell_asm_3x2 jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovsd(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1780,12 +1780,12 @@ void bli_sgemmsup_rv_haswell_asm_3x2 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1828,8 +1828,8 @@ void bli_sgemmsup_rv_haswell_asm_2x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1850,9 +1850,9 @@ void bli_sgemmsup_rv_haswell_asm_2x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1867,7 +1867,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1905,25 +1905,25 @@ void bli_sgemmsup_rv_haswell_asm_2x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1933,7 +1933,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2 vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 1 #if 0 @@ -1948,14 +1948,14 @@ void bli_sgemmsup_rv_haswell_asm_2x2 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1964,7 +1964,7 @@ void bli_sgemmsup_rv_haswell_asm_2x2 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 3 @@ -1980,78 +1980,78 @@ void bli_sgemmsup_rv_haswell_asm_2x2 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2060,22 +2060,22 @@ void bli_sgemmsup_rv_haswell_asm_2x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovsd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2094,29 +2094,29 @@ void bli_sgemmsup_rv_haswell_asm_2x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovsd(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2130,12 +2130,12 @@ void bli_sgemmsup_rv_haswell_asm_2x2 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2178,8 +2178,8 @@ void bli_sgemmsup_rv_haswell_asm_1x2 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2200,9 +2200,9 @@ void bli_sgemmsup_rv_haswell_asm_1x2 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2217,7 +2217,7 @@ void bli_sgemmsup_rv_haswell_asm_1x2 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2254,25 +2254,25 @@ void bli_sgemmsup_rv_haswell_asm_1x2 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2280,7 +2280,7 @@ void bli_sgemmsup_rv_haswell_asm_1x2 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2293,21 +2293,21 @@ void bli_sgemmsup_rv_haswell_asm_1x2 vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 3 @@ -2321,75 +2321,75 @@ void bli_sgemmsup_rv_haswell_asm_1x2 vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovsd(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; //lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; //lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2398,17 +2398,17 @@ void bli_sgemmsup_rv_haswell_asm_1x2 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovsd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2431,25 +2431,25 @@ void bli_sgemmsup_rv_haswell_asm_1x2 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovsd(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2466,12 +2466,12 @@ void bli_sgemmsup_rv_haswell_asm_1x2 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c index 512fd60525..2d6165710c 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx4.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -176,25 +176,25 @@ void bli_sgemmsup_rv_haswell_asm_6x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -202,19 +202,19 @@ void bli_sgemmsup_rv_haswell_asm_6x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 1 #if 0 @@ -228,25 +228,25 @@ void bli_sgemmsup_rv_haswell_asm_6x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -254,18 +254,18 @@ void bli_sgemmsup_rv_haswell_asm_6x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - + // ---------------------------------- iteration 3 @@ -280,103 +280,103 @@ void bli_sgemmsup_rv_haswell_asm_6x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) 
vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) vbroadcastss(mem(rax, r15, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) vfmadd231ps(xmm0, xmm3, xmm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) vmulps(xmm0, xmm14, xmm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -385,42 +385,42 @@ void bli_sgemmsup_rv_haswell_asm_6x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -474,45 +474,45 @@ void bli_sgemmsup_rv_haswell_asm_6x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -552,12 +552,12 @@ void bli_sgemmsup_rv_haswell_asm_6x4 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -600,8 +600,8 @@ void bli_sgemmsup_rv_haswell_asm_5x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -622,9 +622,9 @@ void bli_sgemmsup_rv_haswell_asm_5x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -639,7 +639,7 @@ void bli_sgemmsup_rv_haswell_asm_5x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -682,25 +682,25 @@ void bli_sgemmsup_rv_haswell_asm_5x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -708,17 +708,17 @@ void bli_sgemmsup_rv_haswell_asm_5x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 1 #if 0 @@ -732,23 +732,23 @@ void bli_sgemmsup_rv_haswell_asm_5x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -756,16 +756,16 @@ void bli_sgemmsup_rv_haswell_asm_5x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - + // ---------------------------------- iteration 3 @@ -780,98 +780,98 @@ void bli_sgemmsup_rv_haswell_asm_5x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) 
// a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + vbroadcastss(mem(rax, r8, 4), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm12) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) vmulps(xmm0, xmm12, xmm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -880,37 +880,37 @@ void bli_sgemmsup_rv_haswell_asm_5x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -970,41 +970,41 @@ void bli_sgemmsup_rv_haswell_asm_5x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1049,12 +1049,12 @@ void bli_sgemmsup_rv_haswell_asm_5x4 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1097,8 +1097,8 @@ void bli_sgemmsup_rv_haswell_asm_4x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1119,9 +1119,9 @@ void bli_sgemmsup_rv_haswell_asm_4x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1136,7 +1136,7 @@ void bli_sgemmsup_rv_haswell_asm_4x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1178,25 +1178,25 @@ void bli_sgemmsup_rv_haswell_asm_4x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1204,14 +1204,14 @@ void bli_sgemmsup_rv_haswell_asm_4x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1225,20 +1225,20 @@ void bli_sgemmsup_rv_haswell_asm_4x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1246,13 +1246,13 @@ void bli_sgemmsup_rv_haswell_asm_4x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) 
vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - + // ---------------------------------- iteration 3 @@ -1267,91 +1267,91 @@ void bli_sgemmsup_rv_haswell_asm_4x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) vbroadcastss(mem(rax, r13, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) vfmadd231ps(xmm0, xmm3, xmm10) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) vmulps(xmm0, xmm10, xmm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1360,32 +1360,32 @@ void bli_sgemmsup_rv_haswell_asm_4x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1422,37 +1422,37 @@ void bli_sgemmsup_rv_haswell_asm_4x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1482,12 +1482,12 @@ void bli_sgemmsup_rv_haswell_asm_4x4 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1530,8 +1530,8 @@ void bli_sgemmsup_rv_haswell_asm_3x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1552,9 +1552,9 @@ void bli_sgemmsup_rv_haswell_asm_3x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1569,7 +1569,7 @@ void bli_sgemmsup_rv_haswell_asm_3x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1610,25 +1610,25 @@ void bli_sgemmsup_rv_haswell_asm_3x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1636,12 +1636,12 @@ void bli_sgemmsup_rv_haswell_asm_3x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 1 #if 0 @@ -1655,18 +1655,18 @@ void bli_sgemmsup_rv_haswell_asm_3x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -1674,11 +1674,11 @@ void bli_sgemmsup_rv_haswell_asm_3x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - + // ---------------------------------- iteration 3 @@ -1693,86 +1693,86 @@ void bli_sgemmsup_rv_haswell_asm_3x4 vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + vbroadcastss(mem(rax, r8, 2), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm8) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) vmulps(xmm0, xmm8, xmm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1781,27 +1781,27 @@ void bli_sgemmsup_rv_haswell_asm_3x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1851,33 +1851,33 @@ void bli_sgemmsup_rv_haswell_asm_3x4 jmp(.SDONE) // jump to end. 
- - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(xmm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1911,12 +1911,12 @@ void bli_sgemmsup_rv_haswell_asm_3x4 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1959,8 +1959,8 @@ void bli_sgemmsup_rv_haswell_asm_2x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1981,9 +1981,9 @@ void bli_sgemmsup_rv_haswell_asm_2x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1998,7 +1998,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2038,25 +2038,25 @@ void bli_sgemmsup_rv_haswell_asm_2x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2066,7 +2066,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4 vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 1 #if 0 @@ -2081,14 +2081,14 @@ void bli_sgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2097,7 +2097,7 @@ void bli_sgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - + // ---------------------------------- iteration 3 @@ -2113,78 +2113,78 @@ void bli_sgemmsup_rv_haswell_asm_2x4 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) vbroadcastss(mem(rax, r8, 1), xmm3) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) vfmadd231ps(xmm0, xmm3, xmm6) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha vmulps(xmm0, xmm6, xmm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2193,22 +2193,22 @@ void bli_sgemmsup_rv_haswell_asm_2x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2235,29 +2235,29 @@ void bli_sgemmsup_rv_haswell_asm_2x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(xmm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2276,12 +2276,12 @@ void bli_sgemmsup_rv_haswell_asm_2x4 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2324,8 +2324,8 @@ void bli_sgemmsup_rv_haswell_asm_1x4 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2346,9 +2346,9 @@ void bli_sgemmsup_rv_haswell_asm_1x4 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2363,7 +2363,7 @@ void bli_sgemmsup_rv_haswell_asm_1x4 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2402,25 +2402,25 @@ void bli_sgemmsup_rv_haswell_asm_1x4 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; @@ -2428,7 +2428,7 @@ void bli_sgemmsup_rv_haswell_asm_1x4 add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2441,21 +2441,21 @@ void bli_sgemmsup_rv_haswell_asm_1x4 vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - + // ---------------------------------- iteration 3 @@ -2469,75 +2469,75 @@ void bli_sgemmsup_rv_haswell_asm_1x4 vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), xmm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), xmm2) add(r9, rax) // a += cs_a; vfmadd231ps(xmm0, xmm2, xmm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), xmm0) // load alpha and duplicate vbroadcastss(mem(rbx), xmm3) // load beta and duplicate - + vmulps(xmm0, xmm4, xmm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(xmm0, xmm0, xmm0) // set xmm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2546,17 +2546,17 @@ void bli_sgemmsup_rv_haswell_asm_1x4 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2589,25 +2589,25 @@ void bli_sgemmsup_rv_haswell_asm_1x4 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(xmm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2630,12 +2630,12 @@ void bli_sgemmsup_rv_haswell_asm_1x4 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c index ac4e1ee0b0..f2cb1df425 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx6.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -179,25 +179,25 @@ void bli_sgemmsup_rv_haswell_asm_6x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -207,19 +207,19 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -235,25 +235,25 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -263,18 +263,18 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -291,105 +291,105 @@ void bli_sgemmsup_rv_haswell_asm_6x6 
vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -398,12 +398,12 @@ void bli_sgemmsup_rv_haswell_asm_6x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -413,8 +413,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -424,8 +424,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -435,8 +435,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) @@ -446,8 +446,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*4)) @@ -457,8 +457,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm14, xmm15) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm14) vmovups(xmm14, mem(rcx, 0*4)) @@ -468,8 +468,8 @@ void bli_sgemmsup_rv_haswell_asm_6x6 vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -534,57 +534,57 @@ void bli_sgemmsup_rv_haswell_asm_6x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(xmm13, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm14, xmm15) vmovups(xmm14, mem(rcx, 0*4)) vmovsd(xmm15, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -632,10 +632,10 @@ void bli_sgemmsup_rv_haswell_asm_6x6 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -678,8 +678,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -700,9 +700,9 @@ void bli_sgemmsup_rv_haswell_asm_5x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -717,7 +717,7 @@ void bli_sgemmsup_rv_haswell_asm_5x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -763,25 +763,25 @@ void bli_sgemmsup_rv_haswell_asm_5x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -791,17 +791,17 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 1 #if 0 @@ -817,23 +817,23 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, 
ymm0) @@ -843,16 +843,16 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 3 @@ -869,100 +869,100 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -971,12 +971,12 @@ void bli_sgemmsup_rv_haswell_asm_5x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -986,8 +986,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -997,8 +997,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -1008,8 +1008,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) @@ -1019,8 +1019,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm12) vmovups(xmm12, mem(rcx, 0*4)) @@ -1030,8 +1030,8 @@ void bli_sgemmsup_rv_haswell_asm_5x6 vmovsd(xmm13, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1106,51 +1106,51 @@ void bli_sgemmsup_rv_haswell_asm_5x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm12, xmm13) vmovups(xmm12, mem(rcx, 0*4)) vmovsd(xmm13, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1206,10 +1206,10 @@ void bli_sgemmsup_rv_haswell_asm_5x6 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1252,8 +1252,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1274,9 +1274,9 @@ void bli_sgemmsup_rv_haswell_asm_4x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1291,7 +1291,7 @@ void bli_sgemmsup_rv_haswell_asm_4x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1336,25 +1336,25 @@ void bli_sgemmsup_rv_haswell_asm_4x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -1364,14 +1364,14 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1387,20 +1387,20 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -1410,13 +1410,13 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) 
vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 3 @@ -1433,93 +1433,93 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -1528,12 +1528,12 @@ void bli_sgemmsup_rv_haswell_asm_4x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -1543,8 +1543,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -1554,8 +1554,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -1565,9 +1565,9 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - - vextractf128(imm(0x1), ymm10, xmm11) + + + vextractf128(imm(0x1), ymm10, xmm11) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm10) vmovups(xmm10, mem(rcx, 0*4)) @@ -1576,8 +1576,8 @@ void bli_sgemmsup_rv_haswell_asm_4x6 vmovsd(xmm11, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1620,45 +1620,45 @@ void bli_sgemmsup_rv_haswell_asm_4x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm10, xmm11) vmovups(xmm10, mem(rcx, 0*4)) vmovsd(xmm11, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1693,10 +1693,10 @@ void bli_sgemmsup_rv_haswell_asm_4x6 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1739,8 +1739,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1761,9 +1761,9 @@ void bli_sgemmsup_rv_haswell_asm_3x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1778,7 +1778,7 @@ void bli_sgemmsup_rv_haswell_asm_3x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1822,25 +1822,25 @@ void bli_sgemmsup_rv_haswell_asm_3x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -1850,12 +1850,12 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 1 #if 0 @@ -1871,18 +1871,18 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -1892,11 +1892,11 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 3 @@ -1913,88 +1913,88 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2003,12 +2003,12 @@ void bli_sgemmsup_rv_haswell_asm_3x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -2018,8 +2018,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -2029,8 +2029,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm8) vmovups(xmm8, mem(rcx, 0*4)) @@ -2040,8 +2040,8 @@ void bli_sgemmsup_rv_haswell_asm_3x6 vmovsd(xmm9, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2106,39 +2106,39 @@ void bli_sgemmsup_rv_haswell_asm_3x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm8, xmm9) vmovups(xmm8, mem(rcx, 0*4)) vmovsd(xmm9, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2181,10 +2181,10 @@ void bli_sgemmsup_rv_haswell_asm_3x6 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2227,8 +2227,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2249,9 +2249,9 @@ void bli_sgemmsup_rv_haswell_asm_2x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2266,7 +2266,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2309,25 +2309,25 @@ void bli_sgemmsup_rv_haswell_asm_2x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2339,7 +2339,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6 vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 1 #if 0 @@ -2356,14 +2356,14 @@ void bli_sgemmsup_rv_haswell_asm_2x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2374,7 +2374,7 @@ void bli_sgemmsup_rv_haswell_asm_2x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 3 @@ -2392,80 +2392,80 @@ void bli_sgemmsup_rv_haswell_asm_2x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2474,12 +2474,12 @@ void bli_sgemmsup_rv_haswell_asm_2x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -2489,8 +2489,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6 vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - - + + vextractf128(imm(0x1), ymm6, xmm7) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm6) vmovups(xmm6, mem(rcx, 0*4)) @@ -2500,8 +2500,8 @@ void bli_sgemmsup_rv_haswell_asm_2x6 vmovsd(xmm7, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2533,33 +2533,33 @@ void bli_sgemmsup_rv_haswell_asm_2x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) add(rdi, rcx) - + vextractf128(imm(0x1), ymm6, xmm7) vmovups(xmm6, mem(rcx, 0*4)) vmovsd(xmm7, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2581,10 +2581,10 @@ void bli_sgemmsup_rv_haswell_asm_2x6 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2627,8 +2627,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2649,9 +2649,9 @@ void bli_sgemmsup_rv_haswell_asm_1x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2666,7 +2666,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2708,25 +2708,25 @@ void bli_sgemmsup_rv_haswell_asm_1x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2736,7 +2736,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2751,14 +2751,14 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) @@ -2767,7 +2767,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 3 @@ -2783,77 +2783,77 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*4), xmm0) vmovsd(mem(rbx, 4*4), xmm1) vinsertf128(imm(0x1), xmm1, ymm0, ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; //lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -2862,12 +2862,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vextractf128(imm(0x1), ymm4, xmm5) vfmadd231ps(mem(rcx, 0*4), xmm3, xmm4) vmovups(xmm4, mem(rcx, 0*4)) @@ -2877,8 +2877,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vmovsd(xmm5, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2920,27 +2920,27 @@ void bli_sgemmsup_rv_haswell_asm_1x6 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vextractf128(imm(0x1), ymm4, xmm5) vmovups(xmm4, mem(rcx, 0*4)) vmovsd(xmm5, mem(rcx, 4*4)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2968,12 +2968,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -3018,8 +3018,8 @@ void bli_sgemmsup_rv_haswell_asm_1x6 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -3040,9 +3040,9 @@ void bli_sgemmsup_rv_haswell_asm_1x6 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -3057,7 +3057,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -3101,25 +3101,25 @@ void bli_sgemmsup_rv_haswell_asm_1x6 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -3127,7 +3127,7 @@ void bli_sgemmsup_rv_haswell_asm_1x6 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 1 #if 0 @@ -3140,21 +3140,21 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 3 @@ -3168,96 +3168,96 @@ void bli_sgemmsup_rv_haswell_asm_1x6 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -3308,23 +3308,23 @@ void bli_sgemmsup_rv_haswell_asm_1x6 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -3357,12 +3357,12 @@ void bli_sgemmsup_rv_haswell_asm_1x6 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c index 2b1a221ada..603ba7554d 100644 --- a/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c +++ b/kernels/haswell/3/sup/s6x16/bli_gemmsup_rv_haswell_asm_sMx8.c @@ -40,20 +40,20 @@ /* rrr: - -------- ------ -------- - -------- ------ -------- - -------- += ------ ... -------- - -------- ------ -------- - -------- ------ : - -------- ------ : + -------- ------ -------- + -------- ------ -------- + -------- += ------ ... -------- + -------- ------ -------- + -------- ------ : + -------- ------ : rcr: - -------- | | | | -------- - -------- | | | | -------- - -------- += | | | | ... -------- - -------- | | | | -------- - -------- | | | | : - -------- | | | | : + -------- | | | | -------- + -------- | | | | -------- + -------- += | | | | ... -------- + -------- | | | | -------- + -------- | | | | : + -------- | | | | : Assumptions: - B is row-stored; @@ -69,12 +69,12 @@ cost of the in-register transpose). crr: - | | | | | | | | ------ -------- - | | | | | | | | ------ -------- - | | | | | | | | += ------ ... -------- - | | | | | | | | ------ -------- - | | | | | | | | ------ : - | | | | | | | | ------ : + | | | | | | | | ------ -------- + | | | | | | | | ------ -------- + | | | | | | | | += ------ ... -------- + | | | | | | | | ------ -------- + | | | | | | | | ------ : + | | | | | | | | ------ : */ // Prototype reference microkernels. 
@@ -93,8 +93,8 @@ void bli_sgemmsup_rv_haswell_asm_6x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -115,9 +115,9 @@ void bli_sgemmsup_rv_haswell_asm_6x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -132,7 +132,7 @@ void bli_sgemmsup_rv_haswell_asm_6x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -181,25 +181,25 @@ void bli_sgemmsup_rv_haswell_asm_6x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -207,19 +207,19 @@ void bli_sgemmsup_rv_haswell_asm_6x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 1 #if 0 @@ -233,25 +233,25 @@ void bli_sgemmsup_rv_haswell_asm_6x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 5*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -259,18 +259,18 @@ void bli_sgemmsup_rv_haswell_asm_6x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - + // ---------------------------------- iteration 3 @@ -285,103 +285,103 @@ void bli_sgemmsup_rv_haswell_asm_6x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) 
vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) vbroadcastss(mem(rax, r15, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) vfmadd231ps(ymm0, ymm3, ymm14) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) vmulps(ymm0, ymm14, ymm14) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -390,42 +390,42 @@ void bli_sgemmsup_rv_haswell_asm_6x8 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm14) vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -502,45 +502,45 @@ void bli_sgemmsup_rv_haswell_asm_6x8 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm14, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -596,12 +596,12 @@ void bli_sgemmsup_rv_haswell_asm_6x8 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -644,8 +644,8 @@ void bli_sgemmsup_rv_haswell_asm_5x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -666,9 +666,9 @@ void bli_sgemmsup_rv_haswell_asm_5x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -683,7 +683,7 @@ void bli_sgemmsup_rv_haswell_asm_5x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -731,25 +731,25 @@ void bli_sgemmsup_rv_haswell_asm_5x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - - + + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -757,17 +757,17 @@ void bli_sgemmsup_rv_haswell_asm_5x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 1 #if 0 @@ -781,23 +781,23 @@ void bli_sgemmsup_rv_haswell_asm_5x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -805,16 +805,16 @@ void bli_sgemmsup_rv_haswell_asm_5x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - + // ---------------------------------- iteration 3 @@ -829,98 +829,98 @@ void bli_sgemmsup_rv_haswell_asm_5x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) 
// a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; - + vbroadcastss(mem(rax ), ymm2) vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + vbroadcastss(mem(rax, r8, 4), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm12) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - - + + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) vmulps(ymm0, ymm12, ymm12) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case @@ -929,37 +929,37 @@ void bli_sgemmsup_rv_haswell_asm_5x8 cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. 
jz(.SCOLSTORED) // jump to column storage case - - + + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm12) vmovups(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1049,41 +1049,41 @@ void bli_sgemmsup_rv_haswell_asm_5x8 jmp(.SDONE) // jump to end. - - - - + + + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) - + vmovups(ymm6, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm8, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm10, mem(rcx, 0*32)) add(rdi, rcx) - - + + vmovups(ymm12, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -1147,12 +1147,12 @@ void bli_sgemmsup_rv_haswell_asm_5x8 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1195,8 +1195,8 @@ void bli_sgemmsup_rv_haswell_asm_4x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1217,9 +1217,9 @@ void bli_sgemmsup_rv_haswell_asm_4x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. 
mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1234,7 +1234,7 @@ void bli_sgemmsup_rv_haswell_asm_4x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1281,25 +1281,25 @@ void bli_sgemmsup_rv_haswell_asm_4x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. - - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1307,14 +1307,14 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 1 #if 0 @@ -1328,20 +1328,20 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1349,13 +1349,13 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) 
vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - + // ---------------------------------- iteration 3 @@ -1370,38 +1370,38 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1409,66 +1409,66 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) vbroadcastss(mem(rax, r13, 1), ymm3) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) vfmadd231ps(ymm0, ymm3, ymm10) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) vmulps(ymm0, ymm10, ymm10) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 4), rdx) // load address of c + 4*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1487,10 +1487,10 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vfmadd231ps(mem(rcx, 0*32), ymm3, ymm10) vmovups(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -1538,19 +1538,19 @@ void bli_sgemmsup_rv_haswell_asm_4x8 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1565,8 +1565,8 @@ void bli_sgemmsup_rv_haswell_asm_4x8 vmovups(ymm10, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -1604,12 +1604,12 @@ void bli_sgemmsup_rv_haswell_asm_4x8 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -1652,8 +1652,8 @@ void bli_sgemmsup_rv_haswell_asm_3x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1674,9 +1674,9 @@ void bli_sgemmsup_rv_haswell_asm_3x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -1691,7 +1691,7 @@ void bli_sgemmsup_rv_haswell_asm_3x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -1737,25 +1737,25 @@ void bli_sgemmsup_rv_haswell_asm_3x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1763,12 +1763,12 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 1 #if 0 @@ -1782,18 +1782,18 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1801,11 +1801,11 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - + // ---------------------------------- iteration 3 @@ -1820,36 +1820,36 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -1857,63 +1857,63 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vbroadcastss(mem(rax, r8, 1), ymm3) vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + vbroadcastss(mem(rax, r8, 2), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm8) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. - - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) vmulps(ymm0, ymm8, ymm8) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -1927,10 +1927,10 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vfmadd231ps(mem(rcx, 0*32), ymm3, ymm8) vmovups(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2010,19 +2010,19 @@ void bli_sgemmsup_rv_haswell_asm_3x8 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. 
jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -2033,8 +2033,8 @@ void bli_sgemmsup_rv_haswell_asm_3x8 vmovups(ymm8, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. @@ -2084,12 +2084,12 @@ void bli_sgemmsup_rv_haswell_asm_3x8 //lea(mem(rdx, rsi, 8), rdx) // rdx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2132,8 +2132,8 @@ void bli_sgemmsup_rv_haswell_asm_2x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2154,9 +2154,9 @@ void bli_sgemmsup_rv_haswell_asm_2x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2171,7 +2171,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2216,25 +2216,25 @@ void bli_sgemmsup_rv_haswell_asm_2x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2244,7 +2244,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8 vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 1 #if 0 @@ -2259,14 +2259,14 @@ void bli_sgemmsup_rv_haswell_asm_2x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2275,7 +2275,7 @@ void bli_sgemmsup_rv_haswell_asm_2x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - + // ---------------------------------- iteration 3 @@ -2291,32 +2291,32 @@ void bli_sgemmsup_rv_haswell_asm_2x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2325,58 +2325,58 @@ void bli_sgemmsup_rv_haswell_asm_2x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) vfmadd231ps(ymm0, ymm3, ymm6) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha vmulps(ymm0, ymm6, ymm6) - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) @@ -2385,10 +2385,10 @@ void bli_sgemmsup_rv_haswell_asm_2x8 vfmadd231ps(mem(rcx, 0*32), ymm3, ymm6) vmovups(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2426,27 +2426,27 @@ void bli_sgemmsup_rv_haswell_asm_2x8 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) add(rdi, rcx) vmovups(ymm6, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2470,12 +2470,12 @@ void bli_sgemmsup_rv_haswell_asm_2x8 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) @@ -2518,8 +2518,8 @@ void bli_sgemmsup_rv_haswell_asm_1x8 float* restrict b, inc_t rs_b0, inc_t cs_b0, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -2540,9 +2540,9 @@ void bli_sgemmsup_rv_haswell_asm_1x8 // ------------------------------------------------------------------------- begin_asm() - + vzeroall() // zero all xmm/ymm registers. - + mov(var(a), rax) // load address of a. mov(var(rs_a), r8) // load rs_a mov(var(cs_a), r9) // load cs_a @@ -2557,7 +2557,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8 //mov(var(cs_b), r11) // load cs_b lea(mem(, r10, 4), r10) // rs_b *= sizeof(float) //lea(mem(, r11, 4), r11) // cs_b *= sizeof(float) - + // NOTE: We cannot pre-load elements of a or b // because it could eventually, in the last // unrolled iter or the cleanup loop, result @@ -2601,25 +2601,25 @@ void bli_sgemmsup_rv_haswell_asm_1x8 lea(mem(rax, r9, 8), rdx) // lea(mem(rdx, r9, 8), rdx) // rdx = a + 16*cs_a; #endif - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.SCONSIDKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.SLOOPKITER) // MAIN LOOP - - + + // ---------------------------------- iteration 0 #if 1 prefetch(0, mem(rdx, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; @@ -2627,7 +2627,7 @@ void bli_sgemmsup_rv_haswell_asm_1x8 add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 1 #if 0 @@ -2640,21 +2640,21 @@ void bli_sgemmsup_rv_haswell_asm_1x8 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + // ---------------------------------- iteration 2 #if 1 prefetch(0, mem(rdx, r9, 2, 4*8)) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - + // ---------------------------------- iteration 3 @@ -2668,96 +2668,96 @@ void bli_sgemmsup_rv_haswell_asm_1x8 vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - - + + + dec(rsi) // i -= 1; jne(.SLOOPKITER) // iterate again if i != 0. - - - - - - + + + + + + label(.SCONSIDKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.SPOSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. - - + + label(.SLOOPKLEFT) // EDGE LOOP #if 0 prefetch(0, mem(rdx, 5*8)) add(r9, rdx) #endif - + vmovups(mem(rbx, 0*32), ymm0) add(r10, rbx) // b += rs_b; vbroadcastss(mem(rax ), ymm2) add(r9, rax) // a += cs_a; vfmadd231ps(ymm0, ymm2, ymm4) - - + + dec(rsi) // i -= 1; jne(.SLOOPKLEFT) // iterate again if i != 0. 
- - - + + + label(.SPOSTACCUM) - + mov(var(alpha), rax) // load address of alpha mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rax), ymm0) // load alpha and duplicate vbroadcastss(mem(rbx), ymm3) // load beta and duplicate - + vmulps(ymm0, ymm4, ymm4) // scale by alpha - - - - - - + + + + + + mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 4), rsi) // rsi = cs_c * sizeof(float) - + //lea(mem(rcx, rsi, 4), rdx) // load address of c + 4*cs_c; //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*rs_c; lea(mem(rsi, rsi, 2), rax) // rax = 3*cs_c; lea(mem(rsi, rsi, 4), rbx) // rbx = 5*cs_c; lea(mem(rax, rsi, 4), rbp) // rbp = 7*cs_c; - - - + + + // now avoid loading C if beta == 0 - + vxorps(ymm0, ymm0, ymm0) // set ymm0 to zero. vucomiss(xmm0, xmm3) // set ZF if beta == 0. je(.SBETAZERO) // if ZF = 1, jump to beta == 0 case - + cmp(imm(4), rdi) // set ZF if (4*rs_c) == 4. jz(.SCOLSTORED) // jump to column storage case - + label(.SROWSTORED) - - + + vfmadd231ps(mem(rcx, 0*32), ymm3, ymm4) vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. - + label(.SCOLSTORED) @@ -2808,23 +2808,23 @@ void bli_sgemmsup_rv_haswell_asm_1x8 jmp(.SDONE) // jump to end. - - + + label(.SBETAZERO) - + cmp(imm(4), rdi) // set ZF if (8*rs_c) == 8. jz(.SCOLSTORBZ) // jump to column storage case - + label(.SROWSTORBZ) - - + + vmovups(ymm4, mem(rcx, 0*32)) //add(rdi, rcx) - - + + jmp(.SDONE) // jump to end. 
@@ -2857,12 +2857,12 @@ void bli_sgemmsup_rv_haswell_asm_1x8 //lea(mem(rcx, rsi, 8), rcx) // rcx += 8*cs_c - - - + + + label(.SDONE) - - + + end_asm( : // output operands (none) diff --git a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c index f20e43f7cc..a53b763da6 100644 --- a/kernels/knc/3/bli_dgemm_knc_asm_30x8.c +++ b/kernels/knc/3/bli_dgemm_knc_asm_30x8.c @@ -264,8 +264,8 @@ void bli_dgemm_knc_asm_30x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { double * a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c index 18a8e5e2ee..7374abfe02 100644 --- a/kernels/knc/3/bli_sgemm_knc_asm_30x16.c +++ b/kernels/knc/3/bli_sgemm_knc_asm_30x16.c @@ -264,8 +264,8 @@ void bli_sgemm_knc_asm_30x16 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { float * a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c index 91fe1989f0..2464ecf0ae 100644 --- a/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c +++ b/kernels/knl/1m/bli_dpackm_knl_asm_24x8.c @@ -116,7 +116,7 @@ void bli_dpackm_knl_asm_8xk double* restrict kappa_, double* restrict a_, inc_t inca_, inc_t lda_, double* restrict p_, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int32_t* offsetPtr = &offsets[0]; @@ -367,7 +367,7 @@ void bli_dpackm_knl_asm_24xk double* restrict kappa_, double* restrict a_, inc_t inca_, inc_t lda_, double* restrict p_, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int32_t* offsetPtr = &offsets[0]; diff --git a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c index 8c4bdfe6be..4326a00dde 100644 --- 
a/kernels/knl/1m/bli_spackm_knl_asm_24x16.c +++ b/kernels/knl/1m/bli_spackm_knl_asm_24x16.c @@ -118,7 +118,7 @@ void bli_spackm_knl_asm_16xk float* restrict kappa_, float* restrict a_, inc_t inca_, inc_t lda_, float* restrict p_, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int32_t* offsetPtr = &offsets[0]; @@ -385,7 +385,7 @@ void bli_spackm_knl_asm_24xk float* restrict kappa_, float* restrict a_, inc_t inca_, inc_t lda_, float* restrict p_, inc_t ldp_, - cntx_t* restrict cntx + cntx_t* cntx ) { const int32_t* offsetPtr = &offsets[0]; diff --git a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c index a7f860ae02..11a480997a 100644 --- a/kernels/knl/3/bli_dgemm_knl_asm_24x8.c +++ b/kernels/knl/3/bli_dgemm_knl_asm_24x8.c @@ -193,8 +193,8 @@ void bli_dgemm_knl_asm_24x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c_, inc_t cs_c_, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { (void)data; diff --git a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c index 64feba09f1..cbef0cb82e 100644 --- a/kernels/knl/3/bli_sgemm_knl_asm_24x16.c +++ b/kernels/knl/3/bli_sgemm_knl_asm_24x16.c @@ -190,8 +190,8 @@ void bli_sgemm_knl_asm_24x16 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c_, inc_t cs_c_, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { (void)data; diff --git a/kernels/penryn/1/bli_axpyv_penryn_int.c b/kernels/penryn/1/bli_axpyv_penryn_int.c index 2dd7c73244..c329912b4b 100644 --- a/kernels/penryn/1/bli_axpyv_penryn_int.c +++ b/kernels/penryn/1/bli_axpyv_penryn_int.c @@ -50,7 +50,7 @@ void bli_daxpyv_penryn_int double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1/bli_dotv_penryn_int.c 
b/kernels/penryn/1/bli_dotv_penryn_int.c index 2e88a577a9..6d63a9cf07 100644 --- a/kernels/penryn/1/bli_dotv_penryn_int.c +++ b/kernels/penryn/1/bli_dotv_penryn_int.c @@ -51,7 +51,7 @@ void bli_ddotv_penryn_int double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict x_cast = x; diff --git a/kernels/penryn/1f/bli_axpy2v_penryn_int.c b/kernels/penryn/1f/bli_axpy2v_penryn_int.c index c809ebb41c..350a0af5f1 100644 --- a/kernels/penryn/1f/bli_axpy2v_penryn_int.c +++ b/kernels/penryn/1f/bli_axpy2v_penryn_int.c @@ -53,7 +53,7 @@ void bli_daxpy2v_penryn_int double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1f/bli_axpyf_penryn_int.c b/kernels/penryn/1f/bli_axpyf_penryn_int.c index ce4c4f786f..f52c05d67f 100644 --- a/kernels/penryn/1f/bli_axpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_axpyf_penryn_int.c @@ -53,7 +53,7 @@ void bli_daxpyf_penryn_int double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c index 6b9dab7739..244e3f11c3 100644 --- a/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c +++ b/kernels/penryn/1f/bli_dotaxpyv_penryn_int.c @@ -54,7 +54,7 @@ void bli_ddotaxpyv_penryn_int double* restrict y, inc_t incy, double* restrict rho, double* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c index fe102d427b..3ff80319ad 100644 --- a/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxaxpyf_penryn_int.c @@ -58,7 +58,7 @@ void 
bli_ddotxaxpyf_penryn_int double* restrict beta, double* restrict y, inc_t incy, double* restrict z, inc_t incz, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/1f/bli_dotxf_penryn_int.c b/kernels/penryn/1f/bli_dotxf_penryn_int.c index ac9887d59e..e8775bd0cd 100644 --- a/kernels/penryn/1f/bli_dotxf_penryn_int.c +++ b/kernels/penryn/1f/bli_dotxf_penryn_int.c @@ -54,7 +54,7 @@ void bli_ddotxf_penryn_int double* restrict x, inc_t incx, double* restrict beta, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { double* restrict alpha_cast = alpha; diff --git a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c index a3e39c3ac1..8a3ec077fd 100644 --- a/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemm_penryn_asm_d4x4.c @@ -47,8 +47,8 @@ void bli_sgemm_penryn_asm_8x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -522,8 +522,8 @@ void bli_dgemm_penryn_asm_4x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c index 7bef618faf..aa8dcf858b 100644 --- a/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_l_penryn_asm_d4x4.c @@ -47,8 +47,8 @@ void bli_sgemmtrsm_l_penryn_asm_8x4 float* restrict b01, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -65,8 +65,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 double* restrict b01, double* 
restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* b_next = bli_auxinfo_next_b( data ); @@ -81,30 +81,30 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false ); begin_asm() - + mov(var(a10), rax) // load address of a10. mov(var(b01), rbx) // load address of b01. //mov(var(b_next), r9) // load address of b_next. - + sub(imm(0-8*16), rax) // increment pointers to allow byte sub(imm(0-8*16), rbx) // offsets in the unrolled iterations. - + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) - + //mov(var(c11), rcx) // load address of c11 //mov(var(rs_c), rdi) // load cs_c //lea(mem(, rdi, 8), rdi) // cs_c *= sizeof(double) //lea(mem(rcx, rdi, 2), rdx) // load address of c + 2*cs_c; - + //prefetch(2, mem(r9, 0*8)) // prefetch b_next - + xorpd(xmm3, xmm3) xorpd(xmm4, xmm4) xorpd(xmm5, xmm5) xorpd(xmm6, xmm6) - + //prefetch(2, mem(rcx, 3*8)) // prefetch c + 0*cs_c xorpd(xmm8, xmm8) movaps(xmm8, xmm9) @@ -117,20 +117,20 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 //prefetch(2, mem(rdx, rdi, 1, 3*8)) // prefetch c + 3*cs_c movaps(xmm8, xmm14) movaps(xmm8, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CONSIDERKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.LOOPKITER) // MAIN LOOP - + //prefetch(0, mem(rax, 1264)) prefetch(0, mem(rax, (4*35+1)*8)) - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -138,13 +138,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -152,7 +152,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -160,8 +160,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + addpd(xmm3, xmm11) // iteration 1 movaps(mem(rbx, -5*16), xmm3) addpd(xmm4, xmm15) @@ -169,13 +169,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -4*16), xmm2) addpd(xmm4, xmm13) @@ -183,7 +183,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -191,10 +191,10 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -4*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -3*16), xmm1) - + //prefetch(0, mem(rax, 1328)) prefetch(0, mem(rax, (4*37+1)*8)) - + addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3) addpd(xmm4, xmm15) @@ -202,13 +202,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -2*16), xmm2) addpd(xmm4, xmm13) @@ -216,7 +216,7 @@ void 
bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -224,8 +224,8 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -2*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -1*16), xmm1) - - + + addpd(xmm3, xmm11) // iteration 3 movaps(mem(rbx, -1*16), xmm3) addpd(xmm4, xmm15) @@ -233,17 +233,17 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + sub(imm(0-4*4*8), rax) // a += 4*4 (unroll x mr) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + //sub(imm(-4*4*8), r9) // b_next += 4*4 (unroll x nr) - + addpd(xmm2, xmm9) movaps(mem(rbx, 0*16), xmm2) addpd(xmm4, xmm13) @@ -251,9 +251,9 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + sub(imm(0-4*4*8), rbx) // b += 4*4 (unroll x nr) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -261,26 +261,26 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -8*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -7*16), xmm1) - + //prefetch(2, mem(r9, 0*8)) // prefetch b_next[0] //prefetch(2, mem(r9, 8*8)) // prefetch b_next[8] - - + + dec(rsi) // i -= 1; jne(.LOOPKITER) // iterate again if i != 0. - - - + + + label(.CONSIDERKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.POSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.LOOPKLEFT) // EDGE LOOP - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -288,13 +288,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -302,7 +302,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -310,28 +310,28 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + sub(imm(0-4*1*8), rax) // a += 4 (1 x mr) sub(imm(0-4*1*8), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.LOOPKLEFT) // iterate again if i != 0. - - - + + + label(.POSTACCUM) - + addpd(xmm3, xmm11) addpd(xmm4, xmm15) addpd(xmm5, xmm10) addpd(xmm6, xmm14) - - - + + + mov(var(b11), rbx) // load address of b11. 
- + // xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 // ab10 ) ab11 ) ab12 ) ab13 ) @@ -343,31 +343,31 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(xmm8, xmm1) unpcklpd(xmm8, xmm0) unpckhpd(xmm9, xmm1) - + movaps(xmm11, xmm4) movaps(xmm10, xmm5) unpcklpd(xmm10, xmm4) unpckhpd(xmm11, xmm5) - + movaps(xmm13, xmm2) movaps(xmm12, xmm3) unpcklpd(xmm12, xmm2) unpckhpd(xmm13, xmm3) - + movaps(xmm15, xmm6) movaps(xmm14, xmm7) unpcklpd(xmm14, xmm6) unpckhpd(xmm15, xmm7) - + // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) - + mov(var(alpha), rax) // load address of alpha movddup(mem(rax), xmm15) // load alpha and duplicate - - movaps(mem(rbx, 0*16), xmm8) + + movaps(mem(rbx, 0*16), xmm8) movaps(mem(rbx, 1*16), xmm12) mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 ) mulpd(xmm15, xmm12) // xmm12 = alpha * ( beta02 beta03 ) @@ -382,13 +382,13 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movaps(mem(rbx, 6*16), xmm11) mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 ) mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 ) - + // (Now scaled by alpha:) // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) - + subpd(xmm0, xmm8) // xmm8 -= xmm0 subpd(xmm1, xmm9) // xmm9 -= xmm1 subpd(xmm2, xmm10) // xmm10 -= xmm2 @@ -397,28 +397,28 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 subpd(xmm5, xmm13) // xmm13 -= xmm5 subpd(xmm6, xmm14) // xmm14 -= xmm6 subpd(xmm7, xmm15) // xmm15 -= xmm7 - - - + + + label(.TRSM) - - + + mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 - + mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) - + lea(mem(rcx, rdi, 2), rdx) 
// c11_2 = c11 + 2*cs_c - - - + + + // iteration 0 - + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); @@ -426,7 +426,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 divpd(xmm0, xmm8) // xmm8 /= alpha00; divpd(xmm0, xmm12) // xmm12 /= alpha00; #endif - + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] @@ -435,14 +435,14 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 1 - + movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10 movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) - + movaps(xmm0, xmm4) // xmm4 = xmm0 mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 ) @@ -455,7 +455,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 divpd(xmm1, xmm9) // xmm9 /= alpha11; divpd(xmm1, xmm13) // xmm13 /= alpha11; #endif - + movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] @@ -464,15 +464,15 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 2 - + movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20 movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21 movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) - + movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 ) @@ -490,7 +490,7 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 divpd(xmm2, xmm10) // xmm10 /= alpha22; divpd(xmm2, 
xmm14) // xmm14 /= alpha22; #endif - + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] @@ -499,16 +499,16 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 3 - + movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30 movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31 movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32 movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) - + movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 @@ -531,16 +531,16 @@ void bli_dgemmtrsm_l_penryn_asm_4x4 divpd(xmm3, xmm11) // xmm11 /= alpha33; divpd(xmm3, xmm15) // xmm15 /= alpha33; #endif - + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( gamma31 ) = xmm11[1] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] - - - + + + end_asm( : // output operands (none) diff --git a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c index add12ea244..2efc037cc1 100644 --- a/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_gemmtrsm_u_penryn_asm_d4x4.c @@ -47,8 +47,8 @@ void bli_sgemmtrsm_u_penryn_asm_8x4 float* restrict b21, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -65,8 +65,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 double* restrict b21, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* 
restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* b_next = bli_auxinfo_next_b( data ); @@ -81,23 +81,23 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 GEMMTRSM_UKR_SETUP_CT( d, 4, 4, false ); begin_asm() - + mov(var(a12), rax) // load address of a12. mov(var(b21), rbx) // load address of b21. //mov(var(b_next), r9) // load address of b_next. - + add(imm(8*16), rax) // increment pointers to allow byte add(imm(8*16), rbx) // offsets in the unrolled iterations. - + movaps(mem(rax, -8*16), xmm0) // initialize loop by pre-loading elements movaps(mem(rax, -7*16), xmm1) // of a and b. movaps(mem(rbx, -8*16), xmm2) - + xorpd(xmm3, xmm3) xorpd(xmm4, xmm4) xorpd(xmm5, xmm5) xorpd(xmm6, xmm6) - + xorpd(xmm8, xmm8) movaps(xmm8, xmm9) movaps(xmm8, xmm10) @@ -106,19 +106,19 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(xmm8, xmm13) movaps(xmm8, xmm14) movaps(xmm8, xmm15) - - - + + + mov(var(k_iter), rsi) // i = k_iter; test(rsi, rsi) // check i via logical AND. je(.CONSIDERKLEFT) // if i == 0, jump to code that // contains the k_left loop. 
- - + + label(.LOOPKITER) // MAIN LOOP - + prefetch(0, mem(rax, 1264)) - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -126,13 +126,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -140,7 +140,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -148,8 +148,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + addpd(xmm3, xmm11) // iteration 1 movaps(mem(rbx, -5*16), xmm3) addpd(xmm4, xmm15) @@ -157,13 +157,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -4*16), xmm2) addpd(xmm4, xmm13) @@ -171,7 +171,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -179,9 +179,9 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -4*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -3*16), xmm1) - + prefetch(0, mem(rax, 1328)) - + addpd(xmm3, xmm11) // iteration 2 movaps(mem(rbx, -3*16), xmm3) addpd(xmm4, xmm15) @@ -189,13 +189,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -2*16), xmm2) addpd(xmm4, xmm13) @@ -203,7 +203,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - 
+ addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -211,8 +211,8 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -2*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -1*16), xmm1) - - + + addpd(xmm3, xmm11) // iteration 3 movaps(mem(rbx, -1*16), xmm3) addpd(xmm4, xmm15) @@ -220,15 +220,15 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + add(imm(4*4*8), rax) // a += 4*4 (unroll x mr) - + addpd(xmm2, xmm9) movaps(mem(rbx, 0*16), xmm2) addpd(xmm4, xmm13) @@ -236,9 +236,9 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + add(imm(4*4*8), rbx) // b += 4*4 (unroll x nr) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -246,24 +246,24 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -8*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -7*16), xmm1) - - - + + + dec(rsi) // i -= 1; jne(.LOOPKITER) // iterate again if i != 0. - - - + + + label(.CONSIDERKLEFT) - + mov(var(k_left), rsi) // i = k_left; test(rsi, rsi) // check i via logical AND. je(.POSTACCUM) // if i == 0, we're done; jump to end. // else, we prepare to enter k_left loop. 
- - + + label(.LOOPKLEFT) // EDGE LOOP - + addpd(xmm3, xmm11) // iteration 0 movaps(mem(rbx, -7*16), xmm3) addpd(xmm4, xmm15) @@ -271,13 +271,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm2, xmm7) mulpd(xmm0, xmm2) mulpd(xmm1, xmm4) - + addpd(xmm5, xmm10) addpd(xmm6, xmm14) movaps(xmm7, xmm6) mulpd(xmm0, xmm7) mulpd(xmm1, xmm6) - + addpd(xmm2, xmm9) movaps(mem(rbx, -6*16), xmm2) addpd(xmm4, xmm13) @@ -285,7 +285,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 pshufd(imm(0x4e), xmm3, xmm5) mulpd(xmm0, xmm3) mulpd(xmm1, xmm4) - + addpd(xmm7, xmm8) addpd(xmm6, xmm12) movaps(xmm5, xmm6) @@ -293,28 +293,28 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rax, -6*16), xmm0) mulpd(xmm1, xmm6) movaps(mem(rax, -5*16), xmm1) - - + + add(imm(4*1*8), rax) // a += 4 (1 x mr) add(imm(4*1*8), rbx) // b += 4 (1 x nr) - - + + dec(rsi) // i -= 1; jne(.LOOPKLEFT) // iterate again if i != 0. - - - + + + label(.POSTACCUM) - + addpd(xmm3, xmm11) addpd(xmm4, xmm15) addpd(xmm5, xmm10) addpd(xmm6, xmm14) - - - + + + mov(var(b11), rbx) // load address of b11. 
- + // xmm8: xmm9: xmm10: xmm11: // ( ab01 ( ab00 ( ab03 ( ab02 // ab10 ) ab11 ) ab12 ) ab13 ) @@ -326,30 +326,30 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(xmm8, xmm1) unpcklpd(xmm8, xmm0) unpckhpd(xmm9, xmm1) - + movaps(xmm11, xmm4) movaps(xmm10, xmm5) unpcklpd(xmm10, xmm4) unpckhpd(xmm11, xmm5) - + movaps(xmm13, xmm2) movaps(xmm12, xmm3) unpcklpd(xmm12, xmm2) unpckhpd(xmm13, xmm3) - + movaps(xmm15, xmm6) movaps(xmm14, xmm7) unpcklpd(xmm14, xmm6) unpckhpd(xmm15, xmm7) - + // xmm0: ( ab00 ab01 ) xmm4: ( ab02 ab03 ) // xmm1: ( ab10 ab11 ) xmm5: ( ab12 ab13 ) // xmm2: ( ab20 ab21 ) xmm6: ( ab22 ab23 ) // xmm3: ( ab30 ab31 ) xmm7: ( ab32 ab33 ) - + mov(var(alpha), rax) // load address of alpha movddup(mem(rax), xmm15) // load alpha and duplicate - + movaps(mem(rbx, 0*16), xmm8) movaps(mem(rbx, 1*16), xmm12) mulpd(xmm15, xmm8) // xmm8 = alpha * ( beta00 beta01 ) @@ -365,13 +365,13 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movaps(mem(rbx, 6*16), xmm11) mulpd(xmm15, xmm11) // xmm11 = alpha * ( beta30 beta31 ) mulpd(mem(rbx, 7*16), xmm15) // xmm15 = alpha * ( beta32 beta33 ) - + // (Now scaled by alpha:) // xmm8: ( beta00 beta01 ) xmm12: ( beta02 beta03 ) // xmm9: ( beta10 beta11 ) xmm13: ( beta12 beta13 ) // xmm10: ( beta20 beta21 ) xmm14: ( beta22 beta23 ) // xmm11: ( beta30 beta31 ) xmm15: ( beta32 beta33 ) - + subpd(xmm0, xmm8) // xmm8 -= xmm0 subpd(xmm1, xmm9) // xmm9 -= xmm1 subpd(xmm2, xmm10) // xmm10 -= xmm2 @@ -380,31 +380,31 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 subpd(xmm5, xmm13) // xmm13 -= xmm5 subpd(xmm6, xmm14) // xmm14 -= xmm6 subpd(xmm7, xmm15) // xmm15 -= xmm7 - - - + + + label(.TRSM) - - + + mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 - + mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) - + add(rsi, rcx) // c11 += (4-1)*rs_c add(rsi, rcx) add(rsi, rcx) lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c; - - 
- + + + // iteration 0 - + movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) - + #ifdef BLIS_ENABLE_TRSM_PREINVERSION mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); @@ -412,7 +412,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 divpd(xmm3, xmm11) // xmm11 /= alpha33; divpd(xmm3, xmm15) // xmm15 /= alpha33; #endif - + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] @@ -421,14 +421,14 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 1 - + movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23 - + movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 ) @@ -441,7 +441,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 divpd(xmm2, xmm10) // xmm10 /= alpha22; divpd(xmm2, xmm14) // xmm14 /= alpha22; #endif - + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] @@ -450,15 +450,15 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 2 - + movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12 movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13 - + movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 ) @@ -476,7 +476,7 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 divpd(xmm1, xmm9) // xmm9 /= alpha11; divpd(xmm1, xmm13) // xmm13 
/= alpha11; #endif - + movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] @@ -485,16 +485,16 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 3 - + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01 movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02 movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03 - + movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 @@ -517,16 +517,16 @@ void bli_dgemmtrsm_u_penryn_asm_4x4 divpd(xmm0, xmm8) // xmm8 /= alpha00; divpd(xmm0, xmm12) // xmm12 /= alpha00; #endif - + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] - - - + + + end_asm( : // output operands (none) : // input operands diff --git a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c index 21c0b2f100..69341320e2 100644 --- a/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_trsm_l_penryn_asm_d4x4.c @@ -43,8 +43,8 @@ void bli_strsm_l_penryn_asm_8x4 float* restrict a11, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -55,8 +55,8 @@ void bli_dtrsm_l_penryn_asm_4x4 double* restrict a11, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict 
cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -65,9 +65,9 @@ void bli_dtrsm_l_penryn_asm_4x4 uint64_t cs_c = cs_c0; begin_asm() - + mov(var(b11), rbx) // load address of b11. - + movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 ) movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 ) @@ -76,28 +76,28 @@ void bli_dtrsm_l_penryn_asm_4x4 movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 ) movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 ) movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 ) - - - + + + mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 - + mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) - + lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c - - - + + + // iteration 0 - + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) - + mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); - + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] @@ -106,14 +106,14 @@ void bli_dtrsm_l_penryn_asm_4x4 movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 1 - + movddup(mem(1+0*4)*8(rax), xmm0) // load xmm0 = alpha10 movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) - + movaps(xmm0, xmm4) // xmm4 = xmm0 mulpd(xmm8, xmm0) // xmm0 = alpha10 * ( beta00 beta01 ) mulpd(xmm12, xmm4) // xmm4 = alpha10 * ( beta02 beta03 ) @@ -121,7 +121,7 @@ void bli_dtrsm_l_penryn_asm_4x4 subpd(xmm4, xmm13) // xmm13 -= xmm4 mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); - + movaps(xmm9, 
mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] @@ -130,15 +130,15 @@ void bli_dtrsm_l_penryn_asm_4x4 movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 2 - + movddup(mem(2+0*4)*8(rax), xmm0) // load xmm0 = alpha20 movddup(mem(2+1*4)*8(rax), xmm1) // load xmm1 = alpha21 movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) - + movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 mulpd(xmm8, xmm0) // xmm0 = alpha20 * ( beta00 beta01 ) @@ -151,7 +151,7 @@ void bli_dtrsm_l_penryn_asm_4x4 subpd(xmm4, xmm14) // xmm14 -= xmm4 mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); - + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] @@ -160,16 +160,16 @@ void bli_dtrsm_l_penryn_asm_4x4 movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] add(rsi, rcx) // c11 += rs_c add(rsi, rdx) // c11_2 += rs_c - - - + + + // iteration 3 - + movddup(mem(3+0*4)*8(rax), xmm0) // load xmm0 = alpha30 movddup(mem(3+1*4)*8(rax), xmm1) // load xmm1 = alpha31 movddup(mem(3+2*4)*8(rax), xmm2) // load xmm2 = alpha32 movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) - + movaps(xmm0, xmm4) // xmm4 = xmm0 movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 @@ -187,16 +187,16 @@ void bli_dtrsm_l_penryn_asm_4x4 subpd(xmm4, xmm15) // xmm15 -= xmm4 mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); - + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] movhpd(xmm11, mem(rcx, rdi, 1)) // store ( 
gamma31 ) = xmm11[1] movlpd(xmm15, mem(rdx)) // store ( gamma32 ) = xmm15[0] movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] - - - + + + end_asm( : // output operands (none) diff --git a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c index 23855a460a..0befb4e4e7 100644 --- a/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c +++ b/kernels/penryn/3/bli_trsm_u_penryn_asm_d4x4.c @@ -43,8 +43,8 @@ void bli_strsm_u_penryn_asm_8x4 float* restrict a11, float* restrict b11, float* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -55,8 +55,8 @@ void bli_dtrsm_u_penryn_asm_4x4 double* restrict a11, double* restrict b11, double* restrict c11, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a @@ -65,9 +65,9 @@ void bli_dtrsm_u_penryn_asm_4x4 uint64_t cs_c = cs_c0; begin_asm() - + mov(var(b11), rbx) // load address of b11. 
- + movaps(mem(rbx, 0*16), xmm8) // xmm8 = ( beta00 beta01 ) movaps(mem(rbx, 1*16), xmm12) // xmm9 = ( beta02 beta03 ) movaps(mem(rbx, 2*16), xmm9) // xmm10 = ( beta10 beta11 ) @@ -76,31 +76,31 @@ void bli_dtrsm_u_penryn_asm_4x4 movaps(mem(rbx, 5*16), xmm14) // xmm13 = ( beta22 beta23 ) movaps(mem(rbx, 6*16), xmm11) // xmm14 = ( beta30 beta31 ) movaps(mem(rbx, 7*16), xmm15) // xmm15 = ( beta32 beta33 ) - - - + + + mov(var(a11), rax) // load address of a11 mov(var(c11), rcx) // load address of c11 - + mov(var(rs_c), rsi) // load rs_c mov(var(cs_c), rdi) // load cs_c sal(imm(3), rsi) // rs_c *= sizeof( double ) sal(imm(3), rdi) // cs_c *= sizeof( double ) - + add(rsi, rcx) // c11 += (4-1)*rs_c add(rsi, rcx) add(rsi, rcx) lea(mem(rcx, rdi, 2), rdx) // c11_2 = c11 + 2*cs_c; - - - + + + // iteration 0 - + movddup(mem(3+3*4)*8(rax), xmm3) // load xmm3 = (1/alpha33) - + mulpd(xmm3, xmm11) // xmm11 *= (1/alpha33); mulpd(xmm3, xmm15) // xmm15 *= (1/alpha33); - + movaps(xmm11, mem(rbx, 6*16)) // store ( beta30 beta31 ) = xmm11 movaps(xmm15, mem(rbx, 7*16)) // store ( beta32 beta33 ) = xmm15 movlpd(xmm11, mem(rcx)) // store ( gamma30 ) = xmm11[0] @@ -109,14 +109,14 @@ void bli_dtrsm_u_penryn_asm_4x4 movhpd(xmm15, mem(rdx, rdi, 1)) // store ( gamma33 ) = xmm15[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 1 - + movddup(mem(2+2*4)*8(rax), xmm2) // load xmm2 = (1/alpha22) movddup(mem(2+3*4)*8(rax), xmm3) // load xmm3 = alpha23 - + movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm11, xmm3) // xmm3 = alpha23 * ( beta30 beta31 ) mulpd(xmm15, xmm7) // xmm7 = alpha23 * ( beta32 beta33 ) @@ -124,7 +124,7 @@ void bli_dtrsm_u_penryn_asm_4x4 subpd(xmm7, xmm14) // xmm14 -= xmm7 mulpd(xmm2, xmm10) // xmm10 *= (1/alpha22); mulpd(xmm2, xmm14) // xmm14 *= (1/alpha22); - + movaps(xmm10, mem(rbx, 4*16)) // store ( beta20 beta21 ) = xmm10 movaps(xmm14, mem(rbx, 5*16)) // store ( beta22 beta23 ) = xmm14 movlpd(xmm10, mem(rcx)) // store ( gamma20 ) = xmm10[0] @@ 
-133,15 +133,15 @@ void bli_dtrsm_u_penryn_asm_4x4 movhpd(xmm14, mem(rdx, rdi, 1)) // store ( gamma23 ) = xmm14[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 2 - + movddup(mem(1+1*4)*8(rax), xmm1) // load xmm1 = (1/alpha11) movddup(mem(1+2*4)*8(rax), xmm2) // load xmm2 = alpha12 movddup(mem(1+3*4)*8(rax), xmm3) // load xmm3 = alpha13 - + movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 mulpd(xmm10, xmm2) // xmm2 = alpha12 * ( beta20 beta21 ) @@ -154,7 +154,7 @@ void bli_dtrsm_u_penryn_asm_4x4 subpd(xmm6, xmm13) // xmm13 -= xmm6 mulpd(xmm1, xmm9) // xmm9 *= (1/alpha11); mulpd(xmm1, xmm13) // xmm13 *= (1/alpha11); - + movaps(xmm9, mem(rbx, 2*16)) // store ( beta10 beta11 ) = xmm9 movaps(xmm13, mem(rbx, 3*16)) // store ( beta12 beta13 ) = xmm13 movlpd(xmm9, mem(rcx)) // store ( gamma10 ) = xmm9[0] @@ -163,16 +163,16 @@ void bli_dtrsm_u_penryn_asm_4x4 movhpd(xmm13, mem(rdx, rdi, 1)) // store ( gamma13 ) = xmm13[1] sub(rsi, rcx) // c11 -= rs_c sub(rsi, rdx) // c11_2 -= rs_c - - - + + + // iteration 3 - + movddup(mem(0+0*4)*8(rax), xmm0) // load xmm0 = (1/alpha00) movddup(mem(0+1*4)*8(rax), xmm1) // load xmm1 = alpha01 movddup(mem(0+2*4)*8(rax), xmm2) // load xmm2 = alpha02 movddup(mem(0+3*4)*8(rax), xmm3) // load xmm3 = alpha03 - + movaps(xmm1, xmm5) // xmm5 = xmm1 movaps(xmm2, xmm6) // xmm6 = xmm2 movaps(xmm3, xmm7) // xmm7 = xmm3 @@ -190,16 +190,16 @@ void bli_dtrsm_u_penryn_asm_4x4 subpd(xmm5, xmm12) // xmm12 -= xmm5 mulpd(xmm0, xmm8) // xmm8 *= (1/alpha00); mulpd(xmm0, xmm12) // xmm12 *= (1/alpha00); - + movaps(xmm8, mem(rbx, 0*16)) // store ( beta00 beta01 ) = xmm8 movaps(xmm12, mem(rbx, 1*16)) // store ( beta02 beta03 ) = xmm12 movlpd(xmm8, mem(rcx)) // store ( gamma00 ) = xmm8[0] movhpd(xmm8, mem(rcx, rdi, 1)) // store ( gamma01 ) = xmm8[1] movlpd(xmm12, mem(rdx)) // store ( gamma02 ) = xmm12[0] movhpd(xmm12, mem(rdx, rdi, 1)) // store ( gamma03 ) = xmm12[1] - - - + + + end_asm( : // output operands 
(none) diff --git a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c index e65ce7178a..95ce7edeb2 100644 --- a/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c +++ b/kernels/piledriver/3/bli_gemm_piledriver_asm_d8x3.c @@ -50,8 +50,8 @@ void bli_sgemm_piledriver_asm_16x3 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -531,8 +531,8 @@ void bli_dgemm_piledriver_asm_8x3 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -987,8 +987,8 @@ void bli_cgemm_piledriver_asm_4x2 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); @@ -1397,8 +1397,8 @@ void bli_zgemm_piledriver_asm_2x2 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/power10/3/bli_dgemm_power10_mma.c b/kernels/power10/3/bli_dgemm_power10_mma.c index 84e7d16d34..abf66f58ff 100644 --- a/kernels/power10/3/bli_dgemm_power10_mma.c +++ b/kernels/power10/3/bli_dgemm_power10_mma.c @@ -70,8 +70,8 @@ void bli_dgemm_power10_mma_8x8 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_i16gemm_power10_mma.c b/kernels/power10/3/bli_i16gemm_power10_mma.c index c7f81dc7de..d0c9390f5a 100644 --- 
a/kernels/power10/3/bli_i16gemm_power10_mma.c +++ b/kernels/power10/3/bli_i16gemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_i16gemm_power10_mma_8x16 short* restrict b, int32_t* restrict beta, int32_t* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_i16sgemm_power10_mma.c b/kernels/power10/3/bli_i16sgemm_power10_mma.c index 9e8d99c138..7d84e68e21 100644 --- a/kernels/power10/3/bli_i16sgemm_power10_mma.c +++ b/kernels/power10/3/bli_i16sgemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_i16sgemm_power10_mma_8x16 short* restrict b, int32_t* restrict beta, int32_t* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_i4gemm_power10_mma.c b/kernels/power10/3/bli_i4gemm_power10_mma.c index 7527f271ff..6c78a9f00c 100644 --- a/kernels/power10/3/bli_i4gemm_power10_mma.c +++ b/kernels/power10/3/bli_i4gemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_i4gemm_power10_mma_8x16 nibbles* restrict b, int32_t* restrict beta, int32_t* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_i8gemm_power10_mma.c b/kernels/power10/3/bli_i8gemm_power10_mma.c index 037a285953..8a0b158a58 100644 --- a/kernels/power10/3/bli_i8gemm_power10_mma.c +++ b/kernels/power10/3/bli_i8gemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_i8gemm_power10_mma_8x16 int8_t* restrict b, int32_t* restrict beta, int32_t* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t k_iter = (k-1) / 4; diff --git a/kernels/power10/3/bli_sbgemm_power10_mma.c b/kernels/power10/3/bli_sbgemm_power10_mma.c index b37a0c7cee..c16710f45a 100644 --- a/kernels/power10/3/bli_sbgemm_power10_mma.c +++ 
b/kernels/power10/3/bli_sbgemm_power10_mma.c @@ -64,8 +64,8 @@ void bli_sbgemm_power10_mma_8x16 bfloat16* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power10/3/bli_sgemm_power10_mma.c b/kernels/power10/3/bli_sgemm_power10_mma.c index 42bbaa9169..15895e654a 100644 --- a/kernels/power10/3/bli_sgemm_power10_mma.c +++ b/kernels/power10/3/bli_sgemm_power10_mma.c @@ -63,8 +63,8 @@ void bli_sgemm_power10_mma_8x16 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a diff --git a/kernels/power10/3/bli_shgemm_power10_mma.c b/kernels/power10/3/bli_shgemm_power10_mma.c index 0e80735dfa..dc62b5d60e 100644 --- a/kernels/power10/3/bli_shgemm_power10_mma.c +++ b/kernels/power10/3/bli_shgemm_power10_mma.c @@ -64,8 +64,8 @@ void bli_shgemm_power10_mma_8x16 float16* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { diff --git a/kernels/power7/3/bli_gemm_power7_int_8x4.c b/kernels/power7/3/bli_gemm_power7_int_8x4.c index b9ce85f724..8ca0c891e8 100644 --- a/kernels/power7/3/bli_gemm_power7_int_8x4.c +++ b/kernels/power7/3/bli_gemm_power7_int_8x4.c @@ -58,8 +58,8 @@ void bli_sgemm_power7_int_8x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 1 || defined(UTEST) @@ -100,8 +100,8 @@ void bli_dgemm_power7_int_8x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { if ( cs_c == 1 ) @@ -457,8 
+457,8 @@ void bli_cgemm_power7_int_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 1 || defined(UTEST) @@ -510,8 +510,8 @@ void bli_zgemm_power7_int_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { #if 1 || defined(UTEST) diff --git a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h index 50984a67df..a8082b38ef 100644 --- a/kernels/power7/3/test/bli_gemm_power7_int_8x4.h +++ b/kernels/power7/3/test/bli_gemm_power7_int_8x4.h @@ -51,8 +51,8 @@ void bli_sgemm_opt_8x4 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ); void bli_dgemm_opt_8x4 @@ -65,8 +65,8 @@ void bli_dgemm_opt_8x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ); void bli_cgemm_opt_8x4 @@ -79,8 +79,8 @@ void bli_cgemm_opt_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ); void bli_zgemm_opt_8x4 @@ -93,8 +93,8 @@ void bli_zgemm_opt_8x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c, inc_t cs_c, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ); #endif diff --git a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c index 3e5f0d4164..70af2b17ed 100644 --- a/kernels/power9/3/bli_gemm_power9_asm_d12x6.c +++ b/kernels/power9/3/bli_gemm_power9_asm_d12x6.c @@ -45,8 +45,8 @@ void 
bli_dgemm_power9_asm_12x6 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { // Typecast local copies of integers in case dim_t and inc_t are a diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c index 7890ad347d..051af62e7f 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_asm_d8x4.c @@ -50,8 +50,8 @@ void bli_sgemm_sandybridge_asm_8x8 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -542,8 +542,8 @@ void bli_dgemm_sandybridge_asm_8x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1004,8 +1004,8 @@ void bli_cgemm_sandybridge_asm_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1707,8 +1707,8 @@ void bli_zgemm_sandybridge_asm_4x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); diff --git a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c index 6bf991082b..cb1cdc7c29 100644 --- a/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c +++ b/kernels/sandybridge/3/bli_gemm_sandybridge_int_d8x4.c @@ -48,8 +48,8 @@ void 
bli_sgemm_sandybridge_int_8x8 float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -65,8 +65,8 @@ void bli_dgemm_sandybridge_int_8x4 double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -503,8 +503,8 @@ void bli_cgemm_sandybridge_int_8x4 scomplex* restrict b, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } @@ -523,8 +523,8 @@ void bli_zgemm_sandybridge_int_4x4 dcomplex* restrict b, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { } diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c index 9943a170be..2579ac4b51 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x12_l2.c @@ -298,7 +298,7 @@ void bli_dgemm_skx_asm_16x12_l2 double* restrict beta, double* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, - cntx_t* restrict cntx + cntx_t* cntx ) { (void)data; diff --git a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c index e3bc52041d..babb89a1dc 100644 --- a/kernels/skx/3/bli_dgemm_skx_asm_16x14.c +++ b/kernels/skx/3/bli_dgemm_skx_asm_16x14.c @@ -164,7 +164,7 @@ void bli_dgemm_skx_asm_16x14 double* restrict beta, double* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, - cntx_t* restrict cntx + cntx_t* cntx ) { (void)data; diff --git a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c index 8808449b65..99b850d1d8 100644 --- a/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c +++ b/kernels/skx/3/bli_sgemm_skx_asm_32x12_l2.c 
@@ -328,7 +328,7 @@ void bli_sgemm_skx_asm_32x12_l2 float* restrict beta, float* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, - cntx_t* restrict cntx + cntx_t* cntx ) { (void)data; diff --git a/kernels/zen/1/bli_amaxv_zen_int.c b/kernels/zen/1/bli_amaxv_zen_int.c index 4ece5af291..d1263a6c1b 100644 --- a/kernels/zen/1/bli_amaxv_zen_int.c +++ b/kernels/zen/1/bli_amaxv_zen_int.c @@ -104,7 +104,7 @@ void bli_samaxv_zen_int dim_t n, float* restrict x, inc_t incx, dim_t* restrict i_max, - cntx_t* restrict cntx + cntx_t* cntx ) { float* minus_one = PASTEMAC(s,m1); @@ -202,7 +202,7 @@ void bli_samaxv_zen_int max_vec_hi.v = _mm256_extractf128_ps( max_vec.v, 1 ); maxInx_vec_lo.v = _mm256_extractf128_ps( maxInx_vec.v, 0 ); maxInx_vec_hi.v = _mm256_extractf128_ps( maxInx_vec.v, 1 ); - + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); @@ -210,7 +210,7 @@ void bli_samaxv_zen_int max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 14 ); maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 14 ); - + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); @@ -218,7 +218,7 @@ void bli_samaxv_zen_int max_vec_hi.v = _mm_permute_ps( max_vec_lo.v, 1 ); maxInx_vec_hi.v = _mm_permute_ps( maxInx_vec_lo.v, 1 ); - + mask_vec_lo.v = CMP128( s, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_ps( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); @@ -269,7 +269,7 @@ void bli_damaxv_zen_int dim_t n, double* restrict x, inc_t incx, dim_t* restrict i_max, - cntx_t* restrict cntx + cntx_t* cntx ) { double* minus_one = PASTEMAC(d,m1); @@ -367,15 +367,15 @@ void bli_damaxv_zen_int max_vec_hi.v = _mm256_extractf128_pd( max_vec.v, 1 ); maxInx_vec_lo.v = _mm256_extractf128_pd( maxInx_vec.v, 0 ); maxInx_vec_hi.v = _mm256_extractf128_pd( 
maxInx_vec.v, 1 ); - + mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); maxInx_vec_lo.v = _mm_blendv_pd( maxInx_vec_lo.v, maxInx_vec_hi.v, mask_vec_lo.v ); - + max_vec_hi.v = _mm_permute_pd( max_vec_lo.v, 1 ); maxInx_vec_hi.v = _mm_permute_pd( maxInx_vec_lo.v, 1 ); - + mask_vec_lo.v = CMP128( d, max_vec_hi.v, max_vec_lo.v, maxInx_vec_hi.v, maxInx_vec_lo.v ); max_vec_lo.v = _mm_blendv_pd( max_vec_lo.v, max_vec_hi.v, mask_vec_lo.v ); diff --git a/kernels/zen/1/bli_axpyv_zen_int.c b/kernels/zen/1/bli_axpyv_zen_int.c index 686580b290..b842c59eda 100644 --- a/kernels/zen/1/bli_axpyv_zen_int.c +++ b/kernels/zen/1/bli_axpyv_zen_int.c @@ -62,7 +62,7 @@ void bli_saxpyv_zen_int float* restrict alpha, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -166,7 +166,7 @@ void bli_daxpyv_zen_int double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_axpyv_zen_int10.c b/kernels/zen/1/bli_axpyv_zen_int10.c index 873b7da536..6ad6d30cfb 100644 --- a/kernels/zen/1/bli_axpyv_zen_int10.c +++ b/kernels/zen/1/bli_axpyv_zen_int10.c @@ -62,7 +62,7 @@ void bli_saxpyv_zen_int10 float* restrict alpha, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -268,7 +268,7 @@ void bli_daxpyv_zen_int10 double* restrict alpha, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_copyv_zen_int.c b/kernels/zen/1/bli_copyv_zen_int.c index 5fd2b15760..87ff03961b 100644 --- a/kernels/zen/1/bli_copyv_zen_int.c +++ b/kernels/zen/1/bli_copyv_zen_int.c @@ -43,7 +43,7 @@ 
void bli_scopyv_zen_int dim_t n, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t num_elem_per_reg = 8; @@ -192,7 +192,7 @@ void bli_dcopyv_zen_int dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t num_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_dotv_zen_int.c b/kernels/zen/1/bli_dotv_zen_int.c index 01022d353a..03c448f857 100644 --- a/kernels/zen/1/bli_dotv_zen_int.c +++ b/kernels/zen/1/bli_dotv_zen_int.c @@ -62,7 +62,7 @@ void bli_sdotv_zen_int float* restrict x, inc_t incx, float* restrict y, inc_t incy, float* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -184,7 +184,7 @@ void bli_ddotv_zen_int double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_dotv_zen_int10.c b/kernels/zen/1/bli_dotv_zen_int10.c index 8c445849b0..f3fe5ea71f 100644 --- a/kernels/zen/1/bli_dotv_zen_int10.c +++ b/kernels/zen/1/bli_dotv_zen_int10.c @@ -63,7 +63,7 @@ void bli_sdotv_zen_int10 float* restrict x, inc_t incx, float* restrict y, inc_t incy, float* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -254,7 +254,7 @@ void bli_ddotv_zen_int10 double* restrict x, inc_t incx, double* restrict y, inc_t incy, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_dotxv_zen_int.c b/kernels/zen/1/bli_dotxv_zen_int.c index 99ea517104..48a9878a77 100644 --- a/kernels/zen/1/bli_dotxv_zen_int.c +++ b/kernels/zen/1/bli_dotxv_zen_int.c @@ -64,7 +64,7 @@ void bli_sdotxv_zen_int float* restrict y, inc_t incy, float* restrict beta, float* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -192,7 +192,7 @@ void 
bli_ddotxv_zen_int double* restrict y, inc_t incy, double* restrict beta, double* restrict rho, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; @@ -264,7 +264,7 @@ void bli_ddotxv_zen_int x3v.v = _mm256_loadu_pd( x0 + 3*n_elem_per_reg ); y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); - + // Compute the element-wise product of the x and y vectors, // storing in the corresponding rho vectors. rho0v.v = _mm256_fmadd_pd( x0v.v, y0v.v, rho0v.v ); diff --git a/kernels/zen/1/bli_scalv_zen_int.c b/kernels/zen/1/bli_scalv_zen_int.c index fb17dd4b38..f92cb0c6cb 100644 --- a/kernels/zen/1/bli_scalv_zen_int.c +++ b/kernels/zen/1/bli_scalv_zen_int.c @@ -61,7 +61,7 @@ void bli_sscalv_zen_int dim_t n, float* restrict alpha, float* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -160,7 +160,7 @@ void bli_dscalv_zen_int dim_t n, double* restrict alpha, double* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index d536ed7c02..c82e773953 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -61,7 +61,7 @@ void bli_sscalv_zen_int10 dim_t n, float* restrict alpha, float* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 8; @@ -261,7 +261,7 @@ void bli_dscalv_zen_int10 dim_t n, double* restrict alpha, double* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_setv_zen_int.c b/kernels/zen/1/bli_setv_zen_int.c index 16e02c94da..0fbc24cfda 100644 --- a/kernels/zen/1/bli_setv_zen_int.c +++ b/kernels/zen/1/bli_setv_zen_int.c @@ -43,7 +43,7 @@ void bli_ssetv_zen_int dim_t n, float* restrict alpha, float* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t num_elem_per_reg = 8; @@ -138,7 
+138,7 @@ void bli_dsetv_zen_int dim_t n, double* restrict alpha, double* restrict x, inc_t incx, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t num_elem_per_reg = 4; diff --git a/kernels/zen/1/bli_swapv_zen_int8.c b/kernels/zen/1/bli_swapv_zen_int8.c index aa7a6e3398..824fd0fb81 100644 --- a/kernels/zen/1/bli_swapv_zen_int8.c +++ b/kernels/zen/1/bli_swapv_zen_int8.c @@ -59,7 +59,7 @@ void bli_sswapv_zen_int8 dim_t n, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { @@ -205,7 +205,7 @@ void bli_dswapv_zen_int8 dim_t n, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t n_elem_per_reg = 4; diff --git a/kernels/zen/1f/bli_axpyf_zen_int_8.c b/kernels/zen/1f/bli_axpyf_zen_int_8.c index 15fdf46514..24e6ee5e2a 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_8.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_8.c @@ -64,7 +64,7 @@ void bli_saxpyf_zen_int_8 float* restrict a, inc_t inca, inc_t lda, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 8; @@ -273,7 +273,7 @@ void bli_daxpyf_zen_int_8 double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 8; diff --git a/kernels/zen/1f/bli_dotxf_zen_int_8.c b/kernels/zen/1f/bli_dotxf_zen_int_8.c index 1f4a671b65..50ca925610 100644 --- a/kernels/zen/1f/bli_dotxf_zen_int_8.c +++ b/kernels/zen/1f/bli_dotxf_zen_int_8.c @@ -65,7 +65,7 @@ void bli_sdotxf_zen_int_8 float* restrict x, inc_t incx, float* restrict beta, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 8; @@ -455,7 +455,7 @@ void bli_ddotxf_zen_int_8 double* restrict x, inc_t incx, double* restrict beta, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 8; diff --git 
a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c index 8d10406a05..076953725a 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8m.c @@ -123,8 +123,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 8; @@ -495,7 +495,7 @@ void bli_cgemmsup_rv_zen_asm_3x8m vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm13, ymm13) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate @@ -583,7 +583,7 @@ void bli_cgemmsup_rv_zen_asm_3x8m CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm9, ymm0, ymm9) - add(rdi, rcx) + add(rdi, rcx) CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm13, ymm0, ymm13) @@ -609,18 +609,18 @@ void bli_cgemmsup_rv_zen_asm_3x8m vmovups(xmm2, mem(rcx)) // store (gamma03-13) vmovhpd(xmm12, mem(rcx, 16)) // store (gamma33) lea(mem(rcx, rsi, 1), rcx) - + /******************Transpose bottom tile 4x3***************************/ vunpcklpd(ymm9, ymm5, ymm0) //a8a9b8b9 a12a13b12b13 //gamma04-14 gamma06-16 vunpckhpd(ymm9, ymm5, ymm2) //a10a11b10b11 a14a15b14b15 //gamma05-15 gamma07-17 - + vmovups(xmm0, mem(rcx)) // store (gamma04-14) vmovlpd(xmm13, mem(rcx, 16)) // store (gamma24) lea(mem(rcx, rsi, 1), rcx) vmovups(xmm2, mem(rcx)) // store (gamma05-15) vmovhpd(xmm13, mem(rcx, 16)) // store (gamma25) lea(mem(rcx, rsi, 1), rcx) - + vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vextractf128(imm(0x1), ymm13, xmm13) @@ -658,8 +658,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi 
= cs_c * sizeof(dt) - vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 - vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 + vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 + vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 /******************Transpose top tile 4x3***************************/ vmovups(xmm0, mem(rcx)) @@ -680,8 +680,8 @@ void bli_cgemmsup_rv_zen_asm_3x8m lea(mem(rcx, rsi, 1), rcx) /******************Transpose bottom tile 4x3***************************/ - vunpcklpd(ymm9, ymm5, ymm0) //a8a9b8b9 a12a13b12b13 - vunpckhpd(ymm9, ymm5, ymm2) //a10a11b10b11 a14a15b14b15 + vunpcklpd(ymm9, ymm5, ymm0) //a8a9b8b9 a12a13b12b13 + vunpckhpd(ymm9, ymm5, ymm2) //a10a11b10b11 a14a15b14b15 vmovups(xmm0, mem(rcx)) vmovlpd(xmm13, mem(rcx, 16)) @@ -788,8 +788,8 @@ void bli_cgemmsup_rv_zen_asm_3x4m scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1060,7 +1060,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m vmulps(ymm1, ymm3, ymm3) vaddsubps(ymm3, ymm12, ymm12) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), ymm2) // load beta_i and duplicate @@ -1117,7 +1117,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm4, ymm0, ymm4) add(rdi, rcx) - + CGEMM_INPUT_SCALE_CS_BETA_NZ vaddps(ymm8, ymm0, ymm8) add(rdi, rcx) @@ -1136,7 +1136,7 @@ void bli_cgemmsup_rv_zen_asm_3x4m vmovups(xmm2, mem(rcx)) // store (gamma01-11) vmovhpd(xmm12, mem(rcx, 16)) // store (gamma21) lea(mem(rcx, rsi, 1), rcx) - + vextractf128(imm(0x1), ymm0, xmm0) vextractf128(imm(0x1), ymm2, xmm2) vextractf128(imm(0x1), ymm12, xmm12) @@ -1172,8 +1172,8 @@ void bli_cgemmsup_rv_zen_asm_3x4m mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 
8), rsi) // rsi = cs_c * sizeof(dt) - vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 - vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 + vunpcklpd(ymm8, ymm4, ymm0) //a0a1b0b1 a4a4b4b5 + vunpckhpd(ymm8, ymm4, ymm2) //a2a3b2b3 a6a7b6b7 vmovups(xmm0, mem(rcx)) vmovlpd(xmm12, mem(rcx, 16)) @@ -1277,8 +1277,8 @@ void bli_cgemmsup_rv_zen_asm_3x2m scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -1543,7 +1543,7 @@ void bli_cgemmsup_rv_zen_asm_3x2m vmulps(xmm1, xmm3, xmm3) vaddsubps(xmm3, xmm12, xmm12) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastss(mem(rbx), xmm1) // load beta_r and duplicate vbroadcastss(mem(rbx, 4), xmm2) // load beta_i and duplicate @@ -1627,7 +1627,7 @@ void bli_cgemmsup_rv_zen_asm_3x2m CGEMM_INPUT_SCALE_CS_BETA_NZ_128 vaddps(xmm4, xmm0, xmm4) add(rdi, rcx) - + CGEMM_INPUT_SCALE_CS_BETA_NZ_128 vaddps(xmm8, xmm0, xmm8) add(rdi, rcx) @@ -1753,4 +1753,3 @@ void bli_cgemmsup_rv_zen_asm_3x2m } } - \ No newline at end of file diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c index 6c68707e18..62491dfb41 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_c3x8n.c @@ -80,14 +80,14 @@ void bli_cgemmsup_rv_zen_asm_3x8n scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 3; if ( m_left ) { - cgemmsup_ker_ft ker_fps[3] = + cgemmsup_ker_ft ker_fps[3] = { NULL, bli_cgemmsup_rv_zen_asm_1x8n, @@ -120,7 +120,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n 
uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - + if ( n_iter == 0 ) goto consider_edge_cases; // ------------------------------------------------------------------------- @@ -150,7 +150,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n ymm13 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); ymm15 = _mm256_setzero_ps(); - + dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; @@ -170,7 +170,7 @@ void bli_cgemmsup_rv_zen_asm_3x8n // This loop is processing MR x K ymm0 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter + 4)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_ss((float const *)(tA)); @@ -534,8 +534,8 @@ void bli_cgemmsup_rv_zen_asm_2x8n scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -600,7 +600,7 @@ void bli_cgemmsup_rv_zen_asm_2x8n // This loop is processing MR x K ymm0 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_ps((float const *)(tB + tb_inc_row * k_iter + 4)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm2 = _mm256_broadcast_ss((float const *)(tA)); @@ -882,8 +882,8 @@ void bli_cgemmsup_rv_zen_asm_1x8n scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1151,8 +1151,8 @@ void bli_cgemmsup_rv_zen_asm_3x4 scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1184,7 +1184,7 @@ void bli_cgemmsup_rv_zen_asm_3x4 ymm10 = _mm256_setzero_ps(); ymm12 = _mm256_setzero_ps(); ymm14 = _mm256_setzero_ps(); - + dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; @@ -1386,8 +1386,8 @@ void bli_cgemmsup_rv_zen_asm_3x2 scomplex* restrict b, inc_t rs_b0, inc_t cs_b0, scomplex* restrict beta, scomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1408,7 +1408,7 @@ void bli_cgemmsup_rv_zen_asm_3x2 scomplex *tB = b; scomplex *tC = c; // clear scratch registers. 
- __m128 xmm0, xmm1, xmm2, xmm3; + __m128 xmm0, xmm1, xmm2, xmm3; __m128 xmm4 = _mm_setzero_ps(); __m128 xmm6 = _mm_setzero_ps(); __m128 xmm8 = _mm_setzero_ps(); diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c index 1638eaba0b..b9ed3c9f9e 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4.c @@ -82,8 +82,8 @@ void bli_zgemmsup_rv_zen_asm_2x4 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -357,7 +357,7 @@ void bli_zgemmsup_rv_zen_asm_2x4 vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm9, ymm9) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -413,7 +413,7 @@ void bli_zgemmsup_rv_zen_asm_2x4 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ @@ -423,16 +423,16 @@ void bli_zgemmsup_rv_zen_asm_2x4 ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm8, ymm0, ymm8) add(rdi, rcx) - + lea(mem(r12, rsi, 2), rcx) - + ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm5, ymm0, ymm5) add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm9, ymm0, ymm9) - add(rdi, rcx) + add(rdi, rcx) mov(r12, rcx) // reset rcx to current utile of c. 
@@ -454,12 +454,12 @@ void bli_zgemmsup_rv_zen_asm_2x4 vmovups(xmm8, mem(rcx, 16)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) - + add(rsi, rcx) - + vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vmovups(xmm5, mem(rcx)) @@ -501,12 +501,12 @@ void bli_zgemmsup_rv_zen_asm_2x4 vmovups(xmm8, mem(rcx, 16)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) - + add(rsi, rcx) - + vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vmovups(xmm5, mem(rcx)) @@ -558,8 +558,8 @@ void bli_zgemmsup_rv_zen_asm_1x4 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -781,7 +781,7 @@ void bli_zgemmsup_rv_zen_asm_1x4 vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm5, ymm5) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -828,14 +828,14 @@ void bli_zgemmsup_rv_zen_asm_1x4 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm4, ymm0, ymm4) lea(mem(r12, rsi, 2), rcx) - + ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm5, ymm0, ymm5) @@ -854,7 +854,7 @@ void bli_zgemmsup_rv_zen_asm_1x4 vmovups(xmm4, mem(rcx)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) add(rsi, rcx) @@ -943,8 +943,8 @@ void bli_zgemmsup_rv_zen_asm_2x2 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1178,7 +1178,7 @@ void bli_zgemmsup_rv_zen_asm_2x2 
vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm8, ymm8) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -1226,7 +1226,7 @@ void bli_zgemmsup_rv_zen_asm_2x2 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ @@ -1330,8 +1330,8 @@ void bli_zgemmsup_rv_zen_asm_1x2 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1529,7 +1529,7 @@ void bli_zgemmsup_rv_zen_asm_1x2 vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm4, ymm4) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -1571,7 +1571,7 @@ void bli_zgemmsup_rv_zen_asm_1x2 mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ @@ -1602,7 +1602,7 @@ void bli_zgemmsup_rv_zen_asm_1x2 vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + jmp(.SDONE) // jump to end. 
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c index 05e05dfece..1dd37a3952 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4m.c @@ -97,7 +97,7 @@ crr: | | | | | | | | ------ -------- - | | | | | | | | += ------ + | | | | | | | | += ------ -------- | | | | | | | | ------ -------- | | | | | | | | ------ : @@ -114,8 +114,8 @@ void bli_zgemmsup_rv_zen_asm_3x4m dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t n_left = n0 % 4; @@ -477,7 +477,7 @@ void bli_zgemmsup_rv_zen_asm_3x4m vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm13, ymm13) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -563,7 +563,7 @@ void bli_zgemmsup_rv_zen_asm_3x4m ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm9, ymm0, ymm9) - add(rdi, rcx) + add(rdi, rcx) ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm13, ymm0, ymm13) @@ -591,13 +591,13 @@ void bli_zgemmsup_rv_zen_asm_3x4m vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) vmovups(xmm13,mem(rcx,32)) - + add(rsi, rcx) - + vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vextractf128(imm(0x1), ymm13, xmm13) @@ -649,13 +649,13 @@ void bli_zgemmsup_rv_zen_asm_3x4m vmovups(xmm12, mem(rcx,32)) add(rsi, rcx) - + vmovups(xmm5, mem(rcx)) vmovups(xmm9, mem(rcx, 16)) vmovups(xmm13,mem(rcx,32)) - + add(rsi, rcx) - + vextractf128(imm(0x1), ymm5, xmm5) vextractf128(imm(0x1), ymm9, xmm9) vextractf128(imm(0x1), ymm13, xmm13) @@ -750,8 +750,8 @@ void bli_zgemmsup_rv_zen_asm_3x2m dcomplex* restrict b, 
inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -1025,7 +1025,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m vmulpd(ymm1, ymm3, ymm3) vaddsubpd(ymm3, ymm12, ymm12) - /* (ßr + ßi)x C + ((ar + ai) x AB) */ + /* (�r + �i)x C + ((ar + ai) x AB) */ mov(var(beta), rbx) // load address of beta vbroadcastsd(mem(rbx), ymm1) // load beta_r and duplicate vbroadcastsd(mem(rbx, 8), ymm2) // load beta_i and duplicate @@ -1079,7 +1079,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m mov(var(cs_c), rsi) // load cs_c lea(mem(, rsi, 8), rsi) // rsi = cs_c * sizeof(real dt) lea(mem(, rsi, 2), rsi) // rsi = cs_c * sizeof((real+imag) dt) - + lea(mem(rsi, rsi, 2), r13) // r13 = 3*rs_a ZGEMM_INPUT_SCALE_CS_BETA_NZ @@ -1089,7 +1089,7 @@ void bli_zgemmsup_rv_zen_asm_3x2m ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm8, ymm0, ymm8) add(rdi, rcx) - + ZGEMM_INPUT_SCALE_CS_BETA_NZ vaddpd(ymm12, ymm0, ymm12) @@ -1126,10 +1126,10 @@ void bli_zgemmsup_rv_zen_asm_3x2m vmovupd(ymm4, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm8, mem(rcx)) add(rdi, rcx) - + vmovupd(ymm12, mem(rcx)) jmp(.SDONE) // jump to end. 
diff --git a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c index 872d048685..58d08ecbd6 100644 --- a/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c +++ b/kernels/zen/3/sup/broken/bli_gemmsup_rv_zen_asm_z3x4n.c @@ -79,14 +79,14 @@ void bli_zgemmsup_rv_zen_asm_3x4n dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t m_left = m0 % 3; if ( m_left ) { - zgemmsup_ker_ft ker_fps[3] = + zgemmsup_ker_ft ker_fps[3] = { NULL, bli_zgemmsup_rv_zen_asm_1x4n, @@ -150,7 +150,7 @@ void bli_zgemmsup_rv_zen_asm_3x4n ymm13 = _mm256_setzero_pd(); ymm14 = _mm256_setzero_pd(); ymm15 = _mm256_setzero_pd(); - + dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; @@ -170,7 +170,7 @@ void bli_zgemmsup_rv_zen_asm_3x4n // This loop is processing MR x K ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter + 2)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. 
ymm2 = _mm256_broadcast_sd((double const *)(tA)); @@ -472,8 +472,8 @@ void bli_zgemmsup_rv_zen_asm_2x4n dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { @@ -515,7 +515,7 @@ void bli_zgemmsup_rv_zen_asm_2x4n ymm9 = _mm256_setzero_pd(); ymm10 = _mm256_setzero_pd(); ymm11 = _mm256_setzero_pd(); - + dim_t ta_inc_row = rs_a; dim_t tb_inc_row = rs_b; dim_t tc_inc_row = rs_c; @@ -535,7 +535,7 @@ void bli_zgemmsup_rv_zen_asm_2x4n // This loop is processing MR x K ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter + 2)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_sd((double const *)(tA)); @@ -772,8 +772,8 @@ void bli_zgemmsup_rv_zen_asm_1x4n dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { //void* a_next = bli_auxinfo_next_a( data ); @@ -832,7 +832,7 @@ void bli_zgemmsup_rv_zen_asm_1x4n // This loop is processing MR x K ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter)); ymm1 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter + 2)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_sd((double const *)(tA)); @@ -999,8 +999,8 @@ void bli_zgemmsup_rv_zen_asm_3x2 dcomplex* restrict b, inc_t rs_b0, inc_t cs_b0, dcomplex* restrict beta, dcomplex* restrict c, inc_t rs_c0, inc_t cs_c0, - auxinfo_t* restrict data, - cntx_t* restrict cntx + auxinfo_t* data, + cntx_t* cntx ) { uint64_t k_iter = 0; @@ -1046,7 +1046,7 @@ void bli_zgemmsup_rv_zen_asm_3x2 // multiplies it with the A matrix. 
// This loop is processing MR x K ymm0 = _mm256_loadu_pd((double const *)(tB + tb_inc_row * k_iter)); - + //broadcasted matrix B elements are multiplied //with matrix A columns. ymm2 = _mm256_broadcast_sd((double const *)(tA)); diff --git a/kernels/zen2/1f/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/bli_axpyf_zen_int_5.c index f8b04d52d6..8a60bce46d 100644 --- a/kernels/zen2/1f/bli_axpyf_zen_int_5.c +++ b/kernels/zen2/1f/bli_axpyf_zen_int_5.c @@ -62,7 +62,7 @@ void bli_saxpyf_zen_int_5 float* restrict a, inc_t inca, inc_t lda, float* restrict x, inc_t incx, float* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 5; @@ -336,7 +336,7 @@ void bli_daxpyf_zen_int_5 double* restrict a, inc_t inca, inc_t lda, double* restrict x, inc_t incx, double* restrict y, inc_t incy, - cntx_t* restrict cntx + cntx_t* cntx ) { const dim_t fuse_fac = 5; diff --git a/ref_kernels/1/bli_addv_ref.c b/ref_kernels/1/bli_addv_ref.c index 6724cdfd14..bb637d7e66 100644 --- a/ref_kernels/1/bli_addv_ref.c +++ b/ref_kernels/1/bli_addv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_amaxv_ref.c b/ref_kernels/1/bli_amaxv_ref.c index 169180f3b1..cdfae95689 100644 --- a/ref_kernels/1/bli_amaxv_ref.c +++ b/ref_kernels/1/bli_amaxv_ref.c @@ -46,7 +46,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ dim_t* restrict i_max, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ ctype_r* minus_one = PASTEMAC(chr,m1); \ diff --git a/ref_kernels/1/bli_axpbyv_ref.c b/ref_kernels/1/bli_axpbyv_ref.c index 2da4bc9280..fb48070a55 100644 --- a/ref_kernels/1/bli_axpbyv_ref.c +++ b/ref_kernels/1/bli_axpbyv_ref.c @@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, 
inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_axpyv_ref.c b/ref_kernels/1/bli_axpyv_ref.c index 30076ddaf9..295fcf24c4 100644 --- a/ref_kernels/1/bli_axpyv_ref.c +++ b/ref_kernels/1/bli_axpyv_ref.c @@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ @@ -135,7 +135,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_copyv_ref.c b/ref_kernels/1/bli_copyv_ref.c index 9cf005aaec..1202aa896c 100644 --- a/ref_kernels/1/bli_copyv_ref.c +++ b/ref_kernels/1/bli_copyv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_dotv_ref.c b/ref_kernels/1/bli_dotv_ref.c index f2cfae78bc..d17c71dd3b 100644 --- a/ref_kernels/1/bli_dotv_ref.c +++ b/ref_kernels/1/bli_dotv_ref.c @@ -45,7 +45,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ ctype dotxy; \ diff --git a/ref_kernels/1/bli_dotxv_ref.c b/ref_kernels/1/bli_dotxv_ref.c index e2283bcc62..caea621765 100644 --- a/ref_kernels/1/bli_dotxv_ref.c +++ b/ref_kernels/1/bli_dotxv_ref.c @@ -47,7 +47,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict y, inc_t incy, \ ctype* restrict beta, \ ctype* restrict rho, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ ctype dotxy; \ diff --git a/ref_kernels/1/bli_invertv_ref.c 
b/ref_kernels/1/bli_invertv_ref.c index 07c52d82de..914663c82a 100644 --- a/ref_kernels/1/bli_invertv_ref.c +++ b/ref_kernels/1/bli_invertv_ref.c @@ -41,7 +41,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ( \ dim_t n, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_scal2v_ref.c b/ref_kernels/1/bli_scal2v_ref.c index ba05959908..f4785c2283 100644 --- a/ref_kernels/1/bli_scal2v_ref.c +++ b/ref_kernels/1/bli_scal2v_ref.c @@ -44,7 +44,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_scalv_ref.c b/ref_kernels/1/bli_scalv_ref.c index 3e6be74928..6ca9a88a59 100644 --- a/ref_kernels/1/bli_scalv_ref.c +++ b/ref_kernels/1/bli_scalv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_setv_ref.c b/ref_kernels/1/bli_setv_ref.c index 862ff177d2..be6e76cbb7 100644 --- a/ref_kernels/1/bli_setv_ref.c +++ b/ref_kernels/1/bli_setv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict alpha, \ ctype* restrict x, inc_t incx, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_subv_ref.c b/ref_kernels/1/bli_subv_ref.c index 6b512909ff..ce1ec2079e 100644 --- a/ref_kernels/1/bli_subv_ref.c +++ b/ref_kernels/1/bli_subv_ref.c @@ -43,7 +43,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_swapv_ref.c 
b/ref_kernels/1/bli_swapv_ref.c index 6f8d54f666..73a90c87b6 100644 --- a/ref_kernels/1/bli_swapv_ref.c +++ b/ref_kernels/1/bli_swapv_ref.c @@ -42,7 +42,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ dim_t n, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1/bli_xpbyv_ref.c b/ref_kernels/1/bli_xpbyv_ref.c index 28286a5f8b..0a6844bf1d 100644 --- a/ref_kernels/1/bli_xpbyv_ref.c +++ b/ref_kernels/1/bli_xpbyv_ref.c @@ -44,7 +44,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1f/bli_axpy2v_ref.c b/ref_kernels/1f/bli_axpy2v_ref.c index 6439ff8b01..0563322ae5 100644 --- a/ref_kernels/1f/bli_axpy2v_ref.c +++ b/ref_kernels/1f/bli_axpy2v_ref.c @@ -48,7 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( n ) ) return; \ diff --git a/ref_kernels/1f/bli_axpyf_ref.c b/ref_kernels/1f/bli_axpyf_ref.c index 5799a03a68..873cee563d 100644 --- a/ref_kernels/1f/bli_axpyf_ref.c +++ b/ref_kernels/1f/bli_axpyf_ref.c @@ -48,7 +48,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict x, inc_t incx, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( m ) ) return; \ diff --git a/ref_kernels/1f/bli_dotaxpyv_ref.c b/ref_kernels/1f/bli_dotaxpyv_ref.c index 42936c6506..b83b927c93 100644 --- a/ref_kernels/1f/bli_dotaxpyv_ref.c +++ b/ref_kernels/1f/bli_dotaxpyv_ref.c @@ -49,7 +49,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict y, inc_t incy, \ ctype* restrict rho, \ ctype* restrict z, inc_t incz, \ - cntx_t* 
restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( bli_zero_dim1( m ) ) return; \ diff --git a/ref_kernels/1f/bli_dotxaxpyf_ref.c b/ref_kernels/1f/bli_dotxaxpyf_ref.c index 990133621c..249b9a6deb 100644 --- a/ref_kernels/1f/bli_dotxaxpyf_ref.c +++ b/ref_kernels/1f/bli_dotxaxpyf_ref.c @@ -53,7 +53,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ ctype* restrict z, inc_t incz, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ /* A is m x n. */ \ diff --git a/ref_kernels/1f/bli_dotxf_ref.c b/ref_kernels/1f/bli_dotxf_ref.c index 86781fd58a..2d2da1318a 100644 --- a/ref_kernels/1f/bli_dotxf_ref.c +++ b/ref_kernels/1f/bli_dotxf_ref.c @@ -49,7 +49,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict x, inc_t incx, \ ctype* restrict beta, \ ctype* restrict y, inc_t incy, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ if ( inca == 1 && incx == 1 && incy == 1 && b_n == ff ) \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c index 17ed9bef65..23a635b32a 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c @@ -122,7 +122,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const num_t dt_r = PASTEMAC(chr,type); \ diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c index bbfa1e3cc3..b9b4e97397 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c @@ -66,7 +66,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index 
06b83debaf..b4fbf79669 100644 --- a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -87,7 +87,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const dim_t dfac = PASTECH2(bb0, _, chr); \ diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c index c385fca1ac..945b3a2470 100644 --- a/ref_kernels/1m/bli_packm_cxk_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_ref.c @@ -63,7 +63,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict a, inc_t inca, inc_t lda, \ ctype* restrict p, inc_t ldp, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const dim_t mnr = PASTECH2(mnr0, _, ch); \ diff --git a/ref_kernels/1m/bli_unpackm_cxk_ref.c b/ref_kernels/1m/bli_unpackm_cxk_ref.c index 73d98e2681..172e93bdf8 100644 --- a/ref_kernels/1m/bli_unpackm_cxk_ref.c +++ b/ref_kernels/1m/bli_unpackm_cxk_ref.c @@ -61,7 +61,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict kappa, \ ctype* restrict p, inc_t ldp, \ ctype* restrict a, inc_t inca, inc_t lda, \ - cntx_t* restrict cntx \ + cntx_t* cntx \ ) \ { \ const dim_t mnr = PASTECH2(mnr0, _, ch); \ diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index 968ca39979..6cdfcd6128 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -50,8 +50,8 @@ static void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -153,8 +153,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) 
\ { \ \ diff --git a/ref_kernels/3/bli_gemmsup_ref.c b/ref_kernels/3/bli_gemmsup_ref.c index 0c3773c1c0..7f73718006 100644 --- a/ref_kernels/3/bli_gemmsup_ref.c +++ b/ref_kernels/3/bli_gemmsup_ref.c @@ -53,8 +53,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large @@ -258,8 +258,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ /* NOTE: This microkernel can actually handle arbitrarily large @@ -478,8 +478,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const dim_t mn = m * n; \ @@ -602,8 +602,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const dim_t mn = m * n; \ @@ -725,8 +725,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, inc_t rs_b, inc_t cs_b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const dim_t mn = m * n; \ diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 7504307179..f1c8247a97 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -51,8 +51,8 @@ void 
PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c index 504849e4ef..dd742ee059 100644 --- a/ref_kernels/3/bli_trsm_ref.c +++ b/ref_kernels/3/bli_trsm_ref.c @@ -45,8 +45,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -132,8 +132,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index 69c546cd4b..e094db54ba 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -306,7 +306,7 @@ void GENBARNAME(cntx_init) // -- Set level-3 virtual micro-kernels ------------------------------------ - funcs = bli_cntx_ukrs_buf( cntx ); + funcs = cntx->ukrs; // NOTE: We set the virtual micro-kernel slots to contain the addresses // of the native micro-kernels. 
In general, the ukernels in the virtual @@ -322,7 +322,7 @@ void GENBARNAME(cntx_init) // -- Set level-3 native micro-kernels and preferences --------------------- - mbools = bli_cntx_ukr_prefs_buf( cntx ); + mbools = cntx->ukr_prefs; gen_func_init( &funcs[ BLIS_GEMM_UKR ], gemm_ukr_name ); gen_func_init( &funcs[ BLIS_GEMMTRSM_L_UKR ], gemmtrsm_l_ukr_name ); @@ -416,7 +416,7 @@ void GENBARNAME(cntx_init) // -- Set level-3 small/unpacked handlers ---------------------------------- - vfuncs = bli_cntx_l3_sup_handlers_buf( cntx ); + vfuncs = cntx->l3_sup_handlers; // Initialize all of the function pointers to NULL; for ( i = 0; i < BLIS_NUM_LEVEL3_OPS; ++i ) vfuncs[ i ] = NULL; @@ -452,7 +452,7 @@ void GENBAINAME(cntx_init) // -- Set induced method level-3 virtual micro-kernels --------------------- - funcs = bli_cntx_ukrs_buf( cntx ); + funcs = cntx->ukrs; if ( method == BLIS_1M ) { @@ -483,8 +483,8 @@ void GENBAINAME(cntx_init) // beta has a zero imaginary component and C is either row- or column-stored). 
if ( method == BLIS_1M ) { - func_t* gemm_nat_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx ); - func_t* gemm_vir_ukrs = bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx ); + func_t* gemm_nat_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx ); + func_t* gemm_vir_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx ); bli_func_copy_dt( BLIS_FLOAT, gemm_nat_ukrs, BLIS_FLOAT, gemm_vir_ukrs ); bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs ); diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c index 2f08083892..317cf26048 100644 --- a/ref_kernels/ind/bli_gemm1m_ref.c +++ b/ref_kernels/ind/bli_gemm1m_ref.c @@ -47,8 +47,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict b, \ ctype* restrict beta, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 58e08ec927..f228540b89 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -48,8 +48,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict bx1, \ ctype* restrict b11, \ ctype* restrict c11, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c index 175bc9e14a..ccbf148a08 100644 --- a/ref_kernels/ind/bli_trsm1m_ref.c +++ b/ref_kernels/ind/bli_trsm1m_ref.c @@ -43,8 +43,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -261,8 +261,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* 
restrict a, \ ctype* restrict b, \ ctype* restrict c, inc_t rs_c, inc_t cs_c, \ - auxinfo_t* restrict data, \ - cntx_t* restrict cntx \ + auxinfo_t* data, \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ diff --git a/testsuite/src/test_axpy2v.c b/testsuite/src/test_axpy2v.c index eeebf15e73..3019d472b2 100644 --- a/testsuite/src/test_axpy2v.c +++ b/testsuite/src/test_axpy2v.c @@ -176,7 +176,7 @@ void libblis_test_axpy2v_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -220,7 +220,7 @@ void libblis_test_axpy2v_experiment bli_obj_set_conj( conjx, &x ); bli_obj_set_conj( conjy, &y ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copyv( &z_save, &z ); diff --git a/testsuite/src/test_axpyf.c b/testsuite/src/test_axpyf.c index 7a85b22123..42ab73018c 100644 --- a/testsuite/src/test_axpyf.c +++ b/testsuite/src/test_axpyf.c @@ -174,7 +174,7 @@ void libblis_test_axpyf_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -223,7 +223,7 @@ void libblis_test_axpyf_experiment bli_obj_set_conj( conja, &a ); bli_obj_set_conj( conjx, &x ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copyv( &y_save, &y ); diff --git a/testsuite/src/test_dotaxpyv.c b/testsuite/src/test_dotaxpyv.c index 391c119bbd..8e09e3ee17 100644 --- a/testsuite/src/test_dotaxpyv.c +++ b/testsuite/src/test_dotaxpyv.c @@ -179,7 +179,7 @@ void libblis_test_dotaxpyv_experiment // Query a context. 
- cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -222,7 +222,7 @@ void libblis_test_dotaxpyv_experiment bli_obj_alias_to( &x, &xt ); // Determine whether to make a copy of x with or without conjugation. - // + // // conjx conjy ~conjx^conjy y is initialized as // n n c y = conj(x) // n c n y = x @@ -239,7 +239,7 @@ void libblis_test_dotaxpyv_experiment bli_obj_set_conj( conjx, &x ); bli_obj_set_conj( conjy, &y ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copysc( &BLIS_MINUS_ONE, &rho ); diff --git a/testsuite/src/test_dotxaxpyf.c b/testsuite/src/test_dotxaxpyf.c index a2c3ef3e94..ec519de51e 100644 --- a/testsuite/src/test_dotxaxpyf.c +++ b/testsuite/src/test_dotxaxpyf.c @@ -184,7 +184,7 @@ void libblis_test_dotxaxpyf_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -251,7 +251,7 @@ void libblis_test_dotxaxpyf_experiment bli_obj_set_conj( conjw, &w ); bli_obj_set_conj( conjx, &x ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copyv( &y_save, &y ); diff --git a/testsuite/src/test_dotxf.c b/testsuite/src/test_dotxf.c index 8a1eca4eba..83f4b44ebe 100644 --- a/testsuite/src/test_dotxf.c +++ b/testsuite/src/test_dotxf.c @@ -176,7 +176,7 @@ void libblis_test_dotxf_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. 
bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); @@ -228,7 +228,7 @@ void libblis_test_dotxf_experiment bli_obj_set_conj( conjat, &a ); bli_obj_set_conj( conjx, &x ); - // Repeat the experiment n_repeats times and record results. + // Repeat the experiment n_repeats times and record results. for ( i = 0; i < n_repeats; ++i ) { bli_copyv( &y_save, &y ); diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index d37005b285..69ee4339da 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -181,7 +181,7 @@ void libblis_test_gemm_ukr_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 48fcb78db7..44ba51587c 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -207,7 +207,7 @@ void libblis_test_gemmtrsm_ukr_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index edab9796d2..f267ae1585 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -977,7 +977,7 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); // Query a native context. - cntx = bli_gks_query_nat_cntx(); + cntx = ( cntx_t* )bli_gks_query_nat_cntx(); libblis_test_fprintf_c( os, "level-3 blocksizes s d c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d %7d %7d\n", @@ -1081,8 +1081,8 @@ void libblis_test_output_params_struct( FILE* os, test_params_t* params ) libblis_test_fprintf_c( os, "\n" ); // Query a native context. 
- cntx_c = bli_gks_query_ind_cntx( im, BLIS_SCOMPLEX ); - cntx_z = bli_gks_query_ind_cntx( im, BLIS_DCOMPLEX ); + cntx_c = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_SCOMPLEX ); + cntx_z = ( cntx_t* )bli_gks_query_ind_cntx( im, BLIS_DCOMPLEX ); libblis_test_fprintf_c( os, "level-3 blocksizes c z \n" ); libblis_test_fprintf_c( os, " mc %7d %7d\n", @@ -2178,7 +2178,7 @@ void libblis_test_op_driver // Query the implementation string associated with the // current operation and datatype. If the operation is // not level-3, we will always get back the native string. - ind_str = bli_ind_oper_get_avail_impl_string( op->opid, datatype ); + ind_str = ( char* )bli_ind_oper_get_avail_impl_string( op->opid, datatype ); // Loop over the requested parameter combinations. for ( pci = 0; pci < n_param_combos; ++pci ) @@ -3051,7 +3051,7 @@ void libblis_test_parse_command_line( int argc, char** argv ) bli_getopt_init_state( 0, &state ); // Process all option arguments until we get a -1, which means we're done. - while( (opt = bli_getopt( argc, argv, "g:o:", &state )) != -1 ) + while( (opt = bli_getopt( argc, ( const char** )argv, "g:o:", &state )) != -1 ) { // Explicitly typecast opt, which is an int, to a char. (Failing to // typecast resulted in at least one user-reported problem whereby diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 9568dfee73..5f4988e1c7 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -186,7 +186,7 @@ void libblis_test_trsm_ukr_experiment // Query a context. - cntx = bli_gks_query_cntx(); + cntx = ( cntx_t* )bli_gks_query_cntx(); // Use the datatype of the first char in the datatype combination string. bli_param_map_char_to_blis_dt( dc_str[0], &datatype ); From 97c5beb601d62b1a853eb691baf4c19053be6fff Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 15 Mar 2022 10:29:16 -0500 Subject: [PATCH 13/32] Fix TRMM/TRSM bug. 
--- frame/3/trmm/bli_trmm_ll_ker_var2.c | 4 ++-- frame/3/trmm/bli_trmm_lu_ker_var2.c | 4 ++-- frame/3/trsm/bli_trsm_ll_ker_var2.c | 10 +++++----- frame/3/trsm/bli_trsm_lu_ker_var2.c | 10 +++++----- frame/thread/bli_thrinfo_sup.c | 1 - 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index 9ea0db853e..f5476b2cad 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -71,7 +71,7 @@ void bli_trmm_ll_ker_var2 { const num_t dt_exec = bli_obj_exec_dt( c ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + const doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); @@ -109,7 +109,7 @@ void bli_trmm_ll_ker_var2 // function pointer. ftypes[dt_exec] ( - diagoffb, + diagoffa, schema_a, schema_b, m, diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index ba91a58e90..df5b2dac55 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -71,7 +71,7 @@ void bli_trmm_lu_ker_var2 { const num_t dt_exec = bli_obj_exec_dt( c ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + const doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); @@ -109,7 +109,7 @@ void bli_trmm_lu_ker_var2 // function pointer. 
ftypes[dt_exec] ( - diagoffb, + diagoffa, schema_a, schema_b, m, diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 71abcca123..075b403362 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -71,7 +71,7 @@ void bli_trsm_ll_ker_var2 { const num_t dt_exec = bli_obj_exec_dt( c ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + const doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); @@ -95,12 +95,12 @@ void bli_trsm_ll_ker_var2 const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar - // attached to A (the non-triangular matrix). This will be the alpha + // attached to B (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would - // be applied to the packed copy of A prior to it being updated by + // be applied to the packed copy of B prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. - const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( b ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -114,7 +114,7 @@ void bli_trsm_ll_ker_var2 // function pointer. 
ftypes[dt_exec] ( - diagoffb, + diagoffa, schema_a, schema_b, m, diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 46ee8f4399..799fdd1013 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -71,7 +71,7 @@ void bli_trsm_lu_ker_var2 { const num_t dt_exec = bli_obj_exec_dt( c ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + const doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); @@ -95,12 +95,12 @@ void bli_trsm_lu_ker_var2 const inc_t cs_c = bli_obj_col_stride( c ); // Grab the address of the internal scalar buffer for the scalar - // attached to A (the non-triangular matrix). This will be the alpha + // attached to B (the non-triangular matrix). This will be the alpha // scalar used in the gemmtrsm subproblems (ie: the scalar that would - // be applied to the packed copy of A prior to it being updated by + // be applied to the packed copy of B prior to it being updated by // the trsm subproblem). This scalar may be unit, if for example it // was applied during packing. - const void* buf_alpha1 = bli_obj_internal_scalar_buffer( a ); + const void* buf_alpha1 = bli_obj_internal_scalar_buffer( b ); // Grab the address of the internal scalar buffer for the scalar // attached to C. This will be the "beta" scalar used in the gemm-only @@ -114,7 +114,7 @@ void bli_trsm_lu_ker_var2 // function pointer. 
ftypes[dt_exec] ( - diagoffb, + diagoffa, schema_a, schema_b, m, diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c index 8800bc01f0..03ae687b14 100644 --- a/frame/thread/bli_thrinfo_sup.c +++ b/frame/thread/bli_thrinfo_sup.c @@ -33,7 +33,6 @@ */ -#include "bli_thrcomm_openmp.h" #include "blis.h" void bli_thrinfo_sup_grow From 1f119e43534694dbda8c3f721ef48e2ed269d9c2 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 15 Mar 2022 12:52:01 -0500 Subject: [PATCH 14/32] Fix performance bug. Due to missing `break`s in a switch statement (warn me, dammit!), the virtual gemm ukernels were not getting set to the optimized versions. --- frame/base/bli_cntx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index bfc82d1f9e..5a886a128f 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -257,12 +257,12 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) // ukernel id provided by the caller. switch ( ukr_id ) { - case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; - case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; - case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; - case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; - case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; - default: ukrs = NULL; + case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; break; + case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; break; + case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; break; + case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; break; + case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; break; + default: ukrs = NULL; break; }; if ( ukrs ) From 97d124a09605a73ac76cc3de3a67c1a4bacc0f28 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 15 Mar 2022 12:52:01 -0500 Subject: [PATCH 15/32] Fix performance bug. 
Due to missing `break`s in a switch statement (warn me, dammit!), the virtual gemm ukernels were not getting set to the optimized versions. [ci skip] --- frame/base/bli_cntx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 5ce04b5025..218325d5a0 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -257,12 +257,12 @@ void bli_cntx_set_ukrs( cntx_t* cntx , ... ) // ukernel id provided by the caller. switch ( ukr_id ) { - case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; - case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; - case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; - case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; - case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; - default: ukrs = NULL; + case BLIS_GEMM_UKR: ukrs = &cntx_ukrs[ BLIS_GEMM_VIR_UKR ]; break; + case BLIS_GEMMTRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_L_VIR_UKR ]; break; + case BLIS_GEMMTRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_GEMMTRSM_U_VIR_UKR ]; break; + case BLIS_TRSM_L_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_L_VIR_UKR ]; break; + case BLIS_TRSM_U_UKR: ukrs = &cntx_ukrs[ BLIS_TRSM_U_VIR_UKR ]; break; + default: ukrs = NULL; break; }; if ( ukrs ) From 1200d8ca0e42ff8ef9aabbd832f8570e2ded05ec Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Tue, 15 Mar 2022 15:10:19 -0500 Subject: [PATCH 16/32] Fix trsm bug. Beta (as the scalar attached to C) was not seen as reset to 1 after the first iteration of the pc loop, as the wrong pointer was passed to bli_gemm_int. 
--- frame/3/trsm/bli_trsm_blk_var3.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 4f7bcb9ff9..2ff3db6f1d 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -51,10 +51,10 @@ void bli_trsm_blk_var3 bli_obj_alias_to( c, &cs ); // Determine the direction in which to partition (forwards or backwards). - dir_t direct = bli_l3_direct( &ap, &bp, c, cntl ); + dir_t direct = bli_l3_direct( &ap, &bp, &cs, cntl ); // Prune any zero region that exists along the partitioning dimension. - bli_l3_prune_unref_mparts_k( &ap, &bp, c, cntl ); + bli_l3_prune_unref_mparts_k( &ap, &bp, &cs, cntl ); // Query dimension in partitioning direction. dim_t k_trans = bli_obj_width_after_trans( &ap ); @@ -81,7 +81,7 @@ void bli_trsm_blk_var3 &a1, &b1, &BLIS_ONE, - c, + &cs, cntx, rntm, bli_cntl_sub_node( cntl ), @@ -96,7 +96,8 @@ void bli_trsm_blk_var3 // that they are only used in the first iteration. if ( i == 0 ) { - bli_obj_scalar_reset( &ap ); bli_obj_scalar_reset( &bp ); + bli_obj_scalar_reset( &ap ); + bli_obj_scalar_reset( &bp ); bli_obj_scalar_reset( &cs ); } } From 99e4c8081f1398cc879d840a3cb7abbba4508690 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 5 Apr 2022 16:56:36 -0500 Subject: [PATCH 17/32] Attempt to resolve some conflicts before merge. 
--- config/zen/bli_cntx_init_zen.c | 28 +- config/zen2/bli_cntx_init_zen2.c | 18 +- config/zen3/bli_cntx_init_zen3.c | 20 +- kernels/zen/1/bli_scalv_zen_int10.c | 59 +- kernels/zen/1f/bli_axpyf_zen_int_5.c | 1236 +++++++++++++++++++++++++ kernels/zen2/1f/bli_axpyf_zen_int_5.c | 599 ------------ 6 files changed, 1299 insertions(+), 661 deletions(-) create mode 100644 kernels/zen/1f/bli_axpyf_zen_int_5.c delete mode 100644 kernels/zen2/1f/bli_axpyf_zen_int_5.c diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index 2b80c37838..354deecb1e 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -122,31 +122,24 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_8, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_8, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, BLIS_AMAXV_KER, BLIS_DOUBLE, bli_damaxv_zen_int, // axpyv -#if 0 - BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int, - BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int, -#else BLIS_AXPYV_KER, BLIS_FLOAT, bli_saxpyv_zen_int10, BLIS_AXPYV_KER, BLIS_DOUBLE, bli_daxpyv_zen_int10, -#endif -#if 0 // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, -#endif // dotv BLIS_DOTV_KER, BLIS_FLOAT, bli_sdotv_zen_int, @@ -157,23 +150,16 @@ void bli_cntx_init_zen( cntx_t* cntx ) BLIS_DOTXV_KER, BLIS_DOUBLE, bli_ddotxv_zen_int, // scalv -#if 0 - BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int, - BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int, -#else BLIS_SCALV_KER, BLIS_FLOAT, 
bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, -#endif -#if 0 // setv - BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, // swapv BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, -#endif BLIS_VA_END ); @@ -282,7 +268,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) // Initialize level-3 sup blocksize objects with architecture-specific // values. - // s d c z + // s d c z bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, 9, 9, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); diff --git a/config/zen2/bli_cntx_init_zen2.c b/config/zen2/bli_cntx_init_zen2.c index ef16fef721..857f169eb2 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -120,12 +120,12 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_PACKM_NRXK_KER, BLIS_DCOMPLEX, bli_zpackm_haswell_asm_4xk, // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -148,16 +148,16 @@ void bli_cntx_init_zen2( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, //copy BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, //set - BLIS_SETV_KER, 
BLIS_FLOAT, bli_ssetv_zen_int, - BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, + BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, + BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, BLIS_VA_END ); @@ -234,7 +234,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) // Initialize level-3 sup blocksize objects with architecture-specific // values. - // s d c z + // s d c z bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, -1, -1, 9, 9, -1, -1 ); bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, -1, -1 ); diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index db38037ddb..f7aa93717b 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -131,12 +131,12 @@ void bli_cntx_init_zen3( cntx_t* cntx ) #endif // axpyf - BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, - BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_FLOAT, bli_saxpyf_zen_int_5, + BLIS_AXPYF_KER, BLIS_DOUBLE, bli_daxpyf_zen_int_5, // dotxf - BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, - BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_FLOAT, bli_sdotxf_zen_int_8, + BLIS_DOTXF_KER, BLIS_DOUBLE, bli_ddotxf_zen_int_8, // amaxv BLIS_AMAXV_KER, BLIS_FLOAT, bli_samaxv_zen_int, @@ -158,15 +158,15 @@ void bli_cntx_init_zen3( cntx_t* cntx ) BLIS_SCALV_KER, BLIS_FLOAT, bli_sscalv_zen_int10, BLIS_SCALV_KER, BLIS_DOUBLE, bli_dscalv_zen_int10, - //swap - BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, - BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, + // swapv + BLIS_SWAPV_KER, BLIS_FLOAT, bli_sswapv_zen_int8, + BLIS_SWAPV_KER, BLIS_DOUBLE, bli_dswapv_zen_int8, - //copy + // copyv BLIS_COPYV_KER, BLIS_FLOAT, bli_scopyv_zen_int, BLIS_COPYV_KER, BLIS_DOUBLE, bli_dcopyv_zen_int, - //set + // setv BLIS_SETV_KER, BLIS_FLOAT, bli_ssetv_zen_int, BLIS_SETV_KER, BLIS_DOUBLE, bli_dsetv_zen_int, @@ -251,7 +251,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) // Initialize level-3 sup blocksize objects with architecture-specific 
// values. - // s d c z + // s d c z bli_blksz_init ( &blkszs[ BLIS_MR_SUP ], 6, 6, 3, 3, 9, 9, 3, 3 ); bli_blksz_init_easy( &blkszs[ BLIS_NR_SUP ], 16, 8, 8, 4 ); diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index d536ed7c02..fc677d9b88 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -81,16 +81,9 @@ void bli_sscalv_zen_int10 if ( PASTEMAC(s,eq0)( *alpha ) ) { float* zero = bli_s0; -#ifdef BLIS_CONFIG_ZEN2 - bli_ssetv_zen_int - ( - BLIS_NO_CONJUGATE, - n, - zero, - x, incx, - cntx - ); -#else + + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f ( @@ -100,7 +93,7 @@ void bli_sscalv_zen_int10 x, incx, cntx ); -#endif + return; } @@ -281,16 +274,9 @@ void bli_dscalv_zen_int10 if ( PASTEMAC(d,eq0)( *alpha ) ) { double* zero = bli_d0; -#ifdef BLIS_CONFIG_ZEN2 - bli_dsetv_zen_int - ( - BLIS_NO_CONJUGATE, - n, - zero, - x, incx, - cntx - ); -#else + + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f @@ -301,7 +287,7 @@ void bli_dscalv_zen_int10 x, incx, cntx ); -#endif + return; } @@ -454,3 +440,32 @@ void bli_dscalv_zen_int10 } } +// ----------------------------------------------------------------------------- + +// +// NOTE: This function definition is provided as a placeholder in order to allow +// function names of scalv kernels to be hard-coded in bli_gemv_unf_var2_amd.c. 
+// + +void bli_cscalv_zen_int10 + ( + conj_t conjalpha, + dim_t n, + scomplex* restrict alpha, + scomplex* restrict x, inc_t incx, + cntx_t* restrict cntx + ) +{ + const num_t dt = BLIS_SCOMPLEX; + + cscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); + + f + ( + conjalpha, + n, + alpha, + x, incx, + cntx + ); +} diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c new file mode 100644 index 0000000000..e52a8bb7f5 --- /dev/null +++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c @@ -0,0 +1,1236 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2020 - 2022, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "immintrin.h" +#include "blis.h" + +/* Union data structure to access AVX registers + One 256-bit AVX register holds 8 SP elements. */ +typedef union +{ + __m256 v; + float f[8] __attribute__((aligned(64))); +} v8sf_t; + +/* Union data structure to access AVX registers +* One 256-bit AVX register holds 4 DP elements. */ +typedef union +{ + __m256d v; + __m128d xmm[2]; + double d[4] __attribute__((aligned(64))); +} v4df_t; + +typedef union +{ + __m128d v; + double d[2] __attribute__((aligned(64))); +} v2df_t; + + +void bli_saxpyf_zen_int_5 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + float* restrict alpha, + float* restrict a, inc_t inca, inc_t lda, + float* restrict x, inc_t incx, + float* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 5; + + const dim_t n_elem_per_reg = 8; + const dim_t n_iter_unroll = 2; + + dim_t i; + + float* restrict a0; + float* restrict a1; + float* restrict a2; + float* restrict a3; + float* restrict a4; + + float* restrict y0; + + v8sf_t chi0v, chi1v, chi2v, chi3v; + v8sf_t chi4v; + + v8sf_t a00v, a01v, a02v, a03v; + v8sf_t a04v; + + v8sf_t a10v, a11v, a12v, a13v; + v8sf_t a14v; + + v8sf_t y0v, y1v; + + float chi0, chi1, chi2, chi3; + float chi4; + + // If either dimension is zero, or if alpha is zero, return early. 
+ if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. + if ( b_n != fuse_fac ) + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + float* a1 = a + (0 )*inca + (i )*lda; + float* chi1 = x + (i )*incx; + float* y1 = y + (0 )*incy; + float alpha_chi1; + + bli_scopycjs( conjx, *chi1, alpha_chi1 ); + bli_sscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + a4 = a + 4*lda; + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + chi4 = *( x + 4*incx ); + + + // Scale each chi scalar by alpha. + bli_sscals( *alpha, chi0 ); + bli_sscals( *alpha, chi1 ); + bli_sscals( *alpha, chi2 ); + bli_sscals( *alpha, chi3 ); + bli_sscals( *alpha, chi4 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_ss( &chi0 ); + chi1v.v = _mm256_broadcast_ss( &chi1 ); + chi2v.v = _mm256_broadcast_ss( &chi2 ); + chi3v.v = _mm256_broadcast_ss( &chi3 ); + chi4v.v = _mm256_broadcast_ss( &chi4 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 15) < m; i += 16 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg ); + + a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); + a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a13v.v, chi3v.v, y1v.v ); + + y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); + y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v ); + + + // Store the output. + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + a4 += n_iter_unroll * n_elem_per_reg; + } + + for( ; (i + 7) < m; i += 8 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); + a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); + a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); + a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); + a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); + + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); + y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); + + // Store the output. + _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + a4 += n_elem_per_reg; + } + + // If there are leftover iterations, perform them with scalar code. + for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const float a0c = *a0; + const float a1c = *a1; + const float a2c = *a2; + const float a3c = *a3; + const float a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + a4 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const float a0c = *a0; + const float a1c = *a1; + const float a2c = *a2; + const float a3c = *a3; + const float a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + a4 += inca; + y0 += incy; + } + + } +} + + +// ----------------------------------------------------------------------------- + +void bli_daxpyf_zen_int_5 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + 
double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 5; + + const dim_t n_elem_per_reg = 4; + const dim_t n_iter_unroll = 2; + + dim_t i; + + double* restrict a0; + double* restrict a1; + double* restrict a2; + double* restrict a3; + double* restrict a4; + + double* restrict y0; + + v4df_t chi0v, chi1v, chi2v, chi3v; + v4df_t chi4v; + + v4df_t a00v, a01v, a02v, a03v; + v4df_t a04v; + + v4df_t a10v, a11v, a12v, a13v; + v4df_t a14v; + + v4df_t y0v, y1v; + + double chi0, chi1, chi2, chi3; + double chi4; + + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. + if ( b_n != fuse_fac ) + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + bli_dcopycjs( conjx, *chi1, alpha_chi1 ); + bli_dscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + a4 = a + 4*lda; + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + chi4 = *( x + 4*incx ); + + + // Scale each chi scalar by alpha. + bli_dscals( *alpha, chi0 ); + bli_dscals( *alpha, chi1 ); + bli_dscals( *alpha, chi2 ); + bli_dscals( *alpha, chi3 ); + bli_dscals( *alpha, chi4 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. 
+ chi0v.v = _mm256_broadcast_sd( &chi0 ); + chi1v.v = _mm256_broadcast_sd( &chi1 ); + chi2v.v = _mm256_broadcast_sd( &chi2 ); + chi3v.v = _mm256_broadcast_sd( &chi3 ); + chi4v.v = _mm256_broadcast_sd( &chi4 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 7) < m; i += 8 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + + a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); + a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v ); + + + // Store the output. 
+ _mm256_storeu_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + a4 += n_iter_unroll * n_elem_per_reg; + } + + for( ; (i + 3) < m; i += 4 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); + + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); + + // Store the output. + _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + a4 += n_elem_per_reg; + } + + // If there are leftover iterations, perform them with scalar code. 
+ for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + const double a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + a4 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + const double a4c = *a4; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + y0c += chi4 * a4c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + a4 += inca; + y0 += incy; + } + + } +} + +// ----------------------------------------------------------------------------- + +void bli_daxpyf_zen_int_16x2 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 2; + + const dim_t n_elem_per_reg = 4; + const dim_t n_iter_unroll = 4; + + dim_t i; + + double* restrict a0; + double* restrict a1; + + double* restrict y0; + + v4df_t chi0v, chi1v; + + v4df_t a00v, a01v; + + v4df_t a10v, a11v; + + v4df_t a20v, a21v; + + v4df_t a30v, a31v; + + v4df_t y0v, y1v, y2v, y3v; + + double chi0, chi1; + + v2df_t a40v, a41v; + + v2df_t y4v; + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. 
+ if ( b_n != fuse_fac ) + { + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + bli_dcopycjs( conjx, *chi1, alpha_chi1 ); + bli_dscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + + + // Scale each chi scalar by alpha. + bli_dscals( *alpha, chi0 ); + bli_dscals( *alpha, chi1 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_sd( &chi0 ); + chi1v.v = _mm256_broadcast_sd( &chi1 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 15) < m; i += 16 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); + a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); + a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a30v.v, chi0v.v, y3v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); + _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + } + + for ( ; (i + 11) < m; i += 12 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); + + y0 += 3 * n_elem_per_reg; + a0 += 3 * n_elem_per_reg; + a1 += 3 * n_elem_per_reg; + } + for ( ; (i + 7) < m; i += 8 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + + // Store the output. 
+ _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += 2 * n_elem_per_reg; + a0 += 2 * n_elem_per_reg; + a1 += 2 * n_elem_per_reg; + } + + for ( ; (i + 3) < m; i += 4 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + } + + for ( ; (i + 1) < m; i += 2 ) + { + // Load the input values. + y4v.v = _mm_loadu_pd( y0 + 0*n_elem_per_reg ); + + a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg ); + + a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg ); + + // perform : y += alpha * x; + y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v ); + + y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v ); + + // Store the output. + _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v ); + + y0 += 2; + a0 += 2; + a1 += 2; + } + + // If there are leftover iterations, perform them with scalar code. 
+ for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + y0 += incy; + } + + } +} + +// ----------------------------------------------------------------------------- + +void bli_daxpyf_zen_int_16x4 + ( + conj_t conja, + conj_t conjx, + dim_t m, + dim_t b_n, + double* restrict alpha, + double* restrict a, inc_t inca, inc_t lda, + double* restrict x, inc_t incx, + double* restrict y, inc_t incy, + cntx_t* restrict cntx + ) +{ + const dim_t fuse_fac = 4; + + const dim_t n_elem_per_reg = 4; + const dim_t n_iter_unroll = 4; + + dim_t i; + + double* restrict a0; + double* restrict a1; + double* restrict a2; + double* restrict a3; + + double* restrict y0; + + v4df_t chi0v, chi1v, chi2v, chi3v; + + v4df_t a00v, a01v, a02v, a03v; + + v4df_t a10v, a11v, a12v, a13v; + + v4df_t a20v, a21v, a22v, a23v; + + v4df_t a30v, a31v, a32v, a33v; + + v4df_t y0v, y1v, y2v, y3v; + + double chi0, chi1, chi2, chi3; + + v2df_t y4v; + + v2df_t a40v, a41v, a42v, a43v; + + // If either dimension is zero, or if alpha is zero, return early. + if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; + + // If b_n is not equal to the fusing factor, then perform the entire + // operation as a loop over axpyv. 
+ if ( b_n != fuse_fac ) + { + if ( cntx == NULL ) cntx = bli_gks_query_cntx(); + + daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); + + for ( i = 0; i < b_n; ++i ) + { + double* a1 = a + (0 )*inca + (i )*lda; + double* chi1 = x + (i )*incx; + double* y1 = y + (0 )*incy; + double alpha_chi1; + + bli_dcopycjs( conjx, *chi1, alpha_chi1 ); + bli_dscals( *alpha, alpha_chi1 ); + + f + ( + conja, + m, + &alpha_chi1, + a1, inca, + y1, incy, + cntx + ); + } + + return; + } + + // At this point, we know that b_n is exactly equal to the fusing factor. + + a0 = a + 0*lda; + a1 = a + 1*lda; + a2 = a + 2*lda; + a3 = a + 3*lda; + + y0 = y; + + chi0 = *( x + 0*incx ); + chi1 = *( x + 1*incx ); + chi2 = *( x + 2*incx ); + chi3 = *( x + 3*incx ); + + // Scale each chi scalar by alpha. + bli_dscals( *alpha, chi0 ); + bli_dscals( *alpha, chi1 ); + bli_dscals( *alpha, chi2 ); + bli_dscals( *alpha, chi3 ); + + // Broadcast the (alpha*chi?) scalars to all elements of vector registers. + chi0v.v = _mm256_broadcast_sd( &chi0 ); + chi1v.v = _mm256_broadcast_sd( &chi1 ); + chi2v.v = _mm256_broadcast_sd( &chi2 ); + chi3v.v = _mm256_broadcast_sd( &chi3 ); + + // If there are vectorized iterations, perform them with vector + // instructions. + if ( inca == 1 && incy == 1 ) + { + for ( i = 0; (i + 15) < m; i += 16 ) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + y3v.v = _mm256_loadu_pd( y0 + 3*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); + a30v.v = _mm256_loadu_pd( a0 + 3*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); + a31v.v = _mm256_loadu_pd( a1 + 3*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg ); + a32v.v = _mm256_loadu_pd( a2 + 3*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg ); + a33v.v = _mm256_loadu_pd( a3 + 3*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a30v.v, chi0v.v, y3v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a31v.v, chi1v.v, y3v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a22v.v, chi2v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a32v.v, chi2v.v, y3v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, y2v.v ); + y3v.v = _mm256_fmadd_pd( a33v.v, chi3v.v, y3v.v ); + + // Store the output. 
+ _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); + _mm256_storeu_pd( (double *)(y0 + 3*n_elem_per_reg), y3v.v ); + + y0 += n_iter_unroll * n_elem_per_reg; + a0 += n_iter_unroll * n_elem_per_reg; + a1 += n_iter_unroll * n_elem_per_reg; + a2 += n_iter_unroll * n_elem_per_reg; + a3 += n_iter_unroll * n_elem_per_reg; + } + + for ( ; (i + 11) < m; i += 12 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + y2v.v = _mm256_loadu_pd( y0 + 2*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + a20v.v = _mm256_loadu_pd( a0 + 2*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + a21v.v = _mm256_loadu_pd( a1 + 2*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + a22v.v = _mm256_loadu_pd( a2 + 2*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + a23v.v = _mm256_loadu_pd( a3 + 2*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a20v.v, chi0v.v, y2v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a21v.v, chi1v.v, y2v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a22v.v, chi2v.v, y2v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + y2v.v = _mm256_fmadd_pd( a23v.v, chi3v.v, 
y2v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + _mm256_storeu_pd( (double *)(y0 + 2*n_elem_per_reg), y2v.v ); + + y0 += 3 * n_elem_per_reg; + a0 += 3 * n_elem_per_reg; + a1 += 3 * n_elem_per_reg; + a2 += 3 * n_elem_per_reg; + a3 += 3 * n_elem_per_reg; + } + + for ( ; (i + 7) < m; i += 8 ) + { + // Load the input values. + y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + _mm256_storeu_pd( (double *)(y0 + 1*n_elem_per_reg), y1v.v ); + + y0 += 2 * n_elem_per_reg; + a0 += 2 * n_elem_per_reg; + a1 += 2 * n_elem_per_reg; + a2 += 2 * n_elem_per_reg; + a3 += 2 * n_elem_per_reg; + } + + + for ( ; (i + 3) < m; i += 4) + { + // Load the input values. 
+ y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); + + a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); + + a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); + + a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); + + a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); + + // perform : y += alpha * x; + y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); + + y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); + + y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); + + y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); + + // Store the output. + _mm256_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y0v.v ); + + y0 += n_elem_per_reg; + a0 += n_elem_per_reg; + a1 += n_elem_per_reg; + a2 += n_elem_per_reg; + a3 += n_elem_per_reg; + } +#if 1 + for ( ; (i + 1) < m; i += 2) + { + + // Load the input values. + y4v.v = _mm_loadu_pd( y0 + 0*n_elem_per_reg ); + + a40v.v = _mm_loadu_pd( a0 + 0*n_elem_per_reg ); + + a41v.v = _mm_loadu_pd( a1 + 0*n_elem_per_reg ); + + a42v.v = _mm_loadu_pd( a2 + 0*n_elem_per_reg ); + + a43v.v = _mm_loadu_pd( a3 + 0*n_elem_per_reg ); + + // perform : y += alpha * x; + y4v.v = _mm_fmadd_pd( a40v.v, chi0v.xmm[0], y4v.v ); + + y4v.v = _mm_fmadd_pd( a41v.v, chi1v.xmm[0], y4v.v ); + + y4v.v = _mm_fmadd_pd( a42v.v, chi2v.xmm[0], y4v.v ); + + y4v.v = _mm_fmadd_pd( a43v.v, chi3v.xmm[0], y4v.v ); + + // Store the output. + _mm_storeu_pd( (double *)(y0 + 0*n_elem_per_reg), y4v.v ); + + y0 += 2; + a0 += 2; + a1 += 2; + a2 += 2; + a3 += 2; + } +#endif + // If there are leftover iterations, perform them with scalar code. 
+ for ( ; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + + *y0 = y0c; + + a0 += 1; + a1 += 1; + a2 += 1; + a3 += 1; + + y0 += 1; + } + } + else + { + for ( i = 0; (i + 0) < m ; ++i ) + { + double y0c = *y0; + + const double a0c = *a0; + const double a1c = *a1; + const double a2c = *a2; + const double a3c = *a3; + + y0c += chi0 * a0c; + y0c += chi1 * a1c; + y0c += chi2 * a2c; + y0c += chi3 * a3c; + + *y0 = y0c; + + a0 += inca; + a1 += inca; + a2 += inca; + a3 += inca; + + y0 += incy; + } + + } +} + + diff --git a/kernels/zen2/1f/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/bli_axpyf_zen_int_5.c deleted file mode 100644 index f8b04d52d6..0000000000 --- a/kernels/zen2/1f/bli_axpyf_zen_int_5.c +++ /dev/null @@ -1,599 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2020, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "immintrin.h" -#include "blis.h" - -/* Union data structure to access AVX registers - One 256-bit AVX register holds 8 SP elements. */ -typedef union -{ - __m256 v; - float f[8] __attribute__((aligned(64))); -} v8sf_t; - -/* Union data structure to access AVX registers -* One 256-bit AVX register holds 4 DP elements. 
*/ -typedef union -{ - __m256d v; - double d[4] __attribute__((aligned(64))); -} v4df_t; - - -void bli_saxpyf_zen_int_5 - ( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - float* restrict alpha, - float* restrict a, inc_t inca, inc_t lda, - float* restrict x, inc_t incx, - float* restrict y, inc_t incy, - cntx_t* restrict cntx - ) -{ - const dim_t fuse_fac = 5; - - const dim_t n_elem_per_reg = 8; - const dim_t n_iter_unroll = 2; - - dim_t i; - - float* restrict a0; - float* restrict a1; - float* restrict a2; - float* restrict a3; - float* restrict a4; - - float* restrict y0; - - v8sf_t chi0v, chi1v, chi2v, chi3v; - v8sf_t chi4v; - - v8sf_t a00v, a01v, a02v, a03v; - v8sf_t a04v; - - v8sf_t a10v, a11v, a12v, a13v; - v8sf_t a14v; - - v8sf_t y0v, y1v; - - float chi0, chi1, chi2, chi3; - float chi4; - - // If either dimension is zero, or if alpha is zero, return early. - if ( bli_zero_dim2( m, b_n ) || bli_seq0( *alpha ) ) return; - - // If b_n is not equal to the fusing factor, then perform the entire - // operation as a loop over axpyv. - if ( b_n != fuse_fac ) - { -#ifdef BLIS_CONFIG_ZEN2 - for ( i = 0; i < b_n; ++i ) - { - float* a1 = a + (0 )*inca + (i )*lda; - float* chi1 = x + (i )*incx; - float* y1 = y + (0 )*incy; - float alpha_chi1; - - bli_scopycjs( conjx, *chi1, alpha_chi1 ); - bli_sscals( *alpha, alpha_chi1 ); - - bli_saxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else - saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); - - for ( i = 0; i < b_n; ++i ) - { - float* a1 = a + (0 )*inca + (i )*lda; - float* chi1 = x + (i )*incx; - float* y1 = y + (0 )*incy; - float alpha_chi1; - - bli_scopycjs( conjx, *chi1, alpha_chi1 ); - bli_sscals( *alpha, alpha_chi1 ); - - f - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#endif - return; - } - - // At this point, we know that b_n is exactly equal to the fusing factor. 
- - a0 = a + 0*lda; - a1 = a + 1*lda; - a2 = a + 2*lda; - a3 = a + 3*lda; - a4 = a + 4*lda; - y0 = y; - - chi0 = *( x + 0*incx ); - chi1 = *( x + 1*incx ); - chi2 = *( x + 2*incx ); - chi3 = *( x + 3*incx ); - chi4 = *( x + 4*incx ); - - - // Scale each chi scalar by alpha. - bli_sscals( *alpha, chi0 ); - bli_sscals( *alpha, chi1 ); - bli_sscals( *alpha, chi2 ); - bli_sscals( *alpha, chi3 ); - bli_sscals( *alpha, chi4 ); - - // Broadcast the (alpha*chi?) scalars to all elements of vector registers. - chi0v.v = _mm256_broadcast_ss( &chi0 ); - chi1v.v = _mm256_broadcast_ss( &chi1 ); - chi2v.v = _mm256_broadcast_ss( &chi2 ); - chi3v.v = _mm256_broadcast_ss( &chi3 ); - chi4v.v = _mm256_broadcast_ss( &chi4 ); - - // If there are vectorized iterations, perform them with vector - // instructions. - if ( inca == 1 && incy == 1 ) - { - for ( i = 0; (i + 15) < m; i += 16 ) - { - // Load the input values. - y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); - y1v.v = _mm256_loadu_ps( y0 + 1*n_elem_per_reg ); - - a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); - a10v.v = _mm256_loadu_ps( a0 + 1*n_elem_per_reg ); - - a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); - a11v.v = _mm256_loadu_ps( a1 + 1*n_elem_per_reg ); - - a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); - a12v.v = _mm256_loadu_ps( a2 + 1*n_elem_per_reg ); - - a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); - a13v.v = _mm256_loadu_ps( a3 + 1*n_elem_per_reg ); - - a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); - a14v.v = _mm256_loadu_ps( a4 + 1*n_elem_per_reg ); - - // perform : y += alpha * x; - y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a10v.v, chi0v.v, y1v.v ); - - y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a11v.v, chi1v.v, y1v.v ); - - y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a12v.v, chi2v.v, y1v.v ); - - y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a13v.v, 
chi3v.v, y1v.v ); - - y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); - y1v.v = _mm256_fmadd_ps( a14v.v, chi4v.v, y1v.v ); - - - // Store the output. - _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); - _mm256_storeu_ps( (y0 + 1*n_elem_per_reg), y1v.v ); - - y0 += n_iter_unroll * n_elem_per_reg; - a0 += n_iter_unroll * n_elem_per_reg; - a1 += n_iter_unroll * n_elem_per_reg; - a2 += n_iter_unroll * n_elem_per_reg; - a3 += n_iter_unroll * n_elem_per_reg; - a4 += n_iter_unroll * n_elem_per_reg; - } - - for( ; (i + 7) < m; i += 8 ) - { - // Load the input values. - y0v.v = _mm256_loadu_ps( y0 + 0*n_elem_per_reg ); - - a00v.v = _mm256_loadu_ps( a0 + 0*n_elem_per_reg ); - a01v.v = _mm256_loadu_ps( a1 + 0*n_elem_per_reg ); - a02v.v = _mm256_loadu_ps( a2 + 0*n_elem_per_reg ); - a03v.v = _mm256_loadu_ps( a3 + 0*n_elem_per_reg ); - a04v.v = _mm256_loadu_ps( a4 + 0*n_elem_per_reg ); - - - // perform : y += alpha * x; - y0v.v = _mm256_fmadd_ps( a00v.v, chi0v.v, y0v.v ); - y0v.v = _mm256_fmadd_ps( a01v.v, chi1v.v, y0v.v ); - y0v.v = _mm256_fmadd_ps( a02v.v, chi2v.v, y0v.v ); - y0v.v = _mm256_fmadd_ps( a03v.v, chi3v.v, y0v.v ); - y0v.v = _mm256_fmadd_ps( a04v.v, chi4v.v, y0v.v ); - - // Store the output. - _mm256_storeu_ps( (y0 + 0*n_elem_per_reg), y0v.v ); - - y0 += n_elem_per_reg; - a0 += n_elem_per_reg; - a1 += n_elem_per_reg; - a2 += n_elem_per_reg; - a3 += n_elem_per_reg; - a4 += n_elem_per_reg; - } - - // If there are leftover iterations, perform them with scalar code. 
- for ( ; (i + 0) < m ; ++i ) - { - double y0c = *y0; - - const float a0c = *a0; - const float a1c = *a1; - const float a2c = *a2; - const float a3c = *a3; - const float a4c = *a4; - - y0c += chi0 * a0c; - y0c += chi1 * a1c; - y0c += chi2 * a2c; - y0c += chi3 * a3c; - y0c += chi4 * a4c; - - *y0 = y0c; - - a0 += 1; - a1 += 1; - a2 += 1; - a3 += 1; - a4 += 1; - y0 += 1; - } - } - else - { - for ( i = 0; (i + 0) < m ; ++i ) - { - double y0c = *y0; - - const float a0c = *a0; - const float a1c = *a1; - const float a2c = *a2; - const float a3c = *a3; - const float a4c = *a4; - - y0c += chi0 * a0c; - y0c += chi1 * a1c; - y0c += chi2 * a2c; - y0c += chi3 * a3c; - y0c += chi4 * a4c; - - *y0 = y0c; - - a0 += inca; - a1 += inca; - a2 += inca; - a3 += inca; - a4 += inca; - y0 += incy; - } - - } -} - - -// ----------------------------------------------------------------------------- - -void bli_daxpyf_zen_int_5 - ( - conj_t conja, - conj_t conjx, - dim_t m, - dim_t b_n, - double* restrict alpha, - double* restrict a, inc_t inca, inc_t lda, - double* restrict x, inc_t incx, - double* restrict y, inc_t incy, - cntx_t* restrict cntx - ) -{ - const dim_t fuse_fac = 5; - - const dim_t n_elem_per_reg = 4; - const dim_t n_iter_unroll = 2; - - dim_t i; - - double* restrict a0; - double* restrict a1; - double* restrict a2; - double* restrict a3; - double* restrict a4; - - double* restrict y0; - - v4df_t chi0v, chi1v, chi2v, chi3v; - v4df_t chi4v; - - v4df_t a00v, a01v, a02v, a03v; - v4df_t a04v; - - v4df_t a10v, a11v, a12v, a13v; - v4df_t a14v; - - v4df_t y0v, y1v; - - double chi0, chi1, chi2, chi3; - double chi4; - - // If either dimension is zero, or if alpha is zero, return early. - if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; - - // If b_n is not equal to the fusing factor, then perform the entire - // operation as a loop over axpyv. 
- if ( b_n != fuse_fac ) - { -#ifdef BLIS_CONFIG_ZEN2 - for ( i = 0; i < b_n; ++i ) - { - double* a1 = a + (0 )*inca + (i )*lda; - double* chi1 = x + (i )*incx; - double* y1 = y + (0 )*incy; - double alpha_chi1; - - bli_dcopycjs( conjx, *chi1, alpha_chi1 ); - bli_dscals( *alpha, alpha_chi1 ); - - bli_daxpyv_zen_int10 - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#else - daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); - - for ( i = 0; i < b_n; ++i ) - { - double* a1 = a + (0 )*inca + (i )*lda; - double* chi1 = x + (i )*incx; - double* y1 = y + (0 )*incy; - double alpha_chi1; - - bli_dcopycjs( conjx, *chi1, alpha_chi1 ); - bli_dscals( *alpha, alpha_chi1 ); - - f - ( - conja, - m, - &alpha_chi1, - a1, inca, - y1, incy, - cntx - ); - } - -#endif - return; - } - - // At this point, we know that b_n is exactly equal to the fusing factor. - - a0 = a + 0*lda; - a1 = a + 1*lda; - a2 = a + 2*lda; - a3 = a + 3*lda; - a4 = a + 4*lda; - y0 = y; - - chi0 = *( x + 0*incx ); - chi1 = *( x + 1*incx ); - chi2 = *( x + 2*incx ); - chi3 = *( x + 3*incx ); - chi4 = *( x + 4*incx ); - - - // Scale each chi scalar by alpha. - bli_dscals( *alpha, chi0 ); - bli_dscals( *alpha, chi1 ); - bli_dscals( *alpha, chi2 ); - bli_dscals( *alpha, chi3 ); - bli_dscals( *alpha, chi4 ); - - // Broadcast the (alpha*chi?) scalars to all elements of vector registers. - chi0v.v = _mm256_broadcast_sd( &chi0 ); - chi1v.v = _mm256_broadcast_sd( &chi1 ); - chi2v.v = _mm256_broadcast_sd( &chi2 ); - chi3v.v = _mm256_broadcast_sd( &chi3 ); - chi4v.v = _mm256_broadcast_sd( &chi4 ); - - // If there are vectorized iterations, perform them with vector - // instructions. - if ( inca == 1 && incy == 1 ) - { - for ( i = 0; (i + 7) < m; i += 8 ) - { - // Load the input values. 
- y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - y1v.v = _mm256_loadu_pd( y0 + 1*n_elem_per_reg ); - - a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); - a10v.v = _mm256_loadu_pd( a0 + 1*n_elem_per_reg ); - - a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); - a11v.v = _mm256_loadu_pd( a1 + 1*n_elem_per_reg ); - - a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); - a12v.v = _mm256_loadu_pd( a2 + 1*n_elem_per_reg ); - - a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); - a13v.v = _mm256_loadu_pd( a3 + 1*n_elem_per_reg ); - - a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); - a14v.v = _mm256_loadu_pd( a4 + 1*n_elem_per_reg ); - - // perform : y += alpha * x; - y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a10v.v, chi0v.v, y1v.v ); - - y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a11v.v, chi1v.v, y1v.v ); - - y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a12v.v, chi2v.v, y1v.v ); - - y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a13v.v, chi3v.v, y1v.v ); - - y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); - y1v.v = _mm256_fmadd_pd( a14v.v, chi4v.v, y1v.v ); - - - // Store the output. - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); - _mm256_storeu_pd( (y0 + 1*n_elem_per_reg), y1v.v ); - - y0 += n_iter_unroll * n_elem_per_reg; - a0 += n_iter_unroll * n_elem_per_reg; - a1 += n_iter_unroll * n_elem_per_reg; - a2 += n_iter_unroll * n_elem_per_reg; - a3 += n_iter_unroll * n_elem_per_reg; - a4 += n_iter_unroll * n_elem_per_reg; - } - - for( ; (i + 3) < m; i += 4 ) - { - // Load the input values. 
- y0v.v = _mm256_loadu_pd( y0 + 0*n_elem_per_reg ); - - a00v.v = _mm256_loadu_pd( a0 + 0*n_elem_per_reg ); - a01v.v = _mm256_loadu_pd( a1 + 0*n_elem_per_reg ); - a02v.v = _mm256_loadu_pd( a2 + 0*n_elem_per_reg ); - a03v.v = _mm256_loadu_pd( a3 + 0*n_elem_per_reg ); - a04v.v = _mm256_loadu_pd( a4 + 0*n_elem_per_reg ); - - - // perform : y += alpha * x; - y0v.v = _mm256_fmadd_pd( a00v.v, chi0v.v, y0v.v ); - y0v.v = _mm256_fmadd_pd( a01v.v, chi1v.v, y0v.v ); - y0v.v = _mm256_fmadd_pd( a02v.v, chi2v.v, y0v.v ); - y0v.v = _mm256_fmadd_pd( a03v.v, chi3v.v, y0v.v ); - y0v.v = _mm256_fmadd_pd( a04v.v, chi4v.v, y0v.v ); - - // Store the output. - _mm256_storeu_pd( (y0 + 0*n_elem_per_reg), y0v.v ); - - y0 += n_elem_per_reg; - a0 += n_elem_per_reg; - a1 += n_elem_per_reg; - a2 += n_elem_per_reg; - a3 += n_elem_per_reg; - a4 += n_elem_per_reg; - } - - // If there are leftover iterations, perform them with scalar code. - for ( ; (i + 0) < m ; ++i ) - { - double y0c = *y0; - - const double a0c = *a0; - const double a1c = *a1; - const double a2c = *a2; - const double a3c = *a3; - const double a4c = *a4; - - y0c += chi0 * a0c; - y0c += chi1 * a1c; - y0c += chi2 * a2c; - y0c += chi3 * a3c; - y0c += chi4 * a4c; - - *y0 = y0c; - - a0 += 1; - a1 += 1; - a2 += 1; - a3 += 1; - a4 += 1; - y0 += 1; - } - } - else - { - for ( i = 0; (i + 0) < m ; ++i ) - { - double y0c = *y0; - - const double a0c = *a0; - const double a1c = *a1; - const double a2c = *a2; - const double a3c = *a3; - const double a4c = *a4; - - y0c += chi0 * a0c; - y0c += chi1 * a1c; - y0c += chi2 * a2c; - y0c += chi3 * a3c; - y0c += chi4 * a4c; - - *y0 = y0c; - - a0 += inca; - a1 += inca; - a2 += inca; - a3 += inca; - a4 += inca; - y0 += incy; - } - - } -} - From 95335190bdf6706bb849d0fbf823688fb18e1868 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Tue, 5 Apr 2022 17:33:29 -0500 Subject: [PATCH 18/32] Fixed some stale bli_cntx_get_l1v_ker_dt() calls. 
--- kernels/zen/1/bli_scalv_zen_int10.c | 2 +- kernels/zen/1f/bli_axpyf_zen_int_4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index 9b12f0257b..9f31b7200e 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -459,7 +459,7 @@ void bli_cscalv_zen_int10 { const num_t dt = BLIS_SCOMPLEX; - cscalv_ker_ft f = bli_cntx_get_l1v_ker_dt( dt, BLIS_SCALV_KER, cntx ); + cscalv_ker_ft f = bli_cntx_get_ukr_dt( dt, BLIS_SCALV_KER, cntx ); f ( diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c index 5ddb56ac57..0ec5f44f53 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_4.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c @@ -36,7 +36,7 @@ #include "blis.h" - void bli_caxpyf_zen_int_4 +void bli_caxpyf_zen_int_4 ( conj_t conja, conj_t conjx, @@ -81,7 +81,7 @@ { if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - caxpyv_ker_ft f = bli_cntx_get_l1v_ker_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); + caxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) { From 2998bce0b2e186b6ab0016f9833108d4a4d3aa83 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Wed, 6 Apr 2022 18:16:35 -0500 Subject: [PATCH 19/32] Renamed BLIS_IS_KERNEL; whitespace changes. 
--- common.mk | 4 +- frame/1m/other/bli_packm_cxk.c | 3 +- frame/1m/other/bli_packm_cxk_1er.c | 3 +- frame/1m/other/bli_unpackm_cxk.c | 3 +- frame/1m/packm/bli_packm_struc_cxk.c | 404 ++++++++++---------- frame/1m/unpackm/bli_unpackm_blk_var1.c | 48 +-- frame/3/bli_l3_sup_packm_var.c | 3 +- frame/include/bli_kernel_macro_defs.h | 10 +- frame/include/level0/bli_set0s_edge.h | 48 ++- ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c | 372 +++++++++--------- ref_kernels/1m/bli_packm_cxc_diag_ref.c | 152 ++++---- ref_kernels/1m/bli_packm_cxk_1er_ref.c | 68 ++-- ref_kernels/1m/bli_packm_cxk_ref.c | 8 +- ref_kernels/3/bli_gemm_ref.c | 2 +- ref_kernels/3/bli_gemmtrsm_ref.c | 10 +- ref_kernels/3/bli_trsm_ref.c | 8 +- ref_kernels/ind/bli_gemmtrsm1m_ref.c | 2 +- ref_kernels/ind/bli_trsm1m_ref.c | 36 +- sandbox/gemmlike/bls_packm_cxk.c | 3 +- 19 files changed, 609 insertions(+), 578 deletions(-) diff --git a/common.mk b/common.mk index 5d681132f4..a93f8ab246 100644 --- a/common.mk +++ b/common.mk @@ -120,7 +120,7 @@ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ - -DBLIS_IN_KERNEL=1 \ + -DBLIS_IN_REF_KERNEL=1 \ -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) @@ -131,7 +131,7 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ -DBLIS_CNAME=$(1) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ - -DBLIS_IN_KERNEL=1 \ + -DBLIS_IN_REF_KERNEL=1 \ -include $(CONFIG_PATH)/$(1)/bli_kernel_defs_$(1).h \ ) diff --git a/frame/1m/other/bli_packm_cxk.c b/frame/1m/other/bli_packm_cxk.c index 53ae58e215..612b37f78c 100644 --- a/frame/1m/other/bli_packm_cxk.c +++ b/frame/1m/other/bli_packm_cxk.c @@ -55,7 +55,8 @@ void PASTEMAC(ch,opname) \ kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = bli_is_col_packed( schema ) ? 
BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ diff --git a/frame/1m/other/bli_packm_cxk_1er.c b/frame/1m/other/bli_packm_cxk_1er.c index 0c63609e52..22598dbac6 100644 --- a/frame/1m/other/bli_packm_cxk_1er.c +++ b/frame/1m/other/bli_packm_cxk_1er.c @@ -55,7 +55,8 @@ void PASTEMAC(ch,opname) \ kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER : BLIS_PACKM_MRXK_1ER_KER; \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \ + : BLIS_PACKM_MRXK_1ER_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ diff --git a/frame/1m/other/bli_unpackm_cxk.c b/frame/1m/other/bli_unpackm_cxk.c index 2410c8629e..4b7977e863 100644 --- a/frame/1m/other/bli_unpackm_cxk.c +++ b/frame/1m/other/bli_unpackm_cxk.c @@ -50,7 +50,8 @@ void PASTEMAC(ch,opname) \ ) \ { \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER : BLIS_UNPACKM_MRXK_KER; \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \ + : BLIS_UNPACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ diff --git a/frame/1m/packm/bli_packm_struc_cxk.c b/frame/1m/packm/bli_packm_struc_cxk.c index 0cf4ac9304..dbdaf4738d 100644 --- a/frame/1m/packm/bli_packm_struc_cxk.c +++ b/frame/1m/packm/bli_packm_struc_cxk.c @@ -60,41 +60,45 @@ void PASTEMAC(ch,varname) \ { \ num_t dt = PASTEMAC(ch,type); \ num_t dt_r = PASTEMAC(chr,type); \ - dim_t panel_len_pad = panel_len_max - panel_len; \ + dim_t panel_len_pad = panel_len_max - panel_len; \ \ bszid_t bsz_id = bli_is_col_packed( schema ) ? 
BLIS_NR : BLIS_MR; \ dim_t packmrnr = bli_cntx_get_blksz_max_dt( dt, bsz_id, cntx ); \ dim_t packmrnr_r = bli_cntx_get_blksz_max_dt( dt_r, bsz_id, cntx ); \ \ - ukr_t cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ - ukr_t cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER : BLIS_PACKM_MRXMR_DIAG_KER; \ + ukr_t cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ + ukr_t cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_KER \ + : BLIS_PACKM_MRXMR_DIAG_KER; \ \ - if ( bli_is_1m_packed( schema ) ) \ - { \ - cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER : BLIS_PACKM_MRXK_1ER_KER; \ - cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_1ER_KER : BLIS_PACKM_MRXMR_DIAG_1ER_KER; \ - } \ + if ( bli_is_1m_packed( schema ) ) \ + { \ + cxk_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_1ER_KER \ + : BLIS_PACKM_MRXK_1ER_KER; \ + cxc_ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXNR_DIAG_1ER_KER \ + : BLIS_PACKM_MRXMR_DIAG_1ER_KER; \ + } \ \ PASTECH2(ch,cxk_kername,_ker_ft) f_cxk = bli_cntx_get_ukr_dt( dt, cxk_ker_id, cntx ); \ PASTECH2(ch,cxc_kername,_ker_ft) f_cxc = bli_cntx_get_ukr_dt( dt, cxc_ker_id, cntx ); \ \ - /* For general matrices, pack and return early */ \ - if ( bli_is_general( strucc ) ) \ - { \ - f_cxk \ - ( \ - conjc, \ - schema, \ - panel_dim, \ - panel_len, \ - panel_len_max, \ - kappa, \ - c, incc, ldc, \ - p, ldp, \ - cntx \ - ); \ - return; \ - } \ + /* For general matrices, pack and return early */ \ + if ( bli_is_general( strucc ) ) \ + { \ + f_cxk \ + ( \ + conjc, \ + schema, \ + panel_dim, \ + panel_len, \ + panel_len_max, \ + kappa, \ + c, incc, ldc, \ + p, ldp, \ + cntx \ + ); \ + return; \ + } \ \ /* Sanity check. Diagonals should not intersect the short end of a micro-panel. 
If they do, then somehow the constraints on @@ -102,192 +106,200 @@ void PASTEMAC(ch,varname) \ blocksizes was somehow violated. */ \ doff_t diagoffc = panel_dim_off - panel_len_off; \ if ( ( -panel_dim < diagoffc && diagoffc < 0 ) || \ - ( panel_len-panel_dim < diagoffc && diagoffc < panel_len ) ) \ + ( panel_len-panel_dim < diagoffc && diagoffc < panel_len ) ) \ bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED ); \ \ - /* For triangular, symmetric, and hermitian matrices we need to consider three parts. */ \ + /* For triangular, symmetric, and hermitian matrices we need to consider + three parts. */ \ \ /* Pack to p10. */ \ - if ( 0 < diagoffc ) \ - { \ - dim_t p10_dim = panel_dim; \ - dim_t p10_len = bli_min( diagoffc, panel_len ); \ - dim_t p10_len_max = p10_len == panel_len ? panel_len_max : p10_len; \ - ctype* p10 = p; \ - conj_t conjc10 = conjc; \ - ctype* c10 = c; \ - inc_t incc10 = incc; \ - inc_t ldc10 = ldc; \ + if ( 0 < diagoffc ) \ + { \ + dim_t p10_dim = panel_dim; \ + dim_t p10_len = bli_min( diagoffc, panel_len ); \ + dim_t p10_len_max = p10_len == panel_len ? 
panel_len_max : p10_len; \ + ctype* p10 = p; \ + conj_t conjc10 = conjc; \ + ctype* c10 = c; \ + inc_t incc10 = incc; \ + inc_t ldc10 = ldc; \ \ - if ( bli_is_upper( uploc ) ) \ - { \ - bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \ + if ( bli_is_upper( uploc ) ) \ + { \ + bli_reflect_to_stored_part( diagoffc, c10, incc10, ldc10 ); \ \ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc10 ); \ - } \ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( &conjc10 ); \ + } \ \ - /* If we are referencing the unstored part of a triangular matrix, explicitly store zeros */ \ - if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \ - { \ - if ( bli_is_1m_packed( schema ) ) \ - { \ - ctype_r* restrict zero = PASTEMAC(chr,0); \ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - packmrnr_r, \ - p10_len_max * 2, \ - zero, \ - ( ctype_r* )p10, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ - else \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - packmrnr, \ - p10_len_max, \ - zero, \ - p10, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ - } \ - else \ - { \ - f_cxk \ - ( \ - conjc10, \ - schema, \ - p10_dim, \ - p10_len, \ - p10_len_max, \ - kappa, \ - c10, incc10, ldc10, \ - p10, ldp, \ - cntx \ - ); \ - } \ - } \ + /* If we are referencing the unstored part of a triangular matrix, + explicitly store zeros */ \ + if ( bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) \ + { \ + if ( bli_is_1m_packed( schema ) ) \ + { \ + ctype_r* restrict zero = PASTEMAC(chr,0); \ +\ + PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr_r, \ + p10_len_max * 2, \ + zero, \ + ( ctype_r* )p10, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ +\ + 
PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr, \ + p10_len_max, \ + zero, \ + p10, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else \ + { \ + f_cxk \ + ( \ + conjc10, \ + schema, \ + p10_dim, \ + p10_len, \ + p10_len_max, \ + kappa, \ + c10, incc10, ldc10, \ + p10, ldp, \ + cntx \ + ); \ + } \ + } \ \ /* Pack to p11. */ \ - if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \ - { \ - dim_t i = diagoffc; \ - dim_t p11_dim = panel_dim; \ - dim_t p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len ? panel_len_pad : 0 ); \ - ctype* p11 = p + i * ldp; \ - conj_t conjc11 = conjc; \ - ctype* c11 = c + i * ldc; \ - inc_t incc11 = incc; \ - inc_t ldc11 = ldc; \ + if ( 0 <= diagoffc && diagoffc + panel_dim <= panel_len ) \ + { \ + dim_t i = diagoffc; \ + dim_t p11_dim = panel_dim; \ + dim_t p11_len_max = panel_dim + ( diagoffc + panel_dim == panel_len \ + ? panel_len_pad : 0 ); \ + ctype* p11 = p + i * ldp; \ + conj_t conjc11 = conjc; \ + ctype* c11 = c + i * ldc; \ + inc_t incc11 = incc; \ + inc_t ldc11 = ldc; \ \ - f_cxc \ - ( \ - strucc, \ - diagc, \ - uploc, \ - conjc11, \ - schema, \ - invdiag, \ - p11_dim, \ - p11_len_max, \ - kappa, \ - c11, incc11, ldc11, \ - p11, ldp, \ - cntx \ - ); \ - } \ + f_cxc \ + ( \ + strucc, \ + diagc, \ + uploc, \ + conjc11, \ + schema, \ + invdiag, \ + p11_dim, \ + p11_len_max, \ + kappa, \ + c11, incc11, ldc11, \ + p11, ldp, \ + cntx \ + ); \ + } \ \ /* Pack to p12. */ \ - if ( diagoffc + panel_dim < panel_len ) \ - { \ - dim_t i = bli_max( 0, diagoffc + panel_dim ); \ - dim_t p12_dim = panel_dim; \ - dim_t p12_len = panel_len - i; \ - /* If we are packing p12, then it is always the last partial block \ - and so we should make sure to pad with zeros if necessary. 
*/ \ - dim_t p12_len_max = p12_len + panel_len_pad; \ - ctype* p12 = p + i * ldp; \ - conj_t conjc12 = conjc; \ - ctype* c12 = c + i * ldc; \ - inc_t incc12 = incc; \ - inc_t ldc12 = ldc; \ + if ( diagoffc + panel_dim < panel_len ) \ + { \ + dim_t i = bli_max( 0, diagoffc + panel_dim ); \ + dim_t p12_dim = panel_dim; \ + dim_t p12_len = panel_len - i; \ + /* If we are packing p12, then it is always the last partial block \ + and so we should make sure to pad with zeros if necessary. */ \ + dim_t p12_len_max = p12_len + panel_len_pad; \ + ctype* p12 = p + i * ldp; \ + conj_t conjc12 = conjc; \ + ctype* c12 = c + i * ldc; \ + inc_t incc12 = incc; \ + inc_t ldc12 = ldc; \ +\ + if ( bli_is_lower( uploc ) ) \ + { \ + bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \ +\ + if ( bli_is_hermitian( strucc ) ) \ + bli_toggle_conj( &conjc12 ); \ + } \ \ - if ( bli_is_lower( uploc ) ) \ - { \ - bli_reflect_to_stored_part( diagoffc - i, c12, incc12, ldc12 ); \ + /* If we are referencing the unstored part of a triangular matrix, + explicitly store zeros */ \ + if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \ + { \ + if ( bli_is_1m_packed( schema ) ) \ + { \ + ctype_r* restrict zero = PASTEMAC(chr,0); \ \ - if ( bli_is_hermitian( strucc ) ) \ - bli_toggle_conj( &conjc12 ); \ - } \ + PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr_r, \ + p12_len_max * 2, \ + zero, \ + ( ctype_r* )p12, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + else \ + { \ + ctype* restrict zero = PASTEMAC(ch,0); \ \ - /* If we are referencing the unstored part of a triangular matrix, explicitly store zeros */ \ - if ( bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) \ - { \ - if ( bli_is_1m_packed( schema ) ) \ - { \ - ctype_r* restrict zero = PASTEMAC(chr,0); \ - PASTEMAC2(chr,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - packmrnr_r, \ - 
p12_len_max * 2, \ - zero, \ - ( ctype_r* )p12, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ - else \ - { \ - ctype* restrict zero = PASTEMAC(ch,0); \ - PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - packmrnr, \ - p12_len_max, \ - zero, \ - p12, 1, ldp, \ - cntx, \ - NULL \ - ); \ - } \ - } \ - else \ - { \ - f_cxk \ - ( \ - conjc12, \ - schema, \ - p12_dim, \ - p12_len, \ - p12_len_max, \ - kappa, \ - c12, incc12, ldc12, \ - p12, ldp, \ - cntx \ - ); \ - } \ - } \ + PASTEMAC2(ch,setm,BLIS_TAPI_EX_SUF) \ + ( \ + BLIS_NO_CONJUGATE, \ + 0, \ + BLIS_NONUNIT_DIAG, \ + BLIS_DENSE, \ + packmrnr, \ + p12_len_max, \ + zero, \ + p12, 1, ldp, \ + cntx, \ + NULL \ + ); \ + } \ + } \ + else \ + { \ + f_cxk \ + ( \ + conjc12, \ + schema, \ + p12_dim, \ + p12_len, \ + p12_len_max, \ + kappa, \ + c12, incc12, ldc12, \ + p12, ldp, \ + cntx \ + ); \ + } \ + } \ } INSERT_GENTFUNCR_BASIC2( packm_struc_cxk, packm_cxk, packm_cxc_diag ) diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index cbd9045d9d..b6165f5163 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -36,21 +36,22 @@ #define FUNCPTR_T unpackm_fp -typedef void (*FUNCPTR_T)( - struc_t strucc, - doff_t diagoffc, - diag_t diagc, - uplo_t uploc, - trans_t transc, - dim_t m, - dim_t n, - dim_t m_panel, - dim_t n_panel, - void* p, inc_t rs_p, inc_t cs_p, - dim_t pd_p, inc_t ps_p, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx - ); +typedef void (*FUNCPTR_T) + ( + struc_t strucc, + doff_t diagoffc, + diag_t diagc, + uplo_t uploc, + trans_t transc, + dim_t m, + dim_t n, + dim_t m_panel, + dim_t n_panel, + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p, + void* c, inc_t rs_c, inc_t cs_c, + cntx_t* cntx + ); static FUNCPTR_T GENARRAY(ftypes,unpackm_blk_var1); @@ -152,10 +153,10 @@ void PASTEMAC(ch,varname) \ dim_t iter_dim; \ dim_t num_iter; \ dim_t it, ic, ip; 
\ - dim_t ic0, ip0; \ + dim_t ic0, ip0; \ doff_t ic_inc, ip_inc; \ - doff_t diagoffc_i; \ - doff_t diagoffc_inc; \ + doff_t diagoffc_i; \ + doff_t diagoffc_inc; \ dim_t panel_len; \ dim_t panel_dim_i; \ dim_t panel_dim_max; \ @@ -164,7 +165,7 @@ void PASTEMAC(ch,varname) \ inc_t ldp; \ dim_t* m_panel_full; \ dim_t* n_panel_full; \ - pack_t schema; \ + pack_t schema; \ \ \ /* If c needs a transposition, induce it so that we can more simply @@ -183,7 +184,7 @@ void PASTEMAC(ch,varname) \ if ( bli_is_row_stored_f( m_panel, n_panel, rs_p, cs_p ) ) \ { \ /* Prepare to unpack from column panels. */ \ - schema = BLIS_PACKED_COL_PANELS; \ + schema = BLIS_PACKED_COL_PANELS; \ iter_dim = n; \ panel_len = m; \ panel_dim_max = pd_p; \ @@ -198,7 +199,7 @@ void PASTEMAC(ch,varname) \ else /* if ( bli_is_col_stored_f( m_panel, n_panel, rs_p, cs_p ) ) */ \ { \ /* Prepare to unpack from row panels. */ \ - schema = BLIS_PACKED_ROW_PANELS; \ + schema = BLIS_PACKED_ROW_PANELS; \ iter_dim = m; \ panel_len = n; \ panel_dim_max = pd_p; \ @@ -212,7 +213,8 @@ void PASTEMAC(ch,varname) \ } \ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER : BLIS_UNPACKM_MRXK_KER; \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_UNPACKM_NRXK_KER \ + : BLIS_UNPACKM_MRXK_KER; \ \ /* Query the context for the unpackm kernel corresponding to the current panel dimension, or kernel id. */ \ @@ -266,7 +268,7 @@ void PASTEMAC(ch,varname) \ f \ ( \ BLIS_NO_CONJUGATE, \ - schema, \ + schema, \ panel_dim_i, \ panel_len, \ one, \ diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index f54e5f1256..519dc5ccd5 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -124,7 +124,8 @@ void PASTEMAC(ch,varname) \ } \ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ + ukr_t ker_id = bli_is_col_packed( schema ) ? 
BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ \ /* Query the context for the unpackm kernel corresponding to the current panel dimension, or kernel id. */ \ diff --git a/frame/include/bli_kernel_macro_defs.h b/frame/include/bli_kernel_macro_defs.h index 00ce7594e1..d273c353ab 100644 --- a/frame/include/bli_kernel_macro_defs.h +++ b/frame/include/bli_kernel_macro_defs.h @@ -247,11 +247,11 @@ // -- MR and NR blocksizes (only for reference kernels) ------------------------ -// The build system defines BLIS_IN_KERNEL, but only when compiling reference -// kernels. By using compile-time constants for MR and NR, the compiler can -// perform certain optimizations, such as unrolling and vectorization, that -// would not be otherwise be possible. -#ifdef BLIS_IN_KERNEL +// The build system defines BLIS_IN_REF_KERNEL, but only when compiling +// reference kernels. By using compile-time constants for MR and NR, the +// compiler can perform certain optimizations, such as unrolling and +// vectorization, that would not be otherwise be possible. 
+#ifdef BLIS_IN_REF_KERNEL #ifndef BLIS_MR_s #define BLIS_MR_s 4 diff --git a/frame/include/level0/bli_set0s_edge.h b/frame/include/level0/bli_set0s_edge.h index 5ce23c36dd..2c436812e2 100644 --- a/frame/include/level0/bli_set0s_edge.h +++ b/frame/include/level0/bli_set0s_edge.h @@ -43,29 +43,35 @@ #define GENTFUNC(ctype,ch,op) \ \ -BLIS_INLINE void PASTEMAC(ch,op)( const dim_t i, const dim_t m, \ - const dim_t j, const dim_t n, \ - ctype* restrict p, const inc_t ldp ) \ +BLIS_INLINE void PASTEMAC(ch,op) \ + ( \ + const dim_t i, \ + const dim_t m, \ + const dim_t j, \ + const dim_t n, \ + ctype* restrict p, \ + const inc_t ldp \ + ) \ { \ - if ( i < m ) \ - { \ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m - i, \ - j, \ - p + i*1, 1, ldp \ - ); \ - } \ + if ( i < m ) \ + { \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m - i, \ + j, \ + p + i*1, 1, ldp \ + ); \ + } \ \ - if ( j < n ) \ - { \ - PASTEMAC(ch,set0s_mxn) \ - ( \ - m, \ - n - j, \ - p + j*ldp, 1, ldp \ - ); \ - } \ + if ( j < n ) \ + { \ + PASTEMAC(ch,set0s_mxn) \ + ( \ + m, \ + n - j, \ + p + j*ldp, 1, ldp \ + ); \ + } \ } INSERT_GENTFUNC_BASIC0(set0s_edge) diff --git a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c index 17ed9bef65..5cee5535b1 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_1er_ref.c @@ -36,75 +36,75 @@ #define PACKM_SET1_1E( chr, mnk ) \ do { \ - PASTEMAC(chr,set1s)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ - PASTEMAC(chr,set0s)( *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ - PASTEMAC(chr,set0s)( *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ - PASTEMAC(chr,set1s)( *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set1s)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set1s)( *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ } 
while (0) #define PACKM_SET1_1R( chr, mnk ) \ do { \ - PASTEMAC(chr,set1s)( *(pi1_r + mnk*dfac + d + mnk*ldp2) ); \ - PASTEMAC(chr,set0s)( *(pi1_i + mnk*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set1s)( *(pi1_r + mnk*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,set0s)( *(pi1_i + mnk*dfac + d + mnk*ldp2) ); \ } while (0) #define PACKM_SCAL_1E( ch, mn, k, op ) \ do { \ PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn *inca2 + 0 + k*lda2), \ - *(alpha1 + mn *inca2 + 1 + k*lda2), \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ *(pi1_ri + (mn*2 + 0)*dfac + d + k*ldp2), \ - *(pi1_ri + (mn*2 + 1)*dfac + d + k*ldp2) ); \ + *(pi1_ri + (mn*2 + 1)*dfac + d + k*ldp2) ); \ PASTEMAC(ch,op)( -kappa_i, kappa_r, *(alpha1 + mn *inca2 + 0 + k*lda2), \ - *(alpha1 + mn *inca2 + 1 + k*lda2), \ + *(alpha1 + mn *inca2 + 1 + k*lda2), \ *(pi1_ir + (mn*2 + 0)*dfac + d + k*ldp2), \ - *(pi1_ir + (mn*2 + 1)*dfac + d + k*ldp2) ); \ + *(pi1_ir + (mn*2 + 1)*dfac + d + k*ldp2) ); \ } while (0) #define PACKM_SCAL_1R( ch, mn, k, op ) \ do { \ PASTEMAC(ch,op)( kappa_r, kappa_i, *(alpha1 + mn*inca2 + 0 + k*lda2), \ - *(alpha1 + mn*inca2 + 1 + k*lda2), \ + *(alpha1 + mn*inca2 + 1 + k*lda2), \ *(pi1_r + mn*dfac + d + k*ldp2), \ - *(pi1_i + mn*dfac + d + k*ldp2) ); \ + *(pi1_i + mn*dfac + d + k*ldp2) ); \ } while (0) #define PACKM_DIAG_1E_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \ \ do \ { \ - /* PACKM_SCAL_1E assumes inca2 and lda2 are the strides to use. */ \ - dim_t inca2 = inca2_lu; \ - dim_t lda2 = lda2_lu; \ + /* PACKM_SCAL_1E assumes inca2 and lda2 are the strides to use. 
*/ \ + dim_t inca2 = inca2_lu; \ + dim_t lda2 = lda2_lu; \ for ( dim_t k = 0; k < cdim; k++ ) \ for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ for ( dim_t d = 0; d < dfac; d++ ) \ - PACKM_SCAL_1E( ch, mn, k, op ); \ + PACKM_SCAL_1E( ch, mn, k, op ); \ } while(0) #define PACKM_DIAG_BODY_1E_L( ch, op ) \ - PACKM_DIAG_1E_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) + PACKM_DIAG_1E_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) #define PACKM_DIAG_BODY_1E_U( ch, op ) \ - PACKM_DIAG_1E_BODY( ch, 0, k, inca_u2, lda_u2, op ) + PACKM_DIAG_1E_BODY( ch, 0, k, inca_u2, lda_u2, op ) #define PACKM_DIAG_1R_BODY( ch, mn_min, mn_max, inca2_lu, lda2_lu, op ) \ \ do \ { \ - /* PACKM_SCAL_1R assumes inca2 and lda2 are the strides to use. */ \ - dim_t inca2 = inca2_lu; \ - dim_t lda2 = lda2_lu; \ + /* PACKM_SCAL_1R assumes inca2 and lda2 are the strides to use. */ \ + dim_t inca2 = inca2_lu; \ + dim_t lda2 = lda2_lu; \ for ( dim_t k = 0; k < cdim; k++ ) \ for ( dim_t mn = mn_min; mn < mn_max; mn++ ) \ for ( dim_t d = 0; d < dfac; d++ ) \ - PACKM_SCAL_1R( ch, mn, k, op ); \ + PACKM_SCAL_1R( ch, mn, k, op ); \ } while(0) #define PACKM_DIAG_BODY_1R_L( ch, op ) \ - PACKM_DIAG_1R_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) + PACKM_DIAG_1R_BODY( ch, k+1, cdim, inca_l2, lda_l2, op ) #define PACKM_DIAG_BODY_1R_U( ch, op ) \ - PACKM_DIAG_1R_BODY( ch, 0, k, inca_u2, lda_u2, op ) + PACKM_DIAG_1R_BODY( ch, 0, k, inca_u2, lda_u2, op ) #undef GENTFUNCCO #define GENTFUNCCO( ctype, ctype_r, ch, chr, opname, mnr0, bb0, arch, suf ) \ @@ -125,11 +125,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt_r, mnr0, cntx ); \ const dim_t dfac = bli_cntx_get_blksz_def_dt( dt_r, bb0, cntx ); \ \ - /* start by zeroing out the whole block */ \ + /* start by zeroing out the whole block */ \ PASTEMAC(chr,set0s_mxn) \ ( \ cdim_pack, \ @@ -145,190 +145,190 @@ void 
PASTEMAC3(ch,opname,arch,suf) \ ctype_r kappa_i = ( ( ctype_r* )kappa )[1]; \ ctype_r* restrict alpha1 = ( ctype_r* )a; \ \ - if ( bli_is_1e_packed( schema ) ) \ - { \ - const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ + if ( bli_is_1e_packed( schema ) ) \ + { \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ \ ctype_r* restrict pi1_ri = ( ctype_r* )p; \ ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ \ - /* write the strictly lower part if it exists */ \ - if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ - { \ - dim_t inca_l2 = inca2; \ - dim_t lda_l2 = lda2; \ - conj_t conja_l = conja; \ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ \ - if ( bli_is_upper( uploa ) ) \ - { \ - bli_swap_incs( &inca_l2, &lda_l2 ); \ - if ( bli_is_hermitian( struca ) ) \ - bli_toggle_conj( &conja_l ); \ - } \ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l2, &lda_l2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ \ - if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ch, scal2jris ); \ - else PACKM_DIAG_BODY_1E_L( ch, scal2ris ); \ - } \ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1E_L( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1E_L( ch, scal2ris ); \ + } \ \ - /* write the strictly upper part if it exists */ \ - /* assume either symmetric, hermitian, or triangular */ \ - if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ - { \ - dim_t inca_u2 = inca2; \ - dim_t lda_u2 = lda2; \ - conj_t conja_u = conja; \ + /* write the strictly upper part if it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u2 = inca2; \ + dim_t lda_u2 = lda2; \ + conj_t conja_u = conja; \ \ - if ( bli_is_lower( 
uploa ) ) \ - { \ - bli_swap_incs( &inca_u2, &lda_u2 ); \ - if ( bli_is_hermitian( struca ) ) \ - bli_toggle_conj( &conja_u ); \ - } \ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u2, &lda_u2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ \ - if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ch, scal2jris ); \ - else PACKM_DIAG_BODY_1E_U( ch, scal2ris ); \ - } \ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1E_U( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1E_U( ch, scal2ris ); \ + } \ \ - /* write the diagonal */ \ - if ( bli_is_unit_diag( diaga ) ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PACKM_SET1_1E( chr, mnk ); \ - } \ - else if ( bli_is_hermitian( struca ) ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - { \ - ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ - PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ - PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ - PASTEMAC(chr,scal2s)( -kappa_i, mu_r, *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ - PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ - } \ - } \ - else if ( bli_is_conj( conja )) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PACKM_SCAL_1E( ch, mnk, mnk, scal2jris ); \ - } \ - else \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PACKM_SCAL_1E( ch, mnk, mnk, scal2ris ); \ - } \ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1E( chr, mnk ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype_r mu_r = *(alpha1 + mnk*(inca2 + 
lda2)); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( -kappa_i, mu_r, *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + } \ + } \ + else if ( bli_is_conj( conja )) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1E( ch, mnk, mnk, scal2jris ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1E( ch, mnk, mnk, scal2ris ); \ + } \ \ - /* invert the diagonal if requested */ \ - if ( invdiag ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - { \ - PASTEMAC(ch,invertris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ - *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ - PASTEMAC(ch,copyjris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ - *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ - *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ - *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ - } \ - } \ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + PASTEMAC(ch,invertris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ + *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2) ); \ + PASTEMAC(ch,copyjris)( *(pi1_ri + (mnk*2 + 0)*dfac + d + mnk*ldp2), \ + *(pi1_ri + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ + *(pi1_ir + (mnk*2 + 1)*dfac + d + mnk*ldp2), \ + *(pi1_ir + (mnk*2 + 0)*dfac + d + mnk*ldp2) ); \ + } \ + } \ \ - /* if this an edge case in both directions, extend the diagonal with ones */ \ - for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PACKM_SET1_1E( chr, mnk ); \ - } \ - else /* bli_is_1r_packed( schema 
) */ \ - { \ - const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1E( chr, mnk ); \ + } \ + else /* bli_is_1r_packed( schema ) */ \ + { \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ - /* write the strictly lower part if it exists */ \ - if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ - { \ - dim_t inca_l2 = inca2; \ - dim_t lda_l2 = lda2; \ - conj_t conja_l = conja; \ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l2 = inca2; \ + dim_t lda_l2 = lda2; \ + conj_t conja_l = conja; \ \ - if ( bli_is_upper( uploa ) ) \ - { \ - bli_swap_incs( &inca_l2, &lda_l2 ); \ - if ( bli_is_hermitian( struca ) ) \ - bli_toggle_conj( &conja_l ); \ - } \ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l2, &lda_l2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ \ - if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ch, scal2jris ); \ - else PACKM_DIAG_BODY_1R_L( ch, scal2ris ); \ - } \ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_1R_L( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1R_L( ch, scal2ris ); \ + } \ \ - /* write the strictly upper part if it exists */ \ - /* assume either symmetric, hermitian, or triangular */ \ - if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ - { \ - dim_t inca_u2 = inca2; \ - dim_t lda_u2 = lda2; \ - conj_t conja_u = conja; \ + /* write the strictly upper part if it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u2 = inca2; \ + dim_t 
lda_u2 = lda2; \ + conj_t conja_u = conja; \ \ - if ( bli_is_lower( uploa ) ) \ - { \ - bli_swap_incs( &inca_u2, &lda_u2 ); \ - if ( bli_is_hermitian( struca ) ) \ - bli_toggle_conj( &conja_u ); \ - } \ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u2, &lda_u2 ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ \ - if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ch, scal2jris ); \ - else PACKM_DIAG_BODY_1R_U( ch, scal2ris ); \ - } \ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_1R_U( ch, scal2jris ); \ + else PACKM_DIAG_BODY_1R_U( ch, scal2ris ); \ + } \ \ - /* write the diagonal */ \ - if ( bli_is_unit_diag( diaga ) ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PACKM_SET1_1R( chr, mnk ); \ - } \ - else if ( bli_is_hermitian( struca ) ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - { \ - ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ - PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_r + mnk*(dfac + ldp2) + d) ); \ - PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_i + mnk*(dfac + ldp2) + d) ); \ - } \ - } \ - else if ( bli_is_conj( conja ) ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PACKM_SCAL_1R( ch, mnk, mnk, scal2jris ); \ - } \ - else \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PACKM_SCAL_1R( ch, mnk, mnk, scal2ris ); \ - } \ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1R( chr, mnk ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype_r mu_r = *(alpha1 + mnk*(inca2 + lda2)); \ + PASTEMAC(chr,scal2s)( kappa_r, mu_r, *(pi1_r + mnk*(dfac + ldp2) + d) ); \ + PASTEMAC(chr,scal2s)( kappa_i, mu_r, *(pi1_i 
+ mnk*(dfac + ldp2) + d) ); \ + } \ + } \ + else if ( bli_is_conj( conja ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1R( ch, mnk, mnk, scal2jris ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SCAL_1R( ch, mnk, mnk, scal2ris ); \ + } \ \ - /* invert the diagonal if requested */ \ - if ( invdiag ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,invertris)( *(pi1_r + mnk*(dfac + ldp2) + d), \ - *(pi1_i + mnk*(dfac + ldp2) + d) ); \ - } \ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,invertris)( *(pi1_r + mnk*(dfac + ldp2) + d), \ + *(pi1_i + mnk*(dfac + ldp2) + d) ); \ + } \ \ - /* if this an edge case in both directions, extend the diagonal with ones */ \ - for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PACKM_SET1_1R( chr, mnk ); \ - } \ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PACKM_SET1_1R( chr, mnk ); \ + } \ } INSERT_GENTFUNCCO_BASIC4( packm_mrxmr_diag_1er, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_packm_cxc_diag_ref.c b/ref_kernels/1m/bli_packm_cxc_diag_ref.c index bbfa1e3cc3..80ffcbc141 100644 --- a/ref_kernels/1m/bli_packm_cxc_diag_ref.c +++ b/ref_kernels/1m/bli_packm_cxc_diag_ref.c @@ -45,10 +45,10 @@ do \ } while(0) #define PACKM_DIAG_BODY_L( ctype, ch, op ) \ - PACKM_DIAG_BODY( ctype, ch, k+1, cdim, inca_l, lda_l, op ) + PACKM_DIAG_BODY( ctype, ch, k+1, cdim, inca_l, lda_l, op ) #define PACKM_DIAG_BODY_U( ctype, ch, op ) \ - PACKM_DIAG_BODY( ctype, ch, 0, k, inca_u, lda_u, op ) + PACKM_DIAG_BODY( 
ctype, ch, 0, k, inca_u, lda_u, op ) #undef GENTFUNC #define GENTFUNC( ctype, ch, opname, mnr0, bb0, arch, suf ) \ @@ -69,12 +69,12 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx_t* restrict cntx \ ) \ { \ - const num_t dt = PASTEMAC(ch,type); \ + const num_t dt = PASTEMAC(ch,type); \ const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \ const dim_t cdim_pack = bli_cntx_get_blksz_max_dt( dt, mnr0, cntx ); \ const dim_t dfac = bli_cntx_get_blksz_def_dt( dt, bb0, cntx ); \ \ - /* start by zeroing out the whole block */ \ + /* start by zeroing out the whole block */ \ PASTEMAC(ch,set0s_mxn) \ ( \ cdim_pack, \ @@ -86,86 +86,86 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype* restrict alpha1 = a; \ ctype* restrict pi1 = p; \ \ - /* write the strictly lower part if it exists */ \ - if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ - { \ - dim_t inca_l = inca; \ - dim_t lda_l = lda; \ - conj_t conja_l = conja; \ + /* write the strictly lower part if it exists */ \ + if ( bli_is_lower( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_l = inca; \ + dim_t lda_l = lda; \ + conj_t conja_l = conja; \ \ - if ( bli_is_upper( uploa ) ) \ - { \ - bli_swap_incs( &inca_l, &lda_l ); \ - if ( bli_is_hermitian( struca ) ) \ - bli_toggle_conj( &conja_l ); \ - } \ + if ( bli_is_upper( uploa ) ) \ + { \ + bli_swap_incs( &inca_l, &lda_l ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_l ); \ + } \ \ - if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctype, ch, scal2js ); \ - else PACKM_DIAG_BODY_L( ctype, ch, scal2s ); \ - } \ + if ( bli_is_conj( conja_l ) ) PACKM_DIAG_BODY_L( ctype, ch, scal2js ); \ + else PACKM_DIAG_BODY_L( ctype, ch, scal2s ); \ + } \ \ - /* write the strictly upper part if it exists */ \ - /* assume either symmetric, hermitian, or triangular */ \ - if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ - { \ - dim_t inca_u = inca; \ - dim_t lda_u = lda; \ - conj_t conja_u = conja; \ + /* write the 
strictly upper part if it exists */ \ + /* assume either symmetric, hermitian, or triangular */ \ + if ( bli_is_upper( uploa ) || bli_is_herm_or_symm( struca ) ) \ + { \ + dim_t inca_u = inca; \ + dim_t lda_u = lda; \ + conj_t conja_u = conja; \ \ - if ( bli_is_lower( uploa ) ) \ - { \ - bli_swap_incs( &inca_u, &lda_u ); \ - if ( bli_is_hermitian( struca ) ) \ - bli_toggle_conj( &conja_u ); \ - } \ + if ( bli_is_lower( uploa ) ) \ + { \ + bli_swap_incs( &inca_u, &lda_u ); \ + if ( bli_is_hermitian( struca ) ) \ + bli_toggle_conj( &conja_u ); \ + } \ \ - if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctype, ch, scal2js ); \ - else PACKM_DIAG_BODY_U( ctype, ch, scal2s ); \ - } \ + if ( bli_is_conj( conja_u ) ) PACKM_DIAG_BODY_U( ctype, ch, scal2js ); \ + else PACKM_DIAG_BODY_U( ctype, ch, scal2s ); \ + } \ \ - /* write the diagonal */ \ - if ( bli_is_unit_diag( diaga ) ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ - } \ - else if ( bli_is_hermitian( struca ) ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - { \ - ctype mu; \ - PASTEMAC(ch,copys)( *(alpha1 + mnk*(inca + lda)), mu ); \ - PASTEMAC(ch,seti0s)( mu ); \ - PASTEMAC(ch,scal2s)( kappa_cast, mu, *(pi1 + mnk*(dfac + ldp) + d) ); \ - } \ - } \ - else if ( bli_is_conj( conja )) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ - } \ - else \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ - } \ + /* write the diagonal */ \ + if ( bli_is_unit_diag( diaga ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,set1s)( *(pi1 + 
mnk*(dfac + ldp) + d) ); \ + } \ + else if ( bli_is_hermitian( struca ) ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + { \ + ctype mu; \ + PASTEMAC(ch,copys)( *(alpha1 + mnk*(inca + lda)), mu ); \ + PASTEMAC(ch,seti0s)( mu ); \ + PASTEMAC(ch,scal2s)( kappa_cast, mu, *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + } \ + else if ( bli_is_conj( conja )) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2js)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ + else \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,scal2s)( kappa_cast, *(alpha1 + mnk*(inca + lda)), *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ \ - /* invert the diagonal if requested */ \ - if ( invdiag ) \ - { \ - for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,inverts)( *(pi1 + mnk*(dfac + ldp) + d) ); \ - } \ + /* invert the diagonal if requested */ \ + if ( invdiag ) \ + { \ + for ( dim_t mnk = 0; mnk < cdim; ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,inverts)( *(pi1 + mnk*(dfac + ldp) + d) ); \ + } \ \ - /* if this an edge case in both directions, extend the diagonal with ones */ \ - for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ - for ( dim_t d = 0; d < dfac; ++d ) \ - PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ + /* if this an edge case in both directions, extend the diagonal with ones */ \ + for ( dim_t mnk = cdim; mnk < bli_min( cdim_max, n_max ); ++mnk ) \ + for ( dim_t d = 0; d < dfac; ++d ) \ + PASTEMAC(ch,set1s)( *(pi1 + mnk*(dfac + ldp) + d) ); \ } INSERT_GENTFUNC_BASIC4( packm_mrxmr_diag, BLIS_MR, BLIS_BBM, BLIS_CNAME_INFIX, BLIS_REF_SUFFIX ) diff --git a/ref_kernels/1m/bli_packm_cxk_1er_ref.c b/ref_kernels/1m/bli_packm_cxk_1er_ref.c index 06b83debaf..56d8379be6 100644 --- 
a/ref_kernels/1m/bli_packm_cxk_1er_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_1er_ref.c @@ -91,13 +91,13 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const dim_t dfac = PASTECH2(bb0, _, chr); \ - const num_t dt_r = PASTEMAC(chr,type); \ + const num_t dt_r = PASTEMAC(chr,type); \ \ if ( bli_is_1e_packed( schema ) ) \ { \ - /* cdim and mnr are in units of complex values */ \ - const dim_t mnr = PASTECH2(mnr0, _, chr) == -1 ? -1 : PASTECH2(mnr0, _, chr) / 2; \ - const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ + /* cdim and mnr are in units of complex values */ \ + const dim_t mnr = PASTECH2(mnr0, _, chr) == -1 ? -1 : PASTECH2(mnr0, _, chr) / 2; \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ) / 2; \ \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ @@ -109,8 +109,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict pi1_ri = ( ctype_r* )p; \ ctype_r* restrict pi1_ir = ( ctype_r* )p + ldp; \ \ - if ( cdim == mnr && mnr != -1 ) \ - { \ + if ( cdim == mnr && mnr != -1 ) \ + { \ if ( inca == 1 ) \ { \ if ( bli_is_conj( conja ) ) PACKM_1E_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ @@ -128,17 +128,17 @@ void PASTEMAC3(ch,opname,arch,suf) \ else PACKM_1E_BODY( ctype, ch, , cdim, inca2, scal2ris ); \ } \ \ - PASTEMAC(chr,set0s_edge) \ - ( \ - 2*cdim*dfac, 2*cdim_max*dfac, \ - 2*n, 2*n_max, \ - ( ctype_r* )p, ldp \ - ); \ - } \ + PASTEMAC(chr,set0s_edge) \ + ( \ + 2*cdim*dfac, 2*cdim_max*dfac, \ + 2*n, 2*n_max, \ + ( ctype_r* )p, ldp \ + ); \ + } \ else /* ( bli_is_1r_packed( schema ) ) */ \ { \ - const dim_t mnr = PASTECH2(mnr0, _, chr); \ - const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ + const dim_t mnr = PASTECH2(mnr0, _, chr); \ + const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt_r, mnr0, cntx ); \ \ const inc_t inca2 = 2 * inca; \ const inc_t lda2 = 2 * lda; \ @@ -150,31 +150,31 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict pi1_r = ( ctype_r* )p; \ 
ctype_r* restrict pi1_i = ( ctype_r* )p + ldp; \ \ - if ( cdim == mnr && mnr != -1 ) \ - { \ - if ( inca == 1 ) \ - { \ - if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ - else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \ - } \ - else \ - { \ - if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \ - else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \ - } \ - } \ + if ( cdim == mnr && mnr != -1 ) \ + { \ + if ( inca == 1 ) \ + { \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, 2, scal2ris ); \ + } \ + else \ + { \ + if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2jris ); \ + else PACKM_1R_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca2, scal2ris ); \ + } \ + } \ else \ { \ if ( bli_is_conj( conja ) ) PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2jris ); \ else PACKM_1R_BODY( ctype, ch, , cdim, inca2, scal2ris ); \ } \ \ - PASTEMAC(chr,set0s_edge) \ - ( \ - cdim*dfac, cdim_max*dfac, \ - 2*n, 2*n_max, \ - ( ctype_r* )p, ldp \ - ); \ + PASTEMAC(chr,set0s_edge) \ + ( \ + cdim*dfac, cdim_max*dfac, \ + 2*n, 2*n_max, \ + ( ctype_r* )p, ldp \ + ); \ } \ } diff --git a/ref_kernels/1m/bli_packm_cxk_ref.c b/ref_kernels/1m/bli_packm_cxk_ref.c index c385fca1ac..eefdb464b4 100644 --- a/ref_kernels/1m/bli_packm_cxk_ref.c +++ b/ref_kernels/1m/bli_packm_cxk_ref.c @@ -67,7 +67,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ) \ { \ const dim_t mnr = PASTECH2(mnr0, _, ch); \ - const num_t dt = PASTEMAC(ch,type); \ + const num_t dt = PASTEMAC(ch,type); \ const dim_t cdim_max = bli_cntx_get_blksz_def_dt( dt, mnr0, cntx ); \ const dim_t dfac = PASTECH2(bb0, _, ch); \ \ @@ -80,18 +80,18 @@ void PASTEMAC3(ch,opname,arch,suf) \ if ( inca == 1 ) \ { \ if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2js ); \ - else PACKM_BODY( ctype, 
ch, PRAGMA_SIMD, mnr, 1, scal2s ); \ + else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, 1, scal2s ); \ } \ else \ { \ if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2js ); \ - else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \ + else PACKM_BODY( ctype, ch, PRAGMA_SIMD, mnr, inca, scal2s ); \ } \ } \ else /* if ( cdim < mnr ) */ \ { \ if ( bli_is_conj( conja ) ) PACKM_BODY( ctype, ch, , cdim, inca, scal2js ); \ - else PACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \ + else PACKM_BODY( ctype, ch, , cdim, inca, scal2s ); \ } \ \ PASTEMAC(ch,set0s_edge) \ diff --git a/ref_kernels/3/bli_gemm_ref.c b/ref_kernels/3/bli_gemm_ref.c index 968ca39979..f284acb98e 100644 --- a/ref_kernels/3/bli_gemm_ref.c +++ b/ref_kernels/3/bli_gemm_ref.c @@ -180,7 +180,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ } \ \ ctype ab[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ + / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ const inc_t rs_ab = nr; \ const inc_t cs_ab = 1; \ diff --git a/ref_kernels/3/bli_gemmtrsm_ref.c b/ref_kernels/3/bli_gemmtrsm_ref.c index 7504307179..046aa5617b 100644 --- a/ref_kernels/3/bli_gemmtrsm_ref.c +++ b/ref_kernels/3/bli_gemmtrsm_ref.c @@ -86,8 +86,14 @@ PASTEMAC(d,fprintm)( stdout, "gemmtrsm_ukr: b11", mr, 2*nr, \ ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ / sizeof( ctype ) ] \ __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - /* FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR - instead? */ \ + /* to FGVZ: Should we be querying the preference of BLIS_GEMMTRSM_?_UKR + instead? + + to DAM: Given that this reference kernel is implemented in terms of gemm, + I think that is the preference we want to query. There might be other + circumstances where we would want the gemmtrsm_? operations to have + and exercise their own IO preferences -- I'd have to think about it -- + but this doesn't seem to be one of them. 
*/ \ const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ const inc_t rs_ct = ( col_pref ? 1 : nr ); \ const inc_t cs_ct = ( col_pref ? mr : 1 ); \ diff --git a/ref_kernels/3/bli_trsm_ref.c b/ref_kernels/3/bli_trsm_ref.c index 504849e4ef..8234a84cc1 100644 --- a/ref_kernels/3/bli_trsm_ref.c +++ b/ref_kernels/3/bli_trsm_ref.c @@ -111,8 +111,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. */ \ - for ( dim_t d = 0; d < cs_b; ++d ) \ - PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ } \ } \ } @@ -195,8 +195,8 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,copys)( beta11c, *gamma11 ); \ \ /* Store the local value back to b11. */ \ - for ( dim_t d = 0; d < cs_b; ++d ) \ - PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copys)( beta11c, *(beta11 + d) ); \ } \ } \ } diff --git a/ref_kernels/ind/bli_gemmtrsm1m_ref.c b/ref_kernels/ind/bli_gemmtrsm1m_ref.c index 58e08ec927..6cfb83caed 100644 --- a/ref_kernels/ind/bli_gemmtrsm1m_ref.c +++ b/ref_kernels/ind/bli_gemmtrsm1m_ref.c @@ -223,7 +223,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ ); \ \ PASTEMAC(ch,copyris)( -*beta11_ri_i, *beta11_ri_r, \ - *beta11_ir_r, *beta11_ir_i ); \ + *beta11_ir_r, *beta11_ir_i ); \ } \ } \ else /* if ( bli_is_1r_packed( schema_b ) ) */ \ diff --git a/ref_kernels/ind/bli_trsm1m_ref.c b/ref_kernels/ind/bli_trsm1m_ref.c index 175bc9e14a..5eda20f20d 100644 --- a/ref_kernels/ind/bli_trsm1m_ref.c +++ b/ref_kernels/ind/bli_trsm1m_ref.c @@ -151,10 +151,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. 
*/ \ - for ( dim_t d = 0; d < cs_b; ++d ) \ - { \ - PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ - PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + { \ + PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ + PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ } \ } \ } \ @@ -236,11 +236,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. */ \ - for ( dim_t d = 0; d < cs_b; ++d ) \ - PASTEMAC(ch,copyris)( beta11c_r, \ - beta11c_i, \ - *(beta11_r + d), \ - *(beta11_i + d) ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *(beta11_r + d), \ + *(beta11_i + d) ); \ } \ } \ } \ @@ -369,10 +369,10 @@ void PASTEMAC3(ch,opname,arch,suf) \ PASTEMAC(ch,sets)( beta11c_r, beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. */ \ - for ( dim_t d = 0; d < cs_b; ++d ) \ - { \ - PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ - PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + { \ + PASTEMAC(ch,copyris)( beta11c_r, beta11c_i, *(beta11_ri_r + d), *(beta11_ri_i + d) ); \ + PASTEMAC(ch,copyris)( -beta11c_i, beta11c_r, *(beta11_ir_r + d), *(beta11_ir_i + d) ); \ } \ } \ } \ @@ -454,11 +454,11 @@ void PASTEMAC3(ch,opname,arch,suf) \ beta11c_i, *gamma11 ); \ \ /* Store the local values back to b11. 
*/ \ - for ( dim_t d = 0; d < cs_b; ++d ) \ - PASTEMAC(ch,copyris)( beta11c_r, \ - beta11c_i, \ - *(beta11_r + d), \ - *(beta11_i + d) ); \ + for ( dim_t d = 0; d < cs_b; ++d ) \ + PASTEMAC(ch,copyris)( beta11c_r, \ + beta11c_i, \ + *(beta11_r + d), \ + *(beta11_i + d) ); \ } \ } \ } \ diff --git a/sandbox/gemmlike/bls_packm_cxk.c b/sandbox/gemmlike/bls_packm_cxk.c index ee3d57dfee..2ed178c657 100644 --- a/sandbox/gemmlike/bls_packm_cxk.c +++ b/sandbox/gemmlike/bls_packm_cxk.c @@ -55,7 +55,8 @@ void PASTECH2(bls_,ch,opname) \ kernel function pointer. This means that we always use the same kernel, even for edge cases. */ \ num_t dt = PASTEMAC(ch,type); \ - ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER : BLIS_PACKM_MRXK_KER; \ + ukr_t ker_id = bli_is_col_packed( schema ) ? BLIS_PACKM_NRXK_KER \ + : BLIS_PACKM_MRXK_KER; \ \ PASTECH2(ch,opname,_ker_ft) f; \ \ From 94252b0967d1f3174f544396537467b95529fc51 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Fri, 8 Apr 2022 15:39:59 -0500 Subject: [PATCH 20/32] Remove const from typed unpackm kernel cntx_t parameter. 
--- frame/1m/unpackm/bli_unpackm_blk_var1.c | 4 ++-- frame/1m/unpackm/bli_unpackm_blk_var1.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.c b/frame/1m/unpackm/bli_unpackm_blk_var1.c index 320d56b42b..1eeab9ddbe 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.c +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.c @@ -122,7 +122,7 @@ void bli_unpackm_blk_var1 buf_p, rs_p, cs_p, pd_p, ps_p, buf_c, rs_c, cs_c, - cntx + ( cntx_t* )cntx ); } @@ -144,7 +144,7 @@ void PASTEMAC(ch,varname) \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c, \ - const cntx_t* cntx \ + cntx_t* cntx \ ) \ { \ ctype* one = PASTEMAC(ch,1); \ diff --git a/frame/1m/unpackm/bli_unpackm_blk_var1.h b/frame/1m/unpackm/bli_unpackm_blk_var1.h index a9c9f5548f..4a92dc1b7f 100644 --- a/frame/1m/unpackm/bli_unpackm_blk_var1.h +++ b/frame/1m/unpackm/bli_unpackm_blk_var1.h @@ -59,7 +59,7 @@ void PASTEMAC(ch,varname) \ void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ void* c, inc_t rs_c, inc_t cs_c, \ - const cntx_t* cntx \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( unpackm_blk_var1 ) From 7d4a046e37b814eb2e73d19930278a3135fac018 Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Fri, 8 Apr 2022 16:05:21 -0500 Subject: [PATCH 21/32] Moved stale zen2 copy of bli_axpyf_zen_int_5.c. --- kernels/zen2/1f/{ => old}/bli_axpyf_zen_int_5.c | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernels/zen2/1f/{ => old}/bli_axpyf_zen_int_5.c (100%) diff --git a/kernels/zen2/1f/bli_axpyf_zen_int_5.c b/kernels/zen2/1f/old/bli_axpyf_zen_int_5.c similarity index 100% rename from kernels/zen2/1f/bli_axpyf_zen_int_5.c rename to kernels/zen2/1f/old/bli_axpyf_zen_int_5.c From bd71fced9df85760fa43ec5bceb8d7436782af0e Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 23 May 2022 14:09:18 -0500 Subject: [PATCH 22/32] Backup. 
--- build/flatten-headers.py | 10 +- config/a64fx/bli_cntx_init_a64fx.c | 10 +- config/armsve/bli_cntx_init_armsve.c | 10 +- config/bgq/bli_cntx_init_bgq.c | 10 +- config/bulldozer/bli_cntx_init_bulldozer.c | 10 +- config/cortexa15/bli_cntx_init_cortexa15.c | 10 +- config/cortexa53/bli_cntx_init_cortexa53.c | 10 +- config/cortexa57/bli_cntx_init_cortexa57.c | 10 +- config/cortexa9/bli_cntx_init_cortexa9.c | 10 +- config/excavator/bli_cntx_init_excavator.c | 10 +- config/firestorm/bli_cntx_init_firestorm.c | 26 +- config/haswell/bli_cntx_init_haswell.c | 30 +- config/knc/bli_cntx_init_knc.c | 10 +- config/knl/bli_cntx_init_knl.c | 14 +- config/penryn/bli_cntx_init_penryn.c | 10 +- config/piledriver/bli_cntx_init_piledriver.c | 10 +- config/power10/bli_cntx_init_power10.c | 10 +- config/power7/bli_cntx_init_power7.c | 10 +- config/power9/bli_cntx_init_power9.c | 10 +- .../sandybridge/bli_cntx_init_sandybridge.c | 10 +- config/skx/bli_cntx_init_skx.c | 14 +- .../steamroller/bli_cntx_init_steamroller.c | 10 +- config/template/bli_cntx_init_template.c | 10 +- config/thunderx2/bli_cntx_init_thunderx2.c | 10 +- config/zen/bli_cntx_init_zen.c | 32 +- config/zen2/bli_cntx_init_zen2.c | 32 +- config/zen3/bli_cntx_init_zen3.c | 32 +- frame/1m/bli_l1m_oft_var.h | 5 +- frame/1m/bli_l1m_unb_var1.c | 15 +- frame/1m/bli_l1m_unb_var1.h | 15 +- frame/1m/packm/bli_packm_alloc.c | 26 +- frame/1m/packm/bli_packm_alloc.h | 13 +- frame/1m/packm/bli_packm_blk_var1.c | 51 +- frame/1m/packm/bli_packm_blk_var1.h | 15 +- frame/1m/packm/bli_packm_cntl.c | 92 - frame/1m/packm/bli_packm_cntl.h | 81 +- frame/1m/packm/bli_packm_init.c | 23 +- frame/1m/packm/bli_packm_init.h | 5 +- frame/1m/packm/bli_packm_int.c | 6 +- frame/1m/packm/bli_packm_int.h | 5 +- frame/1m/packm/bli_packm_thrinfo.c | 75 - frame/1m/unpackm/bli_unpackm.h | 1 - frame/1m/unpackm/bli_unpackm_cntl.c | 77 - frame/1m/unpackm/bli_unpackm_int.c | 2 +- frame/3/bli_l3.h | 4 +- frame/3/bli_l3_blocksize.c | 311 +-- 
frame/3/bli_l3_blocksize.h | 66 +- frame/3/bli_l3_cntl.c | 124 -- frame/3/bli_l3_cntl.h | 60 - frame/3/bli_l3_int.c | 11 +- frame/3/bli_l3_int.h | 3 +- frame/3/bli_l3_oapi_ex.c | 24 +- frame/3/bli_l3_oapi_ex.h | 8 +- frame/3/bli_l3_oft.h | 8 +- frame/3/bli_l3_oft_var.h | 3 +- frame/3/bli_l3_packab.c | 10 +- frame/3/bli_l3_packab.h | 6 +- frame/3/bli_l3_sup.c | 8 +- frame/3/bli_l3_sup.h | 4 +- frame/3/bli_l3_sup_int.c | 16 +- frame/3/bli_l3_sup_oft.h | 2 +- frame/3/bli_l3_sup_packm.c | 433 +++++ .../bli_l3_sup_packm.h} | 100 +- frame/3/bli_l3_sup_packm_a.c | 430 ---- frame/3/bli_l3_sup_packm_a.h | 118 -- frame/3/bli_l3_sup_packm_b.c | 430 ---- frame/3/bli_l3_sup_packm_b.h | 118 -- frame/3/bli_l3_sup_packm_var.c | 12 +- frame/3/bli_l3_sup_packm_var.h | 12 +- frame/3/bli_l3_sup_var1n2m.c | 1730 +++++++---------- frame/3/bli_l3_sup_vars.h | 28 +- frame/3/bli_l3_tapi_ex.c | 18 +- frame/3/bli_l3_tapi_ex.h | 16 +- frame/3/bli_l3_thrinfo.c | 238 +-- frame/3/bli_l3_thrinfo.h | 60 +- frame/3/gemm/bli_gemm.h | 5 +- frame/3/gemm/bli_gemm_blk_var1.c | 8 +- frame/3/gemm/bli_gemm_blk_var2.c | 8 +- frame/3/gemm/bli_gemm_blk_var3.c | 8 +- frame/3/gemm/bli_gemm_cntl.c | 248 +-- frame/3/gemm/bli_gemm_cntl.h | 91 +- frame/3/gemm/bli_gemm_front.c | 89 +- frame/3/gemm/bli_gemm_front.h | 5 +- frame/3/gemm/bli_gemm_ker_var2.c | 13 +- frame/3/gemm/bli_gemm_md.c | 313 ++- frame/3/gemm/bli_gemm_md.h | 83 +- frame/3/gemm/bli_gemm_var.h | 13 +- frame/3/gemmt/bli_gemmt_front.c | 7 +- frame/3/gemmt/bli_gemmt_front.h | 3 +- frame/3/gemmt/bli_gemmt_l_ker_var2.c | 670 +++---- frame/3/gemmt/bli_gemmt_u_ker_var2.c | 670 +++---- frame/3/gemmt/bli_gemmt_var.h | 4 +- frame/3/gemmt/bli_gemmt_x_ker_var2.c | 4 +- frame/3/hemm/bli_hemm_front.c | 2 +- frame/3/hemm/bli_hemm_front.h | 4 +- frame/3/symm/bli_symm_front.c | 2 +- frame/3/symm/bli_symm_front.h | 4 +- frame/3/trmm/bli_trmm_front.c | 2 +- frame/3/trmm/bli_trmm_front.h | 4 +- frame/3/trmm/bli_trmm_ll_ker_var2.c | 509 ++--- 
frame/3/trmm/bli_trmm_lu_ker_var2.c | 523 ++--- frame/3/trmm/bli_trmm_rl_ker_var2.c | 607 +++--- frame/3/trmm/bli_trmm_ru_ker_var2.c | 631 +++--- frame/3/trmm/bli_trmm_var.h | 4 +- frame/3/trmm/bli_trmm_xx_ker_var2.c | 4 +- frame/3/trmm3/bli_trmm3_front.c | 2 +- frame/3/trmm3/bli_trmm3_front.h | 3 +- frame/3/trsm/bli_trsm_blk_var1.c | 9 +- frame/3/trsm/bli_trsm_blk_var2.c | 6 +- frame/3/trsm/bli_trsm_blk_var3.c | 6 +- frame/3/trsm/bli_trsm_cntl.c | 537 ++--- frame/3/trsm/bli_trsm_cntl.h | 68 +- frame/3/trsm/bli_trsm_front.c | 2 +- frame/3/trsm/bli_trsm_front.h | 3 +- frame/3/trsm/bli_trsm_ll_ker_var2.c | 558 ++---- frame/3/trsm/bli_trsm_lu_ker_var2.c | 581 +++--- frame/3/trsm/bli_trsm_rl_ker_var2.c | 566 ++---- frame/3/trsm/bli_trsm_ru_ker_var2.c | 561 ++---- frame/3/trsm/bli_trsm_var.h | 36 +- frame/3/trsm/bli_trsm_xx_ker_var2.c | 4 +- frame/base/bli_blksz.c | 37 +- frame/base/bli_blksz.h | 29 +- frame/base/bli_cntl.c | 281 +-- frame/base/bli_cntl.h | 94 +- frame/base/bli_cntx.c | 11 +- frame/base/bli_cntx.h | 132 +- frame/base/bli_gks.c | 308 +-- frame/base/bli_gks.h | 7 +- frame/base/bli_obj.c | 22 + frame/base/bli_obj.h | 6 + frame/base/bli_obj_scalar.c | 5 +- frame/base/bli_obj_scalar.h | 1 + frame/base/bli_pba.c | 13 +- frame/base/bli_pba.h | 16 +- .../bli_unpackm_cntl.h => base/bli_plugin.h} | 50 +- frame/base/bli_rntm.h | 77 +- frame/base/bli_sba.c | 137 +- frame/base/bli_sba.h | 12 +- frame/include/bli_extern_defs.h | 2 - frame/include/bli_oapi_ex.h | 2 +- frame/include/bli_obj_macro_defs.h | 241 +-- frame/include/bli_tapi_ex.h | 2 +- frame/include/bli_type_defs.h | 89 +- frame/thread/bli_l3_decor.h | 7 +- frame/thread/bli_l3_decor_openmp.c | 50 +- frame/thread/bli_l3_decor_pthreads.c | 70 +- frame/thread/bli_l3_decor_single.c | 41 +- frame/thread/bli_l3_sup_decor.h | 2 +- frame/thread/bli_l3_sup_decor_openmp.c | 33 +- frame/thread/bli_l3_sup_decor_pthreads.c | 41 +- frame/thread/bli_l3_sup_decor_single.c | 83 +- frame/thread/bli_l3_sup_decor_single.h | 7 
+ frame/thread/bli_thrcomm.h | 4 +- frame/thread/bli_thrcomm_openmp.c | 12 +- frame/thread/bli_thrcomm_pthreads.c | 8 +- frame/thread/bli_thrcomm_single.c | 8 +- frame/thread/bli_thread.c | 42 +- frame/thread/bli_thread.h | 4 - frame/thread/bli_thrinfo.c | 601 +----- frame/thread/bli_thrinfo.h | 157 +- frame/thread/bli_thrinfo_sup.c | 290 --- frame/thread/bli_thrinfo_sup.h | 66 - kernels/zen/1/bli_scalv_zen_int10.c | 4 - kernels/zen/1f/bli_axpyf_zen_int_4.c | 2 - kernels/zen/1f/bli_axpyf_zen_int_5.c | 8 +- ref_kernels/bli_cntx_ref.c | 199 +- ref_kernels/ind/bli_gemm1m_ref.c | 49 +- 167 files changed, 5509 insertions(+), 10631 deletions(-) delete mode 100644 frame/1m/packm/bli_packm_cntl.c delete mode 100644 frame/1m/packm/bli_packm_thrinfo.c delete mode 100644 frame/1m/unpackm/bli_unpackm_cntl.c delete mode 100644 frame/3/bli_l3_cntl.c delete mode 100644 frame/3/bli_l3_cntl.h create mode 100644 frame/3/bli_l3_sup_packm.c rename frame/{1m/packm/bli_packm_thrinfo.h => 3/bli_l3_sup_packm.h} (56%) delete mode 100644 frame/3/bli_l3_sup_packm_a.c delete mode 100644 frame/3/bli_l3_sup_packm_a.h delete mode 100644 frame/3/bli_l3_sup_packm_b.c delete mode 100644 frame/3/bli_l3_sup_packm_b.h rename frame/{1m/unpackm/bli_unpackm_cntl.h => base/bli_plugin.h} (68%) delete mode 100644 frame/thread/bli_thrinfo_sup.c delete mode 100644 frame/thread/bli_thrinfo_sup.h diff --git a/build/flatten-headers.py b/build/flatten-headers.py index 563725a7e9..3863d196e3 100755 --- a/build/flatten-headers.py +++ b/build/flatten-headers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -216,7 +216,9 @@ def flatten_header( inputfile, header_dirpaths, cursp ): ifile = open( inputfile, "r" ) # Iterate over the lines in the file. + lineno = 0 while True: + lineno += 1 # Read a line in the file. 
line = ifile.readline() @@ -268,12 +270,14 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Mark the beginning of the header being inserted. ostring += "%s%s%c" % ( beginstr, header, '\n' ) + ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' ) # Recurse on the header, accumulating the string. ostring += flatten_header( header_path, header_dirpaths, cursp + " " ) # Mark the end of the header being inserted. ostring += "%s%s%c" % ( endstr, header, '\n' ) + ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' ) echov2( "%sheader file '%s' fully processed." \ % ( cursp, header_path ) ) @@ -300,7 +304,7 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # endif # endwhile - + # Close the input file. ifile.close() @@ -330,7 +334,7 @@ def find_header_dirs( dirpath ): #endfor return header_dirpaths - + # ------------------------------------------------------------------------------ diff --git a/config/a64fx/bli_cntx_init_a64fx.c b/config/a64fx/bli_cntx_init_a64fx.c index dd920bcec0..ed2dac3dd5 100644 --- a/config/a64fx/bli_cntx_init_a64fx.c +++ b/config/a64fx/bli_cntx_init_a64fx.c @@ -91,11 +91,11 @@ void bli_cntx_init_a64fx( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/armsve/bli_cntx_init_armsve.c b/config/armsve/bli_cntx_init_armsve.c index 6339ba381e..c84cc16239 100644 --- a/config/armsve/bli_cntx_init_armsve.c +++ b/config/armsve/bli_cntx_init_armsve.c @@ -126,11 +126,11 @@ void bli_cntx_init_armsve( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - 
BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/bgq/bli_cntx_init_bgq.c b/config/bgq/bli_cntx_init_bgq.c index d3871d8f77..160bf4dedc 100644 --- a/config/bgq/bli_cntx_init_bgq.c +++ b/config/bgq/bli_cntx_init_bgq.c @@ -82,11 +82,11 @@ void bli_cntx_init_bgq( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/bulldozer/bli_cntx_init_bulldozer.c b/config/bulldozer/bli_cntx_init_bulldozer.c index 5b056f591f..bd74bf9f0e 100644 --- a/config/bulldozer/bli_cntx_init_bulldozer.c +++ b/config/bulldozer/bli_cntx_init_bulldozer.c @@ -86,11 +86,11 @@ void bli_cntx_init_bulldozer( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/cortexa15/bli_cntx_init_cortexa15.c b/config/cortexa15/bli_cntx_init_cortexa15.c index 28ebdef71b..e85d53e112 100644 --- a/config/cortexa15/bli_cntx_init_cortexa15.c +++ b/config/cortexa15/bli_cntx_init_cortexa15.c @@ -90,11 +90,11 @@ void bli_cntx_init_cortexa15( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], 
BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/cortexa53/bli_cntx_init_cortexa53.c b/config/cortexa53/bli_cntx_init_cortexa53.c index 4957de04e5..e0a5dd0537 100644 --- a/config/cortexa53/bli_cntx_init_cortexa53.c +++ b/config/cortexa53/bli_cntx_init_cortexa53.c @@ -82,11 +82,11 @@ void bli_cntx_init_cortexa53( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/cortexa57/bli_cntx_init_cortexa57.c b/config/cortexa57/bli_cntx_init_cortexa57.c index 28558bc522..7c03faca29 100644 --- a/config/cortexa57/bli_cntx_init_cortexa57.c +++ b/config/cortexa57/bli_cntx_init_cortexa57.c @@ -82,11 +82,11 @@ void bli_cntx_init_cortexa57( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/cortexa9/bli_cntx_init_cortexa9.c b/config/cortexa9/bli_cntx_init_cortexa9.c index 6af3ff91ce..de75744b42 100644 --- a/config/cortexa9/bli_cntx_init_cortexa9.c +++ b/config/cortexa9/bli_cntx_init_cortexa9.c @@ -82,11 +82,11 @@ void bli_cntx_init_cortexa9( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, 
&blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/excavator/bli_cntx_init_excavator.c b/config/excavator/bli_cntx_init_excavator.c index d36865b216..ae92ce8416 100644 --- a/config/excavator/bli_cntx_init_excavator.c +++ b/config/excavator/bli_cntx_init_excavator.c @@ -86,11 +86,11 @@ void bli_cntx_init_excavator( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/firestorm/bli_cntx_init_firestorm.c b/config/firestorm/bli_cntx_init_firestorm.c index 8e4d0088d5..765ec4b74c 100644 --- a/config/firestorm/bli_cntx_init_firestorm.c +++ b/config/firestorm/bli_cntx_init_firestorm.c @@ -123,23 +123,23 @@ void bli_cntx_init_firestorm( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], // sup thresholds - BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, - BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, - BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_MT, &blkszs[ BLIS_MT ], + BLIS_NT, &blkszs[ BLIS_NT ], + BLIS_KT, &blkszs[ BLIS_KT ], // level-3 sup - BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, - BLIS_KC_SUP, 
&blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, - BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, - BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, - BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_VA_END ); diff --git a/config/haswell/bli_cntx_init_haswell.c b/config/haswell/bli_cntx_init_haswell.c index fe3b451475..1a79c29a53 100644 --- a/config/haswell/bli_cntx_init_haswell.c +++ b/config/haswell/bli_cntx_init_haswell.c @@ -232,27 +232,27 @@ void bli_cntx_init_haswell( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_AF, &blkszs[ BLIS_AF ], + BLIS_DF, &blkszs[ BLIS_DF ], // gemmsup thresholds - BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, - BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, - BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_MT, &blkszs[ BLIS_MT ], + BLIS_NT, &blkszs[ BLIS_NT ], + BLIS_KT, &blkszs[ BLIS_KT ], // level-3 sup - BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, - BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, - BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, - BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, - BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_VA_END ); diff --git a/config/knc/bli_cntx_init_knc.c 
b/config/knc/bli_cntx_init_knc.c index 8f615588c6..7ba1dd289d 100644 --- a/config/knc/bli_cntx_init_knc.c +++ b/config/knc/bli_cntx_init_knc.c @@ -82,11 +82,11 @@ void bli_cntx_init_knc( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/knl/bli_cntx_init_knl.c b/config/knl/bli_cntx_init_knl.c index 87fa3176ab..c49a6e2a4b 100644 --- a/config/knl/bli_cntx_init_knl.c +++ b/config/knl/bli_cntx_init_knl.c @@ -130,15 +130,15 @@ void bli_cntx_init_knl( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_AF, &blkszs[ BLIS_AF ], + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_VA_END ); diff --git a/config/penryn/bli_cntx_init_penryn.c b/config/penryn/bli_cntx_init_penryn.c index 964438e834..c31feeb19c 100644 --- a/config/penryn/bli_cntx_init_penryn.c +++ b/config/penryn/bli_cntx_init_penryn.c @@ -90,11 +90,11 @@ void bli_cntx_init_penryn( cntx_t* cntx ) cntx, // level-1 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + 
BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/piledriver/bli_cntx_init_piledriver.c b/config/piledriver/bli_cntx_init_piledriver.c index 1c9a96fd9e..97581dd7e9 100644 --- a/config/piledriver/bli_cntx_init_piledriver.c +++ b/config/piledriver/bli_cntx_init_piledriver.c @@ -86,11 +86,11 @@ void bli_cntx_init_piledriver( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/power10/bli_cntx_init_power10.c b/config/power10/bli_cntx_init_power10.c index 12d9f51c6c..46bf8325fa 100644 --- a/config/power10/bli_cntx_init_power10.c +++ b/config/power10/bli_cntx_init_power10.c @@ -92,11 +92,11 @@ void bli_cntx_init_power10( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/power7/bli_cntx_init_power7.c b/config/power7/bli_cntx_init_power7.c index d5ffe7dcfa..150fcd9525 100644 --- a/config/power7/bli_cntx_init_power7.c +++ b/config/power7/bli_cntx_init_power7.c @@ -80,11 +80,11 @@ void bli_cntx_init_power7( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + 
BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/power9/bli_cntx_init_power9.c b/config/power9/bli_cntx_init_power9.c index 9f2d67632e..520bdaab98 100644 --- a/config/power9/bli_cntx_init_power9.c +++ b/config/power9/bli_cntx_init_power9.c @@ -92,11 +92,11 @@ void bli_cntx_init_power9( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/sandybridge/bli_cntx_init_sandybridge.c b/config/sandybridge/bli_cntx_init_sandybridge.c index 0697a3351c..5f4be203ed 100644 --- a/config/sandybridge/bli_cntx_init_sandybridge.c +++ b/config/sandybridge/bli_cntx_init_sandybridge.c @@ -86,11 +86,11 @@ void bli_cntx_init_sandybridge( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/skx/bli_cntx_init_skx.c b/config/skx/bli_cntx_init_skx.c index 3af58b38d2..8c1d906946 100644 --- a/config/skx/bli_cntx_init_skx.c +++ b/config/skx/bli_cntx_init_skx.c @@ -125,15 +125,15 @@ void bli_cntx_init_skx( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + 
BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_AF, &blkszs[ BLIS_AF ], + BLIS_DF, &blkszs[ BLIS_DF ], BLIS_VA_END ); diff --git a/config/steamroller/bli_cntx_init_steamroller.c b/config/steamroller/bli_cntx_init_steamroller.c index 4b4ecdf4e6..961f65bf6b 100644 --- a/config/steamroller/bli_cntx_init_steamroller.c +++ b/config/steamroller/bli_cntx_init_steamroller.c @@ -86,11 +86,11 @@ void bli_cntx_init_steamroller( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/template/bli_cntx_init_template.c b/config/template/bli_cntx_init_template.c index 4bacc5d63c..ae61a02a31 100644 --- a/config/template/bli_cntx_init_template.c +++ b/config/template/bli_cntx_init_template.c @@ -100,11 +100,11 @@ void bli_cntx_init_template( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/thunderx2/bli_cntx_init_thunderx2.c b/config/thunderx2/bli_cntx_init_thunderx2.c index 9d1af2c99c..2527512f7f 100644 --- a/config/thunderx2/bli_cntx_init_thunderx2.c +++ b/config/thunderx2/bli_cntx_init_thunderx2.c @@ -82,11 +82,11 @@ void bli_cntx_init_thunderx2( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC 
], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], BLIS_VA_END ); diff --git a/config/zen/bli_cntx_init_zen.c b/config/zen/bli_cntx_init_zen.c index a10986b234..061a96c6d1 100644 --- a/config/zen/bli_cntx_init_zen.c +++ b/config/zen/bli_cntx_init_zen.c @@ -292,27 +292,27 @@ void bli_cntx_init_zen( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_AF, &blkszs[ BLIS_AF ], + BLIS_DF, &blkszs[ BLIS_DF ], // sup thresholds - BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, - BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, - BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_MT, &blkszs[ BLIS_MT ], + BLIS_NT, &blkszs[ BLIS_NT ], + BLIS_KT, &blkszs[ BLIS_KT ], // gemmsup - BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, - BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, - BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, - BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, - BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_VA_END ); @@ -324,7 +324,7 @@ void bli_cntx_init_zen( cntx_t* cntx ) bli_cntx_set_l3_sup_handlers ( cntx, - + BLIS_GEMM, bli_gemmsup_ref, //BLIS_GEMMT, bli_gemmtsup_ref, diff --git a/config/zen2/bli_cntx_init_zen2.c 
b/config/zen2/bli_cntx_init_zen2.c index c7e40b4d0e..ffe87353ef 100644 --- a/config/zen2/bli_cntx_init_zen2.c +++ b/config/zen2/bli_cntx_init_zen2.c @@ -249,27 +249,27 @@ void bli_cntx_init_zen2( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_AF, &blkszs[ BLIS_AF ], + BLIS_DF, &blkszs[ BLIS_DF ], // sup thresholds - BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, - BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, - BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_MT, &blkszs[ BLIS_MT ], + BLIS_NT, &blkszs[ BLIS_NT ], + BLIS_KT, &blkszs[ BLIS_KT ], // level-3 sup - BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NC_SUP, - BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KC_SUP, - BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MC_SUP, - BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, - BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_VA_END ); @@ -281,7 +281,7 @@ void bli_cntx_init_zen2( cntx_t* cntx ) bli_cntx_set_l3_sup_handlers ( cntx, - + BLIS_GEMM, bli_gemmsup_ref, //BLIS_GEMMT, bli_gemmtsup_ref, diff --git a/config/zen3/bli_cntx_init_zen3.c b/config/zen3/bli_cntx_init_zen3.c index 3ee385ed61..ffce6fca9d 100644 --- a/config/zen3/bli_cntx_init_zen3.c +++ b/config/zen3/bli_cntx_init_zen3.c @@ -266,27 +266,27 @@ void bli_cntx_init_zen3( cntx_t* cntx ) cntx, // level-3 - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], 
BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], // level-1f - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, + BLIS_AF, &blkszs[ BLIS_AF ], + BLIS_DF, &blkszs[ BLIS_DF ], // sup thresholds - BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, - BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, - BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, + BLIS_MT, &blkszs[ BLIS_MT ], + BLIS_NT, &blkszs[ BLIS_NT ], + BLIS_KT, &blkszs[ BLIS_KT ], // gemmsup - BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], BLIS_NR_SUP, - BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], BLIS_KR_SUP, - BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], BLIS_MR_SUP, - BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], BLIS_NR_SUP, - BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_MR_SUP, + BLIS_NC_SUP, &blkszs[ BLIS_NC_SUP ], + BLIS_KC_SUP, &blkszs[ BLIS_KC_SUP ], + BLIS_MC_SUP, &blkszs[ BLIS_MC_SUP ], + BLIS_NR_SUP, &blkszs[ BLIS_NR_SUP ], + BLIS_MR_SUP, &blkszs[ BLIS_MR_SUP ], BLIS_VA_END ); @@ -298,7 +298,7 @@ void bli_cntx_init_zen3( cntx_t* cntx ) bli_cntx_set_l3_sup_handlers ( cntx, - + BLIS_GEMM, bli_gemmsup_ref, //BLIS_GEMMT, bli_gemmtsup_ref, diff --git a/frame/1m/bli_l1m_oft_var.h b/frame/1m/bli_l1m_oft_var.h index 325ed0ecff..4888cbdaa7 100644 --- a/frame/1m/bli_l1m_oft_var.h +++ b/frame/1m/bli_l1m_oft_var.h @@ -48,9 +48,8 @@ typedef void (*PASTECH(opname,_var_oft)) \ const obj_t* a, \ obj_t* p, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ - const thrinfo_t* thread \ + const cntl_t* cntl, \ + thrinfo_t* thread \ ); GENTDEF( packm ) diff --git a/frame/1m/bli_l1m_unb_var1.c b/frame/1m/bli_l1m_unb_var1.c index c979f082aa..f758fb30e7 100644 --- a/frame/1m/bli_l1m_unb_var1.c +++ b/frame/1m/bli_l1m_unb_var1.c @@ -51,8 +51,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* 
cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -168,8 +167,7 @@ void PASTEMAC(ch,opname) \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -286,8 +284,7 @@ void PASTEMAC(ch,opname) \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -394,8 +391,7 @@ void PASTEMAC(ch,opname) \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ const num_t dt = PASTEMAC(ch,type); \ @@ -512,8 +508,7 @@ void PASTEMAC2(chx,chy,opname) \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ) \ { \ uplo_t uplox_eff; \ diff --git a/frame/1m/bli_l1m_unb_var1.h b/frame/1m/bli_l1m_unb_var1.h index 0364d4b7cd..9130461bc2 100644 --- a/frame/1m/bli_l1m_unb_var1.h +++ b/frame/1m/bli_l1m_unb_var1.h @@ -50,8 +50,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ dim_t n, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( addm ) @@ -73,8 +72,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( axpym ) @@ -94,8 +92,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ dim_t n, \ ctype* alpha, \ ctype* x, inc_t rs_x, inc_t cs_x, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( scalm ) @@ -116,8 +113,7 @@ void PASTEMAC2(ch,opname,_unb_var1) \ ctype* x, inc_t rs_x, inc_t cs_x, \ ctype* beta, \ ctype* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* 
rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT_BASIC0( xpbym ) @@ -137,8 +133,7 @@ void PASTEMAC3(chx,chy,opname,_unb_var1) \ ctype_x* x, inc_t rs_x, inc_t cs_x, \ ctype_y* beta, \ ctype_y* y, inc_t rs_y, inc_t cs_y, \ - cntx_t* cntx, \ - rntm_t* rntm \ + cntx_t* cntx \ ); INSERT_GENTPROT2_BASIC0( xpbym_md ) diff --git a/frame/1m/packm/bli_packm_alloc.c b/frame/1m/packm/bli_packm_alloc.c index 22ed31ecc5..8e4ede2597 100644 --- a/frame/1m/packm/bli_packm_alloc.c +++ b/frame/1m/packm/bli_packm_alloc.c @@ -38,9 +38,8 @@ void* bli_packm_alloc ( siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { // Query the pack buffer type from the control tree node. @@ -50,35 +49,32 @@ void* bli_packm_alloc ( size_needed, pack_buf_type, - rntm, - cntl, thread ); } void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + siz_t size_needed, + packbuf_t pack_buf_type, + thrinfo_t* thread ) { // Query the address of the mem_t entry within the control tree node. 
- mem_t* cntl_mem_p = bli_cntl_pack_mem( cntl ); + mem_t* cntl_mem_p = bli_thread_mem( thread ); + pba_t* pba = bli_thread_pba( thread ); mem_t* local_mem_p; mem_t local_mem_s; - siz_t cntl_mem_size = 0; + siz_t cntl_mem_size = 0; if ( bli_mem_is_alloc( cntl_mem_p ) ) cntl_mem_size = bli_mem_size( cntl_mem_p ); if ( cntl_mem_size < size_needed ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) { // The chief thread releases the existing block associated with // the mem_t entry in the control tree, and then re-acquires a @@ -87,14 +83,14 @@ void* bli_packm_alloc_ex { bli_pba_release ( - rntm, + pba, cntl_mem_p ); } bli_pba_acquire_m ( - rntm, + pba, size_needed, pack_buf_type, &local_mem_s diff --git a/frame/1m/packm/bli_packm_alloc.h b/frame/1m/packm/bli_packm_alloc.h index aec2e1af53..e308709b0e 100644 --- a/frame/1m/packm/bli_packm_alloc.h +++ b/frame/1m/packm/bli_packm_alloc.h @@ -35,17 +35,14 @@ BLIS_EXPORT_BLIS void* bli_packm_alloc ( siz_t size_needed, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); BLIS_EXPORT_BLIS void* bli_packm_alloc_ex ( - siz_t size_needed, - packbuf_t pack_buf_type, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + siz_t size_needed, + packbuf_t pack_buf_type, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 601f2c05c5..3928f780c9 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -57,9 +57,8 @@ void bli_packm_blk_var1 const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { // Extract various fields from the control tree. @@ -71,7 +70,7 @@ void bli_packm_blk_var1 // Every thread initializes p and determines the size of memory // block needed (which gets embedded into the otherwise "blank" mem_t // entry in the control tree node). 
Return early if no packing is required. - if ( !bli_packm_init( c, p, cntx, rntm, cntl, thread ) ) + if ( !bli_packm_init( c, p, cntx, cntl, thread ) ) return; // Check parameters. @@ -111,29 +110,25 @@ void bli_packm_blk_var1 obj_t kappa_local; char* kappa_cast = bli_packm_scalar( &kappa_local, p ); - // we use the default lookup table to determine the right func_t - // for the current schema. - func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ]; - // Query the datatype-specific function pointer from the func_t object. - packm_ker_vft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers ); - - // For mixed-precision gemm, select the proper kernel (only dense panels). - if ( dt_c != dt_p ) - { - packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ]; - } - - // Query the address of the packm params field of the obj_t. The user might - // have set this field in order to specify a custom packm kernel. - packm_blk_var1_params_t* params = bli_obj_pack_params( c ); - - if ( params && params->ukr_fn[ dt_c ][ dt_p ] ) - { - // Query the user-provided packing kernel from the obj_t. If provided, - // this overrides the kernel determined above. - packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ]; - } + packm_ker_vft packm_ker_cast = bli_cntl_packm_params_ukr( dt_c, dt_p, cntl ); + + if ( packm_ker_cast == NULL ) + { + if ( dt_c == dt_p ) + { + // we use the default lookup table to determine the right func_t + // for the current schema. + func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ]; + + bli_func_get_dt( dt_p, packm_kers ); + } + // For mixed-precision gemm, select the proper kernel (only dense panels). + else + { + packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ]; + } + } /* Compute the total number of iterations we'll need. */ dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 
1 : 0 ); @@ -272,7 +267,7 @@ void bli_packm_blk_var1 p_use, ldp, is_p_use, ( cntx_t* )cntx, - params ); + bli_cntl_params( cntl ) ); } // NOTE: This value is usually LESS than ps_p because triangular @@ -304,7 +299,7 @@ void bli_packm_blk_var1 c_begin, incc, ldc, p_begin, ldp, is_p, ( cntx_t* )cntx, - params ); + bli_cntl_params( cntl ) ); } } diff --git a/frame/1m/packm/bli_packm_blk_var1.h b/frame/1m/packm/bli_packm_blk_var1.h index 2fec23902e..34ea02418e 100644 --- a/frame/1m/packm/bli_packm_blk_var1.h +++ b/frame/1m/packm/bli_packm_blk_var1.h @@ -33,16 +33,6 @@ */ -// -// packm params types. -// - -typedef struct -{ - // Type of C Type of P - packm_ker_vft ukr_fn[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; -} packm_blk_var1_params_t; - // // Prototype object-based interfaces. // @@ -52,8 +42,7 @@ BLIS_EXPORT_BLIS void bli_packm_blk_var1 const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* t + const cntl_t* cntl, + thrinfo_t* t ); diff --git a/frame/1m/packm/bli_packm_cntl.c b/frame/1m/packm/bli_packm_cntl.c deleted file mode 100644 index e99ed9cf3d..0000000000 --- a/frame/1m/packm/bli_packm_cntl.c +++ /dev/null @@ -1,92 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. 
- - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -BLIS_EXPORT_BLIS cntl_t* bli_packm_cntl_create_node - ( - rntm_t* rntm, - void_fp var_func, - bszid_t bmid_m, - bszid_t bmid_n, - bool does_invert_diag, - bool rev_iter_if_upper, - bool rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type, - cntl_t* sub_node - ) -{ - cntl_t* cntl; - packm_params_t* params; - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_packm_cntl_create_node(): " ); - #endif - - // Allocate a packm_params_t struct. - params = bli_sba_acquire( rntm, sizeof( packm_params_t ) ); - - // Initialize the packm_params_t struct. 
- params->size = sizeof( packm_params_t ); - params->bmid_m = bmid_m; - params->bmid_n = bmid_n; - params->does_invert_diag = does_invert_diag; - params->rev_iter_if_upper = rev_iter_if_upper; - params->rev_iter_if_lower = rev_iter_if_lower; - params->pack_schema = pack_schema; - params->pack_buf_type = pack_buf_type; - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_packm_cntl_create_node(): " ); - #endif - - // It's important that we set the bszid field to BLIS_NO_PART to indicate - // that no blocksize partitioning is performed. bli_cntl_free() will rely - // on this information to know how to step through the thrinfo_t tree in - // sync with the cntl_t tree. - cntl = bli_cntl_create_node - ( - rntm, - BLIS_NOID, - BLIS_NO_PART, - var_func, - params, - sub_node - ); - - return cntl; -} - diff --git a/frame/1m/packm/bli_packm_cntl.h b/frame/1m/packm/bli_packm_cntl.h index b923682df6..544f739b10 100644 --- a/frame/1m/packm/bli_packm_cntl.h +++ b/frame/1m/packm/bli_packm_cntl.h @@ -35,65 +35,72 @@ struct packm_params_s { - uint64_t size; // size field must be present and come first. 
- bszid_t bmid_m; - bszid_t bmid_n; - bool does_invert_diag; - bool rev_iter_if_upper; - bool rev_iter_if_lower; - pack_t pack_schema; - packbuf_t pack_buf_type; + func2_t ukr; }; typedef struct packm_params_s packm_params_t; -BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_m( const cntl_t* cntl ) +struct packm_cntl_s { - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_m; + bool does_invert_diag; + bool rev_iter_if_upper; + bool rev_iter_if_lower; + pack_t pack_schema; + packbuf_t pack_buf_type; + num_t dt_pack; + dim_t mr; + dim_t nr; + dim_t kr; + const void* params; +}; +typedef struct packm_cntl_s packm_cntl_t; + +BLIS_INLINE const blksz_t* bli_cntl_packm_mr( const cntl_t* cntl ) +{ + const packm_params_t* ppp = ( ( packm_cntl_t* )cntl->params )->params; return &ppp->mr; +} + +BLIS_INLINE const blksz_t* bli_cntl_packm_nr( const cntl_t* cntl ) +{ + const packm_params_t* ppp = ( ( packm_cntl_t* )cntl->params )->params; return &ppp->nr; } -BLIS_INLINE bszid_t bli_cntl_packm_params_bmid_n( const cntl_t* cntl ) +BLIS_INLINE const blksz_t* bli_cntl_packm_kr( const cntl_t* cntl ) { - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->bmid_n; + const packm_params_t* ppp = ( ( packm_cntl_t* )cntl->params )->params; return &ppp->kr; } -BLIS_INLINE bool bli_cntl_packm_params_does_invert_diag( const cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_does_invert_diag( const cntl_t* cntl ) { - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->does_invert_diag; + packm_cntl_t* ppp = ( packm_cntl_t* )cntl->params; return ppp->does_invert_diag; } -BLIS_INLINE bool bli_cntl_packm_params_rev_iter_if_upper( const cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_rev_iter_if_upper( const cntl_t* cntl ) { - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_upper; + packm_cntl_t* ppp = ( packm_cntl_t* )cntl->params; return ppp->rev_iter_if_upper; } -BLIS_INLINE bool 
bli_cntl_packm_params_rev_iter_if_lower( const cntl_t* cntl ) +BLIS_INLINE bool bli_cntl_packm_rev_iter_if_lower( const cntl_t* cntl ) { - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->rev_iter_if_lower; + packm_cntl_t* ppp = ( packm_cntl_t* )cntl->params; return ppp->rev_iter_if_lower; } -BLIS_INLINE pack_t bli_cntl_packm_params_pack_schema( const cntl_t* cntl ) +BLIS_INLINE pack_t bli_cntl_packm_pack_schema( const cntl_t* cntl ) { - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_schema; + packm_cntl_t* ppp = ( packm_cntl_t* )cntl->params; return ppp->pack_schema; } -BLIS_INLINE packbuf_t bli_cntl_packm_params_pack_buf_type( const cntl_t* cntl ) +BLIS_INLINE packbuf_t bli_cntl_packm_pack_buf_type( const cntl_t* cntl ) { - packm_params_t* ppp = ( packm_params_t* )cntl->params; return ppp->pack_buf_type; + packm_cntl_t* ppp = ( packm_cntl_t* )cntl->params; return ppp->pack_buf_type; } -// ----------------------------------------------------------------------------- - -cntl_t* bli_packm_cntl_create_node - ( - rntm_t* rntm, - void_fp var_func, - bszid_t bmid_m, - bszid_t bmid_n, - bool does_invert_diag, - bool rev_iter_if_upper, - bool rev_iter_if_lower, - pack_t pack_schema, - packbuf_t pack_buf_type, - cntl_t* sub_node - ); +BLIS_INLINE packm_ker_vft bli_cntl_packm_ukr_mr( num_t dt_c, num_t dt_p, const cntl_t* cntl ) +{ + const packm_params_t* ppp = ( ( packm_cntl_t* )cntl->params )->params; return ppp->ukr_mr[ dt_c ][ dt_p ]; +} + +BLIS_INLINE packm_ker_vft bli_cntl_packm_ukr_nr( num_t dt_c, num_t dt_p, const cntl_t* cntl ) +{ + const packm_params_t* ppp = ( ( packm_cntl_t* )cntl->params )->params; return ppp->ukr_nr[ dt_c ][ dt_p ]; +} diff --git a/frame/1m/packm/bli_packm_init.c b/frame/1m/packm/bli_packm_init.c index 67e02ac0e5..a8156dffa6 100644 --- a/frame/1m/packm/bli_packm_init.c +++ b/frame/1m/packm/bli_packm_init.c @@ -40,9 +40,8 @@ bool bli_packm_init const obj_t* c, obj_t* p, const cntx_t* cntx, - rntm_t* 
rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { bli_init_once(); @@ -66,14 +65,14 @@ bool bli_packm_init return false; // Extract various fields from the control tree. - bszid_t bmult_id_m = bli_cntl_packm_params_bmid_m( cntl ); - bszid_t bmult_id_n = bli_cntl_packm_params_bmid_n( cntl ); - pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); - num_t dt_tar = bli_obj_target_dt( c ); - num_t dt_scalar = bli_obj_scalar_dt( c ); - dim_t bmult_m_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_m_pack = bli_cntx_get_blksz_max_dt( dt_tar, bmult_id_m, cntx ); - dim_t bmult_n_def = bli_cntx_get_blksz_def_dt( dt_tar, bmult_id_n, cntx ); + const blksz_t* bmult_m = bli_cntl_packm_params_mr( cntl ); + const blksz_t* bmult_n = bli_cntl_packm_params_kr( cntl ); + pack_t schema = bli_cntl_packm_params_pack_schema( cntl ); + num_t dt_tar = bli_obj_target_dt( c ); + num_t dt_scalar = bli_obj_scalar_dt( c ); + dim_t bmult_m_def = bli_blksz_get_def( dt_tar, bmult_m ); + dim_t bmult_m_pack = bli_blksz_get_max( dt_tar, bmult_m ); + dim_t bmult_n_def = bli_blksz_get_def( dt_tar, bmult_n ); // Typecast the internal scalar value to the target datatype. // Note that if the typecasting is needed, this must happen BEFORE we @@ -179,7 +178,7 @@ bool bli_packm_init // Update the buffer address in p to point to the buffer associated // with the mem_t entry acquired from the memory broker (now cached in // the control tree node). 
- void* buffer = bli_packm_alloc( size_p, rntm, cntl, thread ); + void* buffer = bli_packm_alloc( size_p, cntl, thread ); bli_obj_set_buffer( buffer, p ); return true; diff --git a/frame/1m/packm/bli_packm_init.h b/frame/1m/packm/bli_packm_init.h index 6f9b472736..b34bd53799 100644 --- a/frame/1m/packm/bli_packm_init.h +++ b/frame/1m/packm/bli_packm_init.h @@ -37,8 +37,7 @@ BLIS_EXPORT_BLIS bool bli_packm_init const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_int.c b/frame/1m/packm/bli_packm_int.c index f76607508c..b918d50bd7 100644 --- a/frame/1m/packm/bli_packm_int.c +++ b/frame/1m/packm/bli_packm_int.c @@ -39,9 +39,8 @@ void bli_packm_int const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ) { bli_init_once(); @@ -59,7 +58,6 @@ void bli_packm_int a, p, cntx, - rntm, cntl, thread ); diff --git a/frame/1m/packm/bli_packm_int.h b/frame/1m/packm/bli_packm_int.h index a4cf17d592..b7720cd3e6 100644 --- a/frame/1m/packm/bli_packm_int.h +++ b/frame/1m/packm/bli_packm_int.h @@ -37,7 +37,6 @@ void bli_packm_int const obj_t* a, obj_t* p, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, - const thrinfo_t* thread + const cntl_t* cntl, + thrinfo_t* thread ); diff --git a/frame/1m/packm/bli_packm_thrinfo.c b/frame/1m/packm/bli_packm_thrinfo.c deleted file mode 100644 index 4b57971ef2..0000000000 --- a/frame/1m/packm/bli_packm_thrinfo.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_packm_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bszid_t bszid, - thrinfo_t* sub_node - ) -{ - bli_thrinfo_init - ( - thread, - ocomm, ocomm_id, - n_way, work_id, - FALSE, - BLIS_NO_PART, - sub_node - ); -} - -void bli_packm_thrinfo_init_single - ( - thrinfo_t* thread - ) -{ - bli_packm_thrinfo_init - ( - thread, - &BLIS_SINGLE_COMM, 0, - 1, - 0, - BLIS_NO_PART, - NULL - ); -} - diff --git a/frame/1m/unpackm/bli_unpackm.h b/frame/1m/unpackm/bli_unpackm.h index 80fa3804a1..13e43d1857 100644 --- a/frame/1m/unpackm/bli_unpackm.h +++ b/frame/1m/unpackm/bli_unpackm.h @@ -32,7 +32,6 @@ */ -#include "bli_unpackm_cntl.h" #include "bli_unpackm_check.h" #include "bli_unpackm_int.h" diff --git a/frame/1m/unpackm/bli_unpackm_cntl.c b/frame/1m/unpackm/bli_unpackm_cntl.c deleted file mode 100644 index 95d0545bec..0000000000 --- a/frame/1m/unpackm/bli_unpackm_cntl.c +++ /dev/null @@ -1,77 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -cntl_t* bli_unpackm_cntl_create_node - ( - rntm_t* rntm, - void_fp var_func, - void_fp unpackm_var_func, - cntl_t* sub_node - ) -{ - cntl_t* cntl; - unpackm_params_t* params; - err_t r_val; - - // NOTE: If this function is ever called, figure out whether the - // bli_malloc_intl() below needs to be changed to bli_sba_acquire(). - bli_abort(); - - // Allocate an unpackm_params_t struct. - params = bli_malloc_intl( sizeof( unpackm_params_t ), &r_val ); - - // Initialize the unpackm_params_t struct. - params->size = sizeof( unpackm_params_t ); - params->var_func = unpackm_var_func; - - // It's important that we set the bszid field to BLIS_NO_PART to indicate - // that no blocksize partitioning is performed. bli_cntl_free() will rely - // on this information to know how to step through the thrinfo_t tree in - // sync with the cntl_t tree. 
- cntl = bli_cntl_create_node - ( - rntm, - BLIS_NOID, - BLIS_NO_PART, - var_func, - params, - sub_node - ); - - return cntl; -} - diff --git a/frame/1m/unpackm/bli_unpackm_int.c b/frame/1m/unpackm/bli_unpackm_int.c index f67cae084a..ed24366424 100644 --- a/frame/1m/unpackm/bli_unpackm_int.c +++ b/frame/1m/unpackm/bli_unpackm_int.c @@ -60,7 +60,7 @@ void bli_unpackm_int f = bli_cntl_unpackm_params_var_func( cntl ); // Invoke the variant. - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) { f ( diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 4dc1a9d545..57dbda05cd 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -33,7 +33,6 @@ */ -#include "bli_l3_cntl.h" #include "bli_l3_check.h" #include "bli_l3_int.h" #include "bli_l3_packab.h" @@ -71,8 +70,7 @@ #include "bli_l3_sup_ref.h" #include "bli_l3_sup_int.h" #include "bli_l3_sup_vars.h" -#include "bli_l3_sup_packm_a.h" -#include "bli_l3_sup_packm_b.h" +#include "bli_l3_sup_packm.h" #include "bli_l3_sup_packm_var.h" // Prototype microkernel wrapper APIs. 
diff --git a/frame/3/bli_l3_blocksize.c b/frame/3/bli_l3_blocksize.c index 78482b5f63..66236bce26 100644 --- a/frame/3/bli_l3_blocksize.c +++ b/frame/3/bli_l3_blocksize.c @@ -35,273 +35,114 @@ #include "blis.h" -dim_t bli_l3_determine_kc +void bli_l3_adjust_kc ( - dir_t direct, - dim_t i, - dim_t dim, - const obj_t* a, - const obj_t* b, - bszid_t bszid, - const cntx_t* cntx, - const cntl_t* cntl + const obj_t* a, + const obj_t* b, + dim_t mr, + dim_t nr, + dim_t* bsize, + dim_t* bsize_max, + opid_t family ) { - opid_t family = bli_cntl_family( cntl ); - if ( family == BLIS_GEMM ) - return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + bli_gemm_adjust_kc( a, b, mr, nr, bsize, bsize_max ); else if ( family == BLIS_GEMMT ) - return bli_gemmt_determine_kc( direct, i, dim, a, b, bszid, cntx ); + bli_gemmt_adjust_kc( a, b, mr, nr, bsize, bsize_max ); else if ( family == BLIS_TRMM ) - return bli_trmm_determine_kc( direct, i, dim, a, b, bszid, cntx ); + bli_trmm_adjust_kc( a, b, mr, nr, bsize, bsize_max ); else if ( family == BLIS_TRSM ) - return bli_trsm_determine_kc( direct, i, dim, a, b, bszid, cntx ); - - // This should never execute. - return bli_gemm_determine_kc( direct, i, dim, a, b, bszid, cntx ); -} - -// ----------------------------------------------------------------------------- - -// -// NOTE: We call a gemm/hemm/symm, trmm, or trsm-specific blocksize -// function to determine the kc blocksize so that we can implement the -// "nudging" of kc to be a multiple of mr or nr, as needed. 
-// - -#undef GENFRONT -#define GENFRONT( opname, l3op ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - if ( direct == BLIS_FWD ) \ - return PASTEMAC(l3op,_determine_kc_f)( i, dim, a, b, bszid, cntx ); \ - else \ - return PASTEMAC(l3op,_determine_kc_b)( i, dim, a, b, bszid, cntx ); \ + bli_trsm_adjust_kc( a, b, mr, nr, bsize, bsize_max ); } -GENFRONT( gemm_determine_kc, gemm ) -GENFRONT( gemmt_determine_kc, gemmt ) -GENFRONT( trmm_determine_kc, trmm ) -GENFRONT( trsm_determine_kc, trsm ) - // ----------------------------------------------------------------------------- -#undef GENFRONT -#define GENFRONT( opname, chdir ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - /* bli_*_determine_kc_f(): - - We assume that this function is being called from an algorithm that - is moving "forward" (ie: top to bottom, left to right, top-left - to bottom-right). */ \ -\ - /* bli_*_determine_kc_b(): - - We assume that this function is being called from an algorithm that - is moving "backward" (ie: bottom to top, right to left, bottom-right - to top-left). */ \ -\ - /* Extract the execution datatype and use it to query the corresponding - blocksize and blocksize maximum values from the blksz_t object. */ \ - const num_t dt = bli_obj_exec_dt( a ); \ - const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ - dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ - dim_t b_max = bli_blksz_get_max( dt, bsize ); \ -\ +void bli_gemm_adjust_kc + ( + const obj_t* a, + const obj_t* b, + dim_t mr, + dim_t nr, + dim_t* bsize, + dim_t* bsize_max + ) +{ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR if A is Hermitian or symmetric, or NR if B is Hermitian or symmetric. 
If neither case applies, then we leave - the blocksizes unchanged. */ \ - dim_t mnr; \ - if ( bli_obj_root_is_herm_or_symm( a ) ) \ - { \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ - } \ - else if ( bli_obj_root_is_herm_or_symm( b ) ) \ - { \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ - } \ -\ - /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined - in bli_blksz.c */ \ - return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ + the blocksizes unchanged. */ + if ( bli_obj_root_is_herm_or_symm( a ) ) + { + *bsize = bli_align_dim_to_mult( *bsize, mr ); + *bsize_max = bli_align_dim_to_mult( *bsize_max, mr ); + } + else if ( bli_obj_root_is_herm_or_symm( b ) ) + { + *bsize = bli_align_dim_to_mult( *bsize, nr ); + *bsize_max = bli_align_dim_to_mult( *bsize_max, nr ); + } } -GENFRONT( gemm_determine_kc_f, f ) -GENFRONT( gemm_determine_kc_b, b ) - // ----------------------------------------------------------------------------- -#undef GENFRONT -#define GENFRONT( opname, chdir ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - /* bli_*_determine_kc_f(): - - We assume that this function is being called from an algorithm that - is moving "forward" (ie: top to bottom, left to right, top-left - to bottom-right). */ \ -\ - /* bli_*_determine_kc_b(): - - We assume that this function is being called from an algorithm that - is moving "backward" (ie: bottom to top, right to left, bottom-right - to top-left). */ \ -\ - /* Extract the execution datatype and use it to query the corresponding - blocksize and blocksize maximum values from the blksz_t object. 
*/ \ - const num_t dt = bli_obj_exec_dt( a ); \ - const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ - const dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ - const dim_t b_max = bli_blksz_get_max( dt, bsize ); \ -\ +void bli_gemmt_adjust_kc + ( + const obj_t* a, + const obj_t* b, + dim_t mr, + dim_t nr, + dim_t* bsize, + dim_t* bsize_max + ) +{ /* Notice that for gemmt, we do not need to perform any special handling - for the default and maximum kc blocksizes vis-a-vis MR or NR. */ \ -\ - /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined - in bli_blksz.c */ \ - return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ + for the default and maximum kc blocksizes vis-a-vis MR or NR. */ } -GENFRONT( gemmt_determine_kc_f, f ) -GENFRONT( gemmt_determine_kc_b, b ) - // ----------------------------------------------------------------------------- -#undef GENFRONT -#define GENFRONT( opname, chdir ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - /* bli_*_determine_kc_f(): - - We assume that this function is being called from an algorithm that - is moving "forward" (ie: top to bottom, left to right, top-left - to bottom-right). */ \ -\ - /* bli_*_determine_kc_b(): - - We assume that this function is being called from an algorithm that - is moving "backward" (ie: bottom to top, right to left, bottom-right - to top-left). */ \ -\ - /* Extract the execution datatype and use it to query the corresponding - blocksize and blocksize maximum values from the blksz_t object. 
*/ \ - const num_t dt = bli_obj_exec_dt( a ); \ - const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ - dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ - dim_t b_max = bli_blksz_get_max( dt, bsize ); \ -\ +void bli_trmm_adjust_kc + ( + const obj_t* a, + const obj_t* b, + dim_t mr, + dim_t nr, + dim_t* bsize, + dim_t* bsize_max + ) +{ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR if the triangular matrix is on the left, or NR - if the triangular matrix is one the right. */ \ - dim_t mnr; \ - if ( bli_obj_root_is_triangular( a ) ) \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - else \ - mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_NR, cntx ); \ -\ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ -\ - /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined - in bli_blksz.c */ \ - return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ + if the triangular matrix is one the right. */ + dim_t mnr; + if ( bli_obj_root_is_triangular( a ) ) + mnr = mr; + else + mnr = nr; + + *bsize = bli_align_dim_to_mult( *bsize, mnr ); + *bsize_max = bli_align_dim_to_mult( *bsize_max, mnr ); } -GENFRONT( trmm_determine_kc_f, f ) -GENFRONT( trmm_determine_kc_b, b ) - // ----------------------------------------------------------------------------- -#undef GENFRONT -#define GENFRONT( opname, chdir ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ) \ -{ \ - /* bli_*_determine_kc_f(): - - We assume that this function is being called from an algorithm that - is moving "forward" (ie: top to bottom, left to right, top-left - to bottom-right). */ \ -\ - /* bli_*_determine_kc_b(): - - We assume that this function is being called from an algorithm that - is moving "backward" (ie: bottom to top, right to left, bottom-right - to top-left). 
*/ \ -\ - /* Extract the execution datatype and use it to query the corresponding - blocksize and blocksize maximum values from the blksz_t object. */ \ - const num_t dt = bli_obj_exec_dt( a ); \ - const blksz_t* bsize = bli_cntx_get_blksz( bszid, cntx ); \ - dim_t b_alg = bli_blksz_get_def( dt, bsize ); \ - dim_t b_max = bli_blksz_get_max( dt, bsize ); \ -\ +void bli_trsm_adjust_kc + ( + const obj_t* a, + const obj_t* b, + dim_t mr, + dim_t nr, + dim_t* bsize, + dim_t* bsize_max + ) +{ /* Nudge the default and maximum kc blocksizes up to the nearest multiple of MR. We always use MR (rather than sometimes using NR) because even when the triangle is on the right, packing of that matrix uses MR, since only left-side trsm micro-kernels are - supported. */ \ - const dim_t mnr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ - b_alg = bli_align_dim_to_mult( b_alg, mnr ); \ - b_max = bli_align_dim_to_mult( b_max, mnr ); \ -\ - /* Call the bli_determine_blocksize_[fb]_sub() helper routine defined - in bli_blksz.c */ \ - return PASTEMAC2(determine_blocksize_,chdir,_sub)( i, dim, b_alg, b_max ); \ + supported. 
*/ + *bsize = bli_align_dim_to_mult( *bsize, mr ); + *bsize_max = bli_align_dim_to_mult( *bsize_max, mr ); } -GENFRONT( trsm_determine_kc_f, f ) -GENFRONT( trsm_determine_kc_b, b ) - diff --git a/frame/3/bli_l3_blocksize.h b/frame/3/bli_l3_blocksize.h index 1ec889e030..7b3897dcb3 100644 --- a/frame/3/bli_l3_blocksize.h +++ b/frame/3/bli_l3_blocksize.h @@ -32,61 +32,33 @@ */ -dim_t bli_l3_determine_kc +void bli_l3_adjust_kc ( - dir_t direct, - dim_t i, - dim_t dim, - const obj_t* a, - const obj_t* b, - bszid_t bszid, - const cntx_t* cntx, - const cntl_t* cntl + const obj_t* a, \ + const obj_t* b, \ + dim_t mr, \ + dim_t nr, \ + dim_t* bsize, \ + dim_t* bsize_max, \ + opid_t family \ ); #undef GENPROT #define GENPROT( opname ) \ \ -dim_t PASTEMAC0(opname) \ +void PASTEMAC0(opname) \ ( \ - dir_t direct, \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ + const obj_t* a, \ + const obj_t* b, \ + dim_t mr, \ + dim_t nr, \ + dim_t* bsize, \ + dim_t* bsize_max \ ); -GENPROT( gemm_determine_kc ) -GENPROT( gemmt_determine_kc ) -GENPROT( trmm_determine_kc ) -GENPROT( trsm_determine_kc ) - - -#undef GENPROT -#define GENPROT( opname ) \ -\ -dim_t PASTEMAC0(opname) \ - ( \ - dim_t i, \ - dim_t dim, \ - const obj_t* a, \ - const obj_t* b, \ - bszid_t bszid, \ - const cntx_t* cntx \ - ); - -GENPROT( gemm_determine_kc_f ) -GENPROT( gemm_determine_kc_b ) - -GENPROT( gemmt_determine_kc_f ) -GENPROT( gemmt_determine_kc_b ) - -GENPROT( trmm_determine_kc_f ) -GENPROT( trmm_determine_kc_b ) - -GENPROT( trsm_determine_kc_f ) -GENPROT( trsm_determine_kc_b ) +GENPROT( gemm_adjust_kc ) +GENPROT( gemmt_adjust_kc ) +GENPROT( trmm_adjust_kc ) +GENPROT( trsm_adjust_kc ) diff --git a/frame/3/bli_l3_cntl.c b/frame/3/bli_l3_cntl.c deleted file mode 100644 index d7fd9649e8..0000000000 --- a/frame/3/bli_l3_cntl.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. 
- - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - - -void bli_l3_cntl_create_if - ( - opid_t family, - pack_t schema_a, - pack_t schema_b, - const obj_t* a, - const obj_t* b, - const obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, - cntl_t** cntl_use - ) -{ - // If the control tree pointer is NULL, we construct a default - // tree as a function of the operation family. 
- if ( cntl_orig == NULL ) - { - if ( family == BLIS_GEMM || - family == BLIS_GEMMT || - family == BLIS_TRMM ) - { - *cntl_use = bli_gemm_cntl_create - ( - rntm, - family, - schema_a, - schema_b, - bli_obj_ker_fn( c ) - ); - } - else // if ( family == BLIS_TRSM ) - { - side_t side; - - if ( bli_obj_is_triangular( a ) ) side = BLIS_LEFT; - else side = BLIS_RIGHT; - - *cntl_use = bli_trsm_cntl_create - ( - rntm, - side, - schema_a, - schema_b, - bli_obj_ker_fn( c ) - ); - } - } - else - { - // If the user provided a control tree, create a copy and use it - // instead (so that threads can use its local tree as a place to - // cache things like pack mem_t entries). - *cntl_use = bli_cntl_copy( rntm, cntl_orig ); - - // Recursively set the family fields of the newly copied control tree - // nodes. - bli_cntl_mark_family( family, *cntl_use ); - } -} - -void bli_l3_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl_use, - thrinfo_t* thread - ) -{ - // NOTE: We don't actually need to call separate _cntl_free() functions - // for gemm and trsm; it is merely an unnecessary mirroring of behavior - // from the _create() side (which must call different functions based - // on the family). - - opid_t family = bli_cntl_family( cntl_use ); - - if ( family == BLIS_GEMM || - family == BLIS_GEMMT || - family == BLIS_TRMM ) - { - bli_gemm_cntl_free( rntm, cntl_use, thread ); - } - else // if ( family == BLIS_TRSM ) - { - bli_trsm_cntl_free( rntm, cntl_use, thread ); - } -} - diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h deleted file mode 100644 index eb4321ecd7..0000000000 --- a/frame/3/bli_l3_cntl.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - - -// -// Prototype conditional control tree creation functions. 
-// - -void bli_l3_cntl_create_if - ( - opid_t family, - pack_t schema_a, - pack_t schema_b, - const obj_t* a, - const obj_t* b, - const obj_t* c, - rntm_t* rntm, - cntl_t* cntl_orig, - cntl_t** cntl_use - ); - -void bli_l3_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl_use, - thrinfo_t* thread - ); - diff --git a/frame/3/bli_l3_int.c b/frame/3/bli_l3_int.c index b786236ab9..07c5e6dcb8 100644 --- a/frame/3/bli_l3_int.c +++ b/frame/3/bli_l3_int.c @@ -42,8 +42,7 @@ void bli_l3_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -68,7 +67,7 @@ void bli_l3_int if ( bli_obj_has_zero_dim( a ) || bli_obj_has_zero_dim( b ) ) { - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) bli_scalm( beta, c ); bli_thread_barrier( thread ); return; @@ -82,7 +81,7 @@ void bli_l3_int // This should never execute. bli_abort(); - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) bli_scalm( beta, c ); bli_thread_barrier( thread ); return; @@ -130,9 +129,6 @@ void bli_l3_int if ( !bli_obj_equals( beta, &BLIS_ONE ) ) bli_obj_scalar_apply_scalar( beta, &c_local ); - // Create the next node in the thrinfo_t structure. - bli_thrinfo_grow( rntm, cntl, thread ); - // Extract the function pointer from the current control tree node. 
l3_var_oft f = bli_cntl_var_func( cntl ); @@ -143,7 +139,6 @@ void bli_l3_int &b_local, &c_local, cntx, - rntm, cntl, thread ); diff --git a/frame/3/bli_l3_int.h b/frame/3/bli_l3_int.h index 65485206de..8364d91e4f 100644 --- a/frame/3/bli_l3_int.h +++ b/frame/3/bli_l3_int.h @@ -40,8 +40,7 @@ void bli_l3_int const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index e4c815fe3a..ce234f47ab 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -50,7 +50,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -109,7 +109,7 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) bli_gemm_check( alpha, a, b, beta, c, cntx ); // Invoke the operation's front-end and request the default control tree. - bli_gemm_front( alpha, a, b, beta, c, cntx, rntm, NULL ); + bli_gemm_front( alpha, a, b, beta, c, cntx, rntm ); } #endif @@ -123,7 +123,7 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -172,7 +172,7 @@ void PASTEMAC(her2k,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -218,7 +218,7 @@ void PASTEMAC(syr2k,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -251,7 +251,7 @@ void PASTEMAC(hemm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -301,7 +301,7 @@ void PASTEMAC(symm,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -351,7 +351,7 @@ void PASTEMAC(trmm3,BLIS_OAPI_EX_SUF) const obj_t* beta, const 
obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -399,7 +399,7 @@ void PASTEMAC(herk,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -433,7 +433,7 @@ void PASTEMAC(syrk,BLIS_OAPI_EX_SUF) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -458,7 +458,7 @@ void PASTEMAC(trmm,BLIS_OAPI_EX_SUF) const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -505,7 +505,7 @@ void PASTEMAC(trsm,BLIS_OAPI_EX_SUF) const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); diff --git a/frame/3/bli_l3_oapi_ex.h b/frame/3/bli_l3_oapi_ex.h index 58091704b4..dd7624d929 100644 --- a/frame/3/bli_l3_oapi_ex.h +++ b/frame/3/bli_l3_oapi_ex.h @@ -49,7 +49,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( gemm ) @@ -70,7 +70,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( hemm ) @@ -88,7 +88,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( herk ) @@ -105,7 +105,7 @@ BLIS_EXPORT_BLIS void PASTEMAC(opname,BLIS_OAPI_EX_SUF) \ const obj_t* a, \ const obj_t* b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENPROT( trmm ) diff --git a/frame/3/bli_l3_oft.h b/frame/3/bli_l3_oft.h index 997ade58e0..67fa2c75de 100644 --- a/frame/3/bli_l3_oft.h +++ b/frame/3/bli_l3_oft.h @@ -54,7 +54,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const 
rntm_t* rntm \ ); GENTDEF( gemm ) @@ -77,7 +77,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( hemm ) @@ -97,7 +97,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( herk ) @@ -116,7 +116,7 @@ typedef void (*PASTECH(opname,_oft)) \ const obj_t* a, \ const obj_t* b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( trmm ) diff --git a/frame/3/bli_l3_oft_var.h b/frame/3/bli_l3_oft_var.h index ee529b115a..b295b5812a 100644 --- a/frame/3/bli_l3_oft_var.h +++ b/frame/3/bli_l3_oft_var.h @@ -49,8 +49,7 @@ typedef void (*PASTECH(opname,_var_oft)) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/3/bli_l3_packab.c b/frame/3/bli_l3_packab.c index 6f18169b28..65776d49fb 100644 --- a/frame/3/bli_l3_packab.c +++ b/frame/3/bli_l3_packab.c @@ -40,8 +40,7 @@ void bli_l3_packa const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -60,7 +59,6 @@ void bli_l3_packa &a_local, &a_pack, cntx, - rntm, cntl, thread ); @@ -74,7 +72,6 @@ void bli_l3_packa &BLIS_ONE, c, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); @@ -88,8 +85,7 @@ void bli_l3_packb const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -112,7 +108,6 @@ void bli_l3_packb &bt_local, &bt_pack, cntx, - rntm, cntl, thread ); @@ -129,7 +124,6 @@ void bli_l3_packb &BLIS_ONE, c, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/bli_l3_packab.h b/frame/3/bli_l3_packab.h index f03b7f62ce..e58a08e4b4 100644 --- a/frame/3/bli_l3_packab.h +++ 
b/frame/3/bli_l3_packab.h @@ -38,8 +38,7 @@ void bli_l3_packa const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); @@ -49,8 +48,7 @@ void bli_l3_packb const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); diff --git a/frame/3/bli_l3_sup.c b/frame/3/bli_l3_sup.c index eedbd9ec51..fa064d74f6 100644 --- a/frame/3/bli_l3_sup.c +++ b/frame/3/bli_l3_sup.c @@ -42,7 +42,7 @@ err_t bli_gemmsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. @@ -127,7 +127,7 @@ printf( "dims: %d %d %d (threshs: %d %d %d)\n", beta, c, cntx, - rntm + &rntm_l ); } @@ -140,7 +140,7 @@ err_t bli_gemmtsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // Return early if small matrix handling is disabled at configure-time. 
@@ -196,7 +196,7 @@ err_t bli_gemmtsup beta, c, cntx, - rntm + &rntm_l ); } diff --git a/frame/3/bli_l3_sup.h b/frame/3/bli_l3_sup.h index 33b3f8ca74..77ff02d912 100644 --- a/frame/3/bli_l3_sup.h +++ b/frame/3/bli_l3_sup.h @@ -40,7 +40,7 @@ err_t bli_gemmsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); err_t bli_gemmtsup @@ -51,6 +51,6 @@ err_t bli_gemmtsup const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); diff --git a/frame/3/bli_l3_sup_int.c b/frame/3/bli_l3_sup_int.c index 3ff13bdb59..6199cad007 100644 --- a/frame/3/bli_l3_sup_int.c +++ b/frame/3/bli_l3_sup_int.c @@ -145,7 +145,7 @@ err_t bli_gemmsup_int if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; m -> mc, mr; n -> nc, nr: var2() @@ -156,7 +156,7 @@ err_t bli_gemmsup_int else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() @@ -210,7 +210,7 @@ err_t bli_gemmsup_int if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans @@ -221,7 +221,7 @@ err_t bli_gemmsup_int else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans @@ -319,7 +319,7 @@ err_t bli_gemmtsup_int if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m primary\n" ); #endif // block-panel macrokernel; 
m -> mc, mr; n -> nc, nr: var2() @@ -332,7 +332,7 @@ err_t bli_gemmtsup_int else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n primary\n" ); #endif // panel-block macrokernel; m -> nc*,mr; n -> mc*,nr: var1() @@ -388,7 +388,7 @@ err_t bli_gemmtsup_int if ( use_bp ) { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var2m non-primary\n" ); #endif // panel-block macrokernel; m -> nc, nr; n -> mc, mr: var2() + trans @@ -401,7 +401,7 @@ err_t bli_gemmtsup_int else // use_pb { #ifdef TRACEVAR - if ( bli_thread_am_ochief( thread ) ) + if ( bli_thread_am_chief( thread ) ) printf( "bli_l3_sup_int(): var1n non-primary\n" ); #endif // block-panel macrokernel; m -> mc*,nr; n -> nc*,mr: var1() + trans diff --git a/frame/3/bli_l3_sup_oft.h b/frame/3/bli_l3_sup_oft.h index ba60035b78..c36197201e 100644 --- a/frame/3/bli_l3_sup_oft.h +++ b/frame/3/bli_l3_sup_oft.h @@ -53,7 +53,7 @@ typedef err_t (*PASTECH(opname,_oft)) \ const obj_t* beta, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); GENTDEF( gemmsup ) diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c new file mode 100644 index 0000000000..636776f425 --- /dev/null +++ b/frame/3/bli_l3_sup_packm.c @@ -0,0 +1,433 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_packm_sup_init_mem + ( + bool will_pack, + packbuf_t pack_buf_type, + num_t dt, + dim_t m, + dim_t k, + dim_t mr, + thrinfo_t* thread + ) +{ + /* Inspect whether we are going to be packing matrix A. */ + if ( will_pack == FALSE ) + { + } + else /* if ( will_pack == TRUE ) */ + { + mem_t* mem = bli_thread_mem( thread ); + pba_t* pba = bli_thread_pba( thread ); + + /* NOTE: This "rounding up" of the last upanel is actually optional + for the rrc/crc cases, but absolutely necessary for the other cases + since we NEED that last micropanel to have the same ldim (cs_p) as + the other micropanels. Why? So that millikernels can use the same + upanel ldim for all iterations of the ir loop. */ + const dim_t m_pack = ( m / mr + ( m % mr ? 
1 : 0 ) ) * mr; + const dim_t k_pack = k; + + /* Barrier to make sure all threads are caught up and ready to begin + the packm stage. */ + bli_thread_barrier( thread ); + + /* Compute the size of the memory block needed. */ + siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack; + + /* Check the mem_t entry provided by the caller. If it is unallocated, + then we need to acquire a block from the memory broker. */ + if ( bli_mem_is_unalloc( mem ) ) + { + if ( bli_thread_am_chief( thread ) ) + { + /* Acquire directly to the chief thread's mem_t that was + passed in. It needs to be that mem_t struct, and not a + local (temporary) mem_t, since there is no barrier until + after packing is finished, which could allow a race + condition whereby the chief thread exits the current + function before the other threads have a chance to copy + from it. (A barrier would fix that race condition, but + then again, I prefer to keep barriers to a minimum.) */ + bli_pba_acquire_m + ( + pba, + size_needed, + pack_buf_type, + mem + ); + } + + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); + + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ + if ( !bli_thread_am_chief( thread ) ) + { + *mem = *mem_p; + } + } + else /* if ( bli_mem_is_alloc( mem ) ) */ + { + /* If the mem_t entry provided by the caller does NOT contain a NULL + buffer, then a block has already been acquired from the memory + broker and cached by the caller. */ + + /* As a sanity check, we should make sure that the mem_t object isn't + associated with a block that is too small compared to the size of + the packed matrix buffer that is needed, according to the value + computed above. 
*/ + siz_t mem_size = bli_mem_size( mem ); + + if ( mem_size < size_needed ) + { + if ( bli_thread_am_chief( thread ) ) + { + /* The chief thread releases the existing block associated + with the mem_t, and then re-acquires a new block, saving + the associated mem_t to its passed-in mem_t. (See comment + above for why the acquisition needs to be directly to + the chief thread's passed-in mem_t and not a local + (temporary) mem_t.) */ + bli_pba_release + ( + pba, + mem + ); + bli_pba_acquire_m + ( + pba, + size_needed, + pack_buf_type, + mem + ); + } + + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); + + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ + if ( !bli_thread_am_chief( thread ) ) + { + *mem = *mem_p; + } + } + else + { + /* If the mem_t entry is already allocated and sufficiently large, + then we use it as-is. No action is needed. */ + } + } + } +} + +void bli_packm_sup_finalize_mem + ( + bool did_pack, + thrinfo_t* thread + ) +{ + /* Inspect whether we previously packed matrix A. */ + if ( did_pack == FALSE ) + { + /* If we didn't pack matrix A, there's nothing to be done. */ + } + else /* if ( did_pack == TRUE ) */ + { + mem_t* mem = bli_thread_mem( thread ); + pba_t* pba = bli_thread_pba( thread ); + + if ( thread != NULL ) + if ( bli_thread_am_chief( thread ) ) + { + /* Check the mem_t entry provided by the caller. Only proceed if it + is allocated, which it should be. 
*/ + if ( bli_mem_is_alloc( mem ) ) + { + bli_pba_release + ( + pba, + mem + ); + } + } + } +} + +void bli_packm_sup_init + ( + bool will_pack, + stor3_t stor_id, + pack_t* schema, + dim_t m, + dim_t k, + dim_t mr, + dim_t* m_max, + dim_t* k_max, + const void* x, inc_t rs_x, inc_t cs_x, + void** p, inc_t* rs_p, inc_t* cs_p, + dim_t* pd_p, inc_t* ps_p, + thrinfo_t* thread + ) +{ + /* Inspect whether we are going to be packing matrix A. */ + if ( will_pack == FALSE ) + { + *m_max = m; + *k_max = k; + + /* Set the parameters for use with no packing of A (ie: using the + source matrix A directly). */ + { + /* Use the strides of the source matrix as the final values. */ + *rs_p = rs_x; + *cs_p = cs_x; + + *pd_p = mr; + *ps_p = mr * rs_x; + + /* Set the schema to "not packed" to indicate that packing will be + skipped. */ + *schema = BLIS_NOT_PACKED; + } + + /* Since we won't be packing, simply update the buffer address provided + by the caller to point to source matrix. */ + *p = ( void* )x; + } + else /* if ( will_pack == TRUE ) */ + { + /* NOTE: This "rounding up" of the last upanel is actually optional + for the rrc/crc cases, but absolutely necessary for the other cases + since we NEED that last micropanel to have the same ldim (cs_p) as + the other micropanels. Why? So that millikernels can use the same + upanel ldim for all iterations of the ir loop. */ + *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; + *k_max = k; + + /* Determine the dimensions and strides for the packed matrix A. */ + if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) + { + /* stor3_t id values _RRC and _CRC: pack A to plain row storage. */ + *rs_p = k; + *cs_p = 1; + + *pd_p = mr; + *ps_p = mr * k; + + /* Set the schema to "row packed" to indicate packing to plain + row storage. */ + *schema = BLIS_PACKED_ROWS; + } + else + { + /* All other stor3_t ids: pack A to column-stored row-panels. 
*/ + *rs_p = 1; + *cs_p = mr; + + *pd_p = mr; + *ps_p = mr * k; + + /* Set the schema to "packed row panels" to indicate packing to + conventional column-stored row panels. */ + *schema = BLIS_PACKED_ROW_PANELS; + } + + /* Set the buffer address provided by the caller to point to the + memory associated with the mem_t entry acquired from the memory + broker. */ + *p = bli_mem_buffer( bli_thread_mem( thread ) ); + } +} + +typedef void (*packm_sup_var1_fp) + ( + trans_t transc, + pack_t schema, + dim_t m, + dim_t n, + dim_t m_max, + dim_t n_max, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p, + cntx_t* cntx, + thrinfo_t* thread + ); + +typedef void (*packm_sup_var2_fp) + ( + trans_t transc, + pack_t schema, + dim_t m, + dim_t n, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + cntx_t* cntx, + thrinfo_t* thread + ); + +static packm_sup_var1_fp GENARRAY(packm_sup_var1,packm_sup_var1); +static packm_sup_var2_fp GENARRAY(packm_sup_var2,packm_sup_var2); + +// +// Define BLAS-like interfaces to the variant chooser. +// + +void bli_packm_sup + ( + bool will_pack, + packbuf_t pack_buf_type, + stor3_t stor_id, + trans_t transc, + num_t dt, + dim_t m_alloc, + dim_t k_alloc, + dim_t m, + dim_t k, + dim_t mr, + const void* kappa, + const void* a, inc_t rs_a, inc_t cs_a, + void** p, inc_t* rs_p, inc_t* cs_p, + inc_t* ps_p, + const cntx_t* cntx, + thrinfo_t* thread + ) +{ + pack_t schema; + dim_t m_max; + dim_t k_max; + dim_t pd_p; + + /* Prepare the packing destination buffer. If packing is not requested, + this function will reduce to a no-op. */ + bli_packm_sup_init_mem + ( + will_pack, + pack_buf_type, + dt, m_alloc, k_alloc, mr, + thread + ); + + /* Determine the packing buffer and related parameters for matrix A. If A + will not be packed, then a_use will be set to point to a and the _a_use + strides will be set accordingly. 
*/ + bli_packm_sup_init + ( + will_pack, + stor_id, + &schema, + m, k, mr, + &m_max, &k_max, + a, rs_a, cs_a, + p, rs_p, cs_p, + &pd_p, ps_p, + thread + ); + + /* Inspect whether we are going to be packing matrix A. */ + if ( will_pack == FALSE ) + { + /* If we aren't going to pack matrix A, then there's nothing to do. */ + + /* + printf( "blis_ packm_sup_a: not packing A.\n" ); + */ + } + else /* if ( will_pack == TRUE ) */ + { + if ( schema == BLIS_PACKED_ROWS ) + { + /* + printf( "blis_ packm_sup_a: packing A to rows.\n" ); + */ + + /* For plain packing by rows, use var2. */ + packm_sup_var2[ dt ] + ( + transc, + schema, + m, + k, + ( void* )kappa, + ( void* )a, rs_a, cs_a, + *p, *rs_p, *cs_p, + ( cntx_t* )cntx, + thread + ); + } + else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ + { + /* + printf( "blis_ packm_sup_a: packing A to row panels.\n" ); + */ + + /* For packing to column-stored row panels, use var1. */ + packm_sup_var1[ dt ] + ( + transc, + schema, + m, + k, + m_max, + k_max, + ( void* )kappa, + ( void* )a, rs_a, cs_a, + *p, *rs_p, *cs_p, + pd_p, *ps_p, + ( cntx_t* )cntx, + thread + ); + } + + /* Barrier so that packing is done before computation. */ + bli_thread_barrier( thread ); + } +} + diff --git a/frame/1m/packm/bli_packm_thrinfo.h b/frame/3/bli_l3_sup_packm.h similarity index 56% rename from frame/1m/packm/bli_packm_thrinfo.h rename to frame/3/bli_l3_sup_packm.h index 85b61931c1..d2a270736a 100644 --- a/frame/1m/packm/bli_packm_thrinfo.h +++ b/frame/3/bli_l3_sup_packm.h @@ -5,7 +5,7 @@ libraries. Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2018, Advanced Micro Devices, Inc. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,73 +33,57 @@ */ -// -// thrinfo_t macros specific to packm. 
-// -/* -#define bli_packm_thread_my_iter( index, thread ) \ -\ - ( index % thread->n_way == thread->work_id % thread->n_way ) -*/ - -#define bli_packm_my_iter_rr( i, start, end, work_id, n_way ) \ -\ - ( i % n_way == work_id % n_way ) - -#define bli_packm_my_iter_sl( i, start, end, work_id, n_way ) \ -\ - ( start <= i && i < end ) - -// Define a general-purpose version of bli_packm_my_iter() whose definition -// depends on whether slab or round-robin partitioning was requested at -// configure-time. -#ifdef BLIS_ENABLE_JRIR_SLAB - - #define bli_packm_my_iter bli_packm_my_iter_sl - -#else // BLIS_ENABLE_JRIR_RR - - #define bli_packm_my_iter bli_packm_my_iter_rr - -#endif - - -// -// thrinfo_t APIs specific to packm. -// - -#if 0 -thrinfo_t* bli_packm_thrinfo_create +void bli_packm_sup_init_mem ( - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node + bool will_pack, + packbuf_t pack_buf_type, + num_t dt, + dim_t m, + dim_t k, + dim_t mr, + thrinfo_t* thread ); -#endif -void bli_packm_thrinfo_init +void bli_packm_sup_finalize_mem ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bszid_t bszid, - thrinfo_t* sub_node + bool did_pack, + thrinfo_t* thread ); -void bli_packm_thrinfo_init_single +void bli_packm_sup_init ( - thrinfo_t* thread + bool will_pack, + stor3_t stor_id, + pack_t* schema, + dim_t m, + dim_t k, + dim_t mr, + dim_t* m_max, + dim_t* k_max, + const void* x, inc_t rs_x, inc_t cs_x, + void** p, inc_t* rs_p, inc_t* cs_p, + dim_t* pd_p, inc_t* ps_p, + thrinfo_t* thread ); -#if 0 -void bli_packm_thrinfo_free +void bli_packm_sup ( - thrinfo_t* thread + bool will_pack, + packbuf_t pack_buf_type, + stor3_t stor_id, + trans_t transc, + num_t dt, + dim_t m_alloc, + dim_t k_alloc, + dim_t m, + dim_t k, + dim_t mr, + const void* kappa, + const void* a, inc_t rs_a, inc_t cs_a, + void** p, inc_t* rs_p, inc_t* cs_p, + inc_t* ps_p, + const cntx_t* cntx, + thrinfo_t* thread ); -#endif 
diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c deleted file mode 100644 index 26faefc463..0000000000 --- a/frame/3/bli_l3_sup_packm_a.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix A. */ \ - if ( will_pack == FALSE ) \ - { \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ - const dim_t k_pack = k; \ -\ - /* Barrier to make sure all threads are caught up and ready to begin - the packm stage. */ \ - bli_thread_barrier( thread ); \ -\ - /* Compute the size of the memory block eneded. */ \ - siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ -\ - /* Check the mem_t entry provided by the caller. If it is unallocated, - then we need to acquire a block from the memory broker. */ \ - if ( bli_mem_is_unalloc( mem ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Acquire directly to the chief thread's mem_t that was - passed in. It needs to be that mem_t struct, and not a - local (temporary) mem_t, since there is no barrier until - after packing is finished, which could allow a race - condition whereby the chief thread exits the current - function before the other threads have a chance to copy - from it. (A barrier would fix that race condition, but - then again, I prefer to keep barriers to a minimum.) 
*/ \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else /* if ( bli_mem_is_alloc( mem ) ) */ \ - { \ - /* If the mem_t entry provided by the caller does NOT contain a NULL - buffer, then a block has already been acquired from the memory - broker and cached by the caller. */ \ -\ - /* As a sanity check, we should make sure that the mem_t object isn't - associated with a block that is too small compared to the size of - the packed matrix buffer that is needed, according to the value - computed above. */ \ - siz_t mem_size = bli_mem_size( mem ); \ -\ - if ( mem_size < size_needed ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* The chief thread releases the existing block associated - with the mem_t, and then re-acquires a new block, saving - the associated mem_t to its passed-in mem_t. (See coment - above for why the acquisition needs to be directly to - the chief thread's passed-in mem_t and not a local - (temporary) mem_t. */ \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) 
*/ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else \ - { \ - /* If the mem_t entry is already allocated and sufficiently large, - then we use it as-is. No action is needed. */ \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we previously packed matrix A. */ \ - if ( did_pack == FALSE ) \ - { \ - /* If we didn't pack matrix A, there's nothing to be done. */ \ - } \ - else /* if ( did_pack == TRUE ) */ \ - { \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. */ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* m_max, \ - dim_t* k_max, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix A. */ \ - if ( will_pack == FALSE ) \ - { \ - *m_max = m; \ - *k_max = k; \ -\ - /* Set the parameters for use with no packing of A (ie: using the - source matrix A directly). */ \ - { \ - /* Use the strides of the source matrix as the final values. */ \ - *rs_p = rs_x; \ - *cs_p = cs_x; \ -\ - *pd_p = mr; \ - *ps_p = mr * rs_x; \ -\ - /* Set the schema to "not packed" to indicate that packing will be - skipped. 
*/ \ - *schema = BLIS_NOT_PACKED; \ - } \ -\ - /* Since we won't be packing, simply update the buffer address provided - by the caller to point to source matrix. */ \ - *p = x; \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This is "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ - *k_max = k; \ -\ - /* Determine the dimensions and strides for the packed matrix A. */ \ - if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) \ - { \ - /* stor3_t id values _RRC and _CRC: pack A to plain row storage. */ \ - *rs_p = k; \ - *cs_p = 1; \ -\ - *pd_p = mr; \ - *ps_p = mr * k; \ -\ - /* Set the schema to "row packed" to indicate packing to plain - row storage. */ \ - *schema = BLIS_PACKED_ROWS; \ - } \ - else \ - { \ - /* All other stor3_t ids: pack A to column-stored row-panels. */ \ - *rs_p = 1; \ - *cs_p = mr; \ -\ - *pd_p = mr; \ - *ps_p = mr * k; \ -\ - /* Set the schema to "packed row panels" to indicate packing to - conventional column-stored row panels. */ \ - *schema = BLIS_PACKED_ROW_PANELS; \ - } \ -\ - /* Set the buffer address provided by the caller to point to the - memory associated with the mem_t entry acquired from the memory - broker. */ \ - *p = bli_mem_buffer( mem ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_a ) - - -// -// Define BLAS-like interfaces to the variant chooser. 
-// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* kappa, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - pack_t schema; \ - dim_t m_max; \ - dim_t k_max; \ - dim_t pd_p; \ -\ - /* Prepare the packing destination buffer. If packing is not requested, - this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_a) \ - ( \ - will_pack, \ - pack_buf_type, \ - m_alloc, k_alloc, mr, \ - cntx, \ - rntm, \ - mem, \ - thread \ - ); \ -\ - /* Determine the packing buffer and related parameters for matrix A. If A - will not be packed, then a_use will be set to point to a and the _a_use - strides will be set accordingly. */ \ - PASTEMAC(ch,packm_sup_init_a) \ - ( \ - will_pack, \ - stor_id, \ - &schema, \ - m, k, mr, \ - &m_max, &k_max, \ - a, rs_a, cs_a, \ - p, rs_p, cs_p, \ - &pd_p, ps_p, \ - cntx, \ - mem, \ - thread \ - ); \ -\ - /* Inspect whether we are going to be packing matrix A. */ \ - if ( will_pack == FALSE ) \ - { \ - /* If we aren't going to pack matrix A, then there's nothing to do. */ \ -\ - /* - printf( "blis_ packm_sup_a: not packing A.\n" ); \ - */ \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - if ( schema == BLIS_PACKED_ROWS ) \ - { \ - /* - printf( "blis_ packm_sup_a: packing A to rows.\n" ); \ - */ \ -\ - /* For plain packing by rows, use var2. 
*/ \ - PASTEMAC(ch,packm_sup_var2) \ - ( \ - transc, \ - schema, \ - m, \ - k, \ - kappa, \ - a, rs_a, cs_a, \ - *p, *rs_p, *cs_p, \ - cntx, \ - thread \ - ); \ - } \ - else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ \ - { \ - /* - printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \ - */ \ -\ - /* For packing to column-stored row panels, use var1. */ \ - PASTEMAC(ch,packm_sup_var1) \ - ( \ - transc, \ - schema, \ - m, \ - k, \ - m_max, \ - k_max, \ - kappa, \ - a, rs_a, cs_a, \ - *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ - cntx, \ - thread \ - ); \ - } \ -\ - /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_a ) - diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h deleted file mode 100644 index 2bddeb07b4..0000000000 --- a/frame/3/bli_l3_sup_packm_a.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* m_max, \ - dim_t* k_max, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ 
- dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* kappa, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_a ) - diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c deleted file mode 100644 index 6165567759..0000000000 --- a/frame/3/bli_l3_sup_packm_b.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - const dim_t k_pack = k; \ - const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ -\ - /* Barrier to make sure all threads are caught up and ready to begin - the packm stage. */ \ - bli_thread_barrier( thread ); \ -\ - /* Compute the size of the memory block eneded. */ \ - siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ -\ - /* Check the mem_t entry provided by the caller. If it is unallocated, - then we need to acquire a block from the memory broker. */ \ - if ( bli_mem_is_unalloc( mem ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Acquire directly to the chief thread's mem_t that was - passed in. 
It needs to be that mem_t struct, and not a - local (temporary) mem_t, since there is no barrier until - after packing is finished, which could allow a race - condition whereby the chief thread exits the current - function before the other threads have a chance to copy - from it. (A barrier would fix that race condition, but - then again, I prefer to keep barriers to a minimum.) */ \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else /* if ( bli_mem_is_alloc( mem ) ) */ \ - { \ - /* If the mem_t entry provided by the caller does NOT contain a NULL - buffer, then a block has already been acquired from the memory - broker and cached by the caller. */ \ -\ - /* As a sanity check, we should make sure that the mem_t object isn't - associated with a block that is too small compared to the size of - the packed matrix buffer that is needed, according to the value - computed above. */ \ - siz_t mem_size = bli_mem_size( mem ); \ -\ - if ( mem_size < size_needed ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* The chief thread releases the existing block associated - with the mem_t, and then re-acquires a new block, saving - the associated mem_t to its passed-in mem_t. (See coment - above for why the acquisition needs to be directly to - the chief thread's passed-in mem_t and not a local - (temporary) mem_t. 
*/ \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else \ - { \ - /* If the mem_t entry is already allocated and sufficiently large, - then we use it as-is. No action is needed. */ \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we previously packed matrix A. */ \ - if ( did_pack == FALSE ) \ - { \ - /* If we didn't pack matrix A, there's nothing to be done. */ \ - } \ - else /* if ( did_pack == TRUE ) */ \ - { \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. 
*/ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* k_max, \ - dim_t* n_max, \ - ctype* x, inc_t rs_x, inc_t cs_x, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - *k_max = k; \ - *n_max = n; \ -\ - /* Set the parameters for use with no packing of B (ie: using the - source matrix B directly). */ \ - { \ - /* Use the strides of the source matrix as the final values. */ \ - *rs_p = rs_x; \ - *cs_p = cs_x; \ -\ - *pd_p = nr; \ - *ps_p = nr * cs_x; \ -\ - /* Set the schema to "not packed" to indicate that packing will be - skipped. */ \ - *schema = BLIS_NOT_PACKED; \ - } \ -\ - /* Since we won't be packing, simply update the buffer address provided - by the caller to point to source matrix. */ \ - *p = x; \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This is "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - *k_max = k; \ - *n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ -\ - /* Determine the dimensions and strides for the packed matrix B. */ \ - if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) \ - { \ - /* stor3_t id values _RRC and _CRC: pack B to plain row storage. 
*/ \ - *rs_p = 1; \ - *cs_p = k; \ -\ - *pd_p = nr; \ - *ps_p = k * nr; \ -\ - /* Set the schema to "column packed" to indicate packing to plain - column storage. */ \ - *schema = BLIS_PACKED_COLUMNS; \ - } \ - else \ - { \ - /* All other stor3_t ids: pack B to row-stored column-panels. */ \ - *rs_p = nr; \ - *cs_p = 1; \ -\ - *pd_p = nr; \ - *ps_p = k * nr; \ -\ - /* Set the schema to "packed column panels" to indicate packing to - conventional row-stored column panels. */ \ - *schema = BLIS_PACKED_COL_PANELS; \ - } \ -\ - /* Set the buffer address provided by the caller to point to the - memory associated with the mem_t entry acquired from the memory - broker. */ \ - *p = bli_mem_buffer( mem ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_b ) - - -// -// Define BLAS-like interfaces to the variant chooser. -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* kappa, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - pack_t schema; \ - dim_t k_max; \ - dim_t n_max; \ - dim_t pd_p; \ -\ - /* Prepare the packing destination buffer. If packing is not requested, - this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_b) \ - ( \ - will_pack, \ - pack_buf_type, \ - k_alloc, n_alloc, nr, \ - cntx, \ - rntm, \ - mem, \ - thread \ - ); \ -\ - /* Determine the packing buffer and related parameters for matrix B. If B - will not be packed, then b_use will be set to point to b and the _b_use - strides will be set accordingly. 
*/ \ - PASTEMAC(ch,packm_sup_init_b) \ - ( \ - will_pack, \ - stor_id, \ - &schema, \ - k, n, nr, \ - &k_max, &n_max, \ - b, rs_b, cs_b, \ - p, rs_p, cs_p, \ - &pd_p, ps_p, \ - cntx, \ - mem, \ - thread \ - ); \ -\ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - /* If we aren't going to pack matrix B, then there's nothing to do. */ \ -\ - /* - printf( "blis_ packm_sup_b: not packing B.\n" ); \ - */ \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - if ( schema == BLIS_PACKED_COLUMNS ) \ - { \ - /* - printf( "blis_ packm_sup_b: packing B to columns.\n" ); \ - */ \ -\ - /* For plain packing by columns, use var2. */ \ - PASTEMAC(ch,packm_sup_var2) \ - ( \ - transc, \ - schema, \ - k, \ - n, \ - kappa, \ - b, rs_b, cs_b, \ - *p, *rs_p, *cs_p, \ - cntx, \ - thread \ - ); \ - } \ - else /* if ( schema == BLIS_PACKED_COL_PANELS ) */ \ - { \ - /* - printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \ - */ \ -\ - /* For packing to row-stored column panels, use var1. */ \ - PASTEMAC(ch,packm_sup_var1) \ - ( \ - transc, \ - schema, \ - k, \ - n, \ - k_max, \ - n_max, \ - kappa, \ - b, rs_b, cs_b, \ - *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ - cntx, \ - thread \ - ); \ - } \ -\ - /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_b ) - diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h deleted file mode 100644 index da20ea71d3..0000000000 --- a/frame/3/bli_l3_sup_packm_b.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* k_max, \ - dim_t* n_max, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* kappa, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_b ) - diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index ece1feed2d..9f2357fd6f 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -50,9 +50,9 @@ void PASTEMAC(ch,varname) \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, 
\ cntx_t* cntx, \ thrinfo_t* thread \ @@ -321,9 +321,9 @@ void PASTEMAC(ch,varname) \ pack_t schema, \ dim_t m, \ dim_t n, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ cntx_t* cntx, \ thrinfo_t* thread \ ) \ diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h index 257974e466..9441e441ce 100644 --- a/frame/3/bli_l3_sup_packm_var.h +++ b/frame/3/bli_l3_sup_packm_var.h @@ -48,9 +48,9 @@ void PASTEMAC(ch,varname) \ dim_t n, \ dim_t m_max, \ dim_t n_max, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ dim_t pd_p, inc_t ps_p, \ cntx_t* cntx, \ thrinfo_t* thread \ @@ -67,9 +67,9 @@ void PASTEMAC(ch,varname) \ pack_t schema, \ dim_t m, \ dim_t n, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ cntx_t* cntx, \ thrinfo_t* thread \ ); diff --git a/frame/3/bli_l3_sup_var1n2m.c b/frame/3/bli_l3_sup_var1n2m.c index a5d66783fc..e9d44ff0b1 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -34,34 +34,10 @@ #include "blis.h" -#define FUNCPTR_T gemmsup_fp - -typedef void (*FUNCPTR_T) - ( - bool packa, - bool packb, - conj_t conja, - conj_t conjb, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t rs_a, inc_t cs_a, - void* b, inc_t rs_b, inc_t cs_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - stor3_t eff_id, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - // // -- var1n -------------------------------------------------------------------- // -static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n); - void bli_gemmsup_ref_var1n ( trans_t trans, @@ -70,67 +46,31 @@ void 
bli_gemmsup_ref_var1n const obj_t* b, const obj_t* beta, const obj_t* c, - stor3_t eff_id, + stor3_t stor_id, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { -#if 0 - obj_t at, bt; - - bli_obj_alias_to( a, &at ); - bli_obj_alias_to( b, &bt ); - - // Induce transpositions on A and/or B if either object is marked for - // transposition. We can induce "fast" transpositions since they objects - // are guaranteed to not have structure or be packed. - if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } - if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } - - const num_t dt = bli_obj_dt( c ); - - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + const num_t dt = bli_obj_dt( c ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); + const dim_t dt_size = bli_dt_size( dt ); - const dim_t k = bli_obj_width( &at ); + bool packa = bli_rntm_pack_a( rntm ); + bool packb = bli_rntm_pack_b( rntm ); - void* buf_a = bli_obj_buffer_at_off( &at ); - const inc_t rs_a = bli_obj_row_stride( &at ); - const inc_t cs_a = bli_obj_col_stride( &at ); + conj_t conja = bli_obj_conj_status( a ); + conj_t conjb = bli_obj_conj_status( b ); - void* buf_b = bli_obj_buffer_at_off( &bt ); - const inc_t rs_b = bli_obj_row_stride( &bt ); - const inc_t cs_b = bli_obj_col_stride( &bt ); - - void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); - -#else - const num_t dt = bli_obj_dt( c ); - - const bool packa = bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); - - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); - - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); + dim_t m = 
bli_obj_length( c ); + dim_t n = bli_obj_width( c ); dim_t k; - const void* buf_a = bli_obj_buffer_at_off( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - const void* buf_b = bli_obj_buffer_at_off( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -163,120 +103,58 @@ void bli_gemmsup_ref_var1n } void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); -#endif + // These checks should happen WAY earlier +#if 0 - // Index into the type combination array to extract the correct - // function pointer. - FUNCPTR_T f = ftypes_var1n[dt]; + /* If m or n is zero, return immediately. */ + if ( bli_zero_dim2( m, n ) ) return; + + /* If k < 1 or alpha is zero, scale by beta and return. */ + if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) + { + if ( bli_thread_am_chief( thread ) ) + { + PASTEMAC(ch,scalm) + ( + BLIS_NO_CONJUGATE, + 0, + BLIS_NONUNIT_DIAG, + BLIS_DENSE, + m, n, + beta, + c, rs_c, cs_c + ); + } + return; + } + +#endif #if 1 // Optimize some storage/packing cases by transforming them into others. - // These optimizations are expressed by changing trans and/or eff_id. - bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); + // These optimizations are expressed by changing trans and/or stor_id. + bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &stor_id, cntx ); #endif - if ( bli_is_notrans( trans ) ) - { - // Invoke the function. 
- f - ( - packa, - packb, - conja, - conjb, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, rs_a, cs_a, - ( void* )buf_b, rs_b, cs_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - eff_id, - ( cntx_t* )cntx, - rntm, - thread - ); - } - else - { - // Invoke the function (transposing the operation). - f - ( - packb, - packa, - conjb, // swap the conj values. - conja, - n, // swap the m and n dimensions. - m, - k, - ( void* )buf_alpha, - ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. - ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. - ( void* )buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. - bli_stor3_trans( eff_id ), // transpose the stor3_t id. - ( cntx_t* )cntx, - rntm, - thread - ); - } -} + if ( bli_is_trans( trans ) ) + { + bool packtmp = packa; packa = packb; packb = packtmp; + conj_t conjtmp = conja; conja = conjb; conjb = conjtmp; + dim_t len_tmp = m; m = n; n = len_tmp; + const void* buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp; + inc_t str_tmp = rs_a; rs_a = cs_b; cs_b = str_tmp; + str_tmp = cs_a; cs_a = rs_b; rs_b = str_tmp; + str_tmp = rs_c; rs_c = cs_c; cs_c = str_tmp; + stor_id = bli_stor3_trans( stor_id ); + } -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* If m or n is zero, return immediately. */ \ - if ( bli_zero_dim2( m, n ) ) return; \ -\ - /* If k < 1 or alpha is zero, scale by beta and return. 
*/ \ - if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - PASTEMAC(ch,scalm) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m, n, \ - beta, \ - c, rs_c, cs_c \ - ); \ - } \ - return; \ - } \ -\ /* This transposition of the stor3_t id value is inherent to variant 1. The reason: we assume that variant 2 is the "main" variant. The consequence of this is that we assume that the millikernels that @@ -285,431 +163,324 @@ void PASTEMAC(ch,varname) \ n are assumed to be registered to the "non-primary" group associated with the ("non-primary") anti-preference. Note that this pattern holds regardless of whether the mkernel set has a row or column preference.) - See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ \ - stor_id = bli_stor3_trans( stor_id ); \ -\ - /* Query the context for various blocksizes. */ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ -\ - dim_t KC; \ - if ( packa && packb ) \ - { \ - KC = KC0; \ - } \ - else if ( packb ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else if ( packa ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else /* if ( !packa && 
!packb ) */ \ - { \ - if ( FALSE ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( m <= MR && n <= NR ) KC = KC0; \ - else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ - else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ - else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ - else KC = (( KC0 / 5 ) / 4 ) * 4; \ - } \ -\ + See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ + stor_id = bli_stor3_trans( stor_id ); + + /* Query the context for various blocksizes. */ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); + const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); + + dim_t KC; + if ( packa && packb ) + { + KC = KC0; + } + else if ( packb ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else if ( packa ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else /* if ( !packa && !packb ) */ + { + if ( FALSE ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( m <= MR && n <= NR ) KC = KC0; + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; + else KC = (( KC0 / 5 ) / 4 ) * 4; + } + /* Nudge NC up to a multiple of MR and MC up to a multiple of NR. 
NOTE: This is unique to variant 1 (ie: not performed in variant 2) - because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \ - const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ - const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ -\ + because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ + const dim_t NC = bli_align_dim_to_mult( NC0, MR ); + const dim_t MC = bli_align_dim_to_mult( MC0, NR ); + /* Query the maximum blocksize for MR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const dim_t MRE = MRM - MR; \ -\ - /* Compute partitioning step values for each matrix of each loop. */ \ - const inc_t jcstep_c = rs_c; \ - const inc_t jcstep_a = rs_a; \ -\ - const inc_t pcstep_a = cs_a; \ - const inc_t pcstep_b = rs_b; \ -\ - const inc_t icstep_c = cs_c; \ - const inc_t icstep_b = cs_b; \ -\ - const inc_t jrstep_c = rs_c * MR; \ -\ + extension for the final iteration. */ + const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); + const dim_t MRE = MRM - MR; + + /* Compute partitioning step values for each matrix of each loop. */ + const inc_t jcstep_c = rs_c * dt_size; + const inc_t jcstep_a = rs_a * dt_size; + + const inc_t pcstep_a = cs_a * dt_size; + const inc_t pcstep_b = rs_b * dt_size; + + const inc_t icstep_c = cs_c * dt_size; + const inc_t icstep_b = cs_b * dt_size; + + const inc_t jrstep_c = rs_c * MR * dt_size; + /* - const inc_t jrstep_a = rs_a * MR; \ -\ - const inc_t irstep_c = cs_c * NR; \ - const inc_t irstep_b = cs_b * NR; \ - */ \ -\ + const inc_t jrstep_a = rs_a * MR; + + const inc_t irstep_c = cs_c * NR; + const inc_t irstep_b = cs_b * NR; + */ + /* Query the context for the sup microkernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ -\ - ctype* a_00 = a; \ - ctype* b_00 = b; \ - ctype* c_00 = c; \ - ctype* alpha_cast = alpha; \ - ctype* beta_cast = beta; \ -\ - /* Make local copies of beta and one scalars to prevent any unnecessary - sharing of cache lines between the cores' caches. */ \ - ctype beta_local = *beta_cast; \ - ctype one_local = *PASTEMAC(ch,1); \ -\ - auxinfo_t aux; \ -\ - /* Parse and interpret the contents of the rntm_t object to properly - set the ways of parallelism for each loop. */ \ - /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ -\ - /* Initialize a mem_t entry for A and B. Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. An alternative way of initializing the - mem_t entries is: - - bli_mem_clear( &mem_a ); \ - bli_mem_clear( &mem_b ); \ - */ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ -\ - /* Define an array of bszid_t ids, which will act as our substitute for - the cntl_t tree. - NOTE: These bszid_t values, and their order, match that of the bp - algorithm (variant 2) because they are not used to query actual - blocksizes but rather query the ways of parallelism for the various - loops. For example, the 2nd loop in variant 1 partitions in the m - dimension (in increments of MR), but parallelizes that m dimension - with BLIS_JR_NT. The only difference is that the _packa and _packb - arrays have been adjusted for the semantic difference in order in - which packa and packb nodes are encountered in the thrinfo tree. - That is, this panel-block algorithm partitions an NC x KC submatrix - of A to be packed in the 4th loop, and a KC x MC submatrix of B - to be packed in the 3rd loop. 
*/ \ - /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ - bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* bszids; \ -\ - /* Set the bszids pointer to the correct bszids array above based on which - matrices (if any) are being packed. */ \ - if ( packa ) { if ( packb ) bszids = bszids_packab; \ - else bszids = bszids_packa; } \ - else { if ( packb ) bszids = bszids_packb; \ - else bszids = bszids_nopack; } \ -\ - /* Determine whether we are using more than one thread. */ \ - const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ -\ - thrinfo_t* thread_jc = NULL; \ - thrinfo_t* thread_pc = NULL; \ - thrinfo_t* thread_pa = NULL; \ - thrinfo_t* thread_ic = NULL; \ - thrinfo_t* thread_pb = NULL; \ - thrinfo_t* thread_jr = NULL; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jc = bszids; \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ -\ - /* Compute the JC loop thread range for the current thread. */ \ - dim_t jc_start, jc_end; \ - bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \ - const dim_t m_local = jc_end - jc_start; \ -\ - /* Compute number of primary and leftover components of the JC loop. */ \ - /*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = m_local % NC; \ -\ - /* Loop over the m dimension (NC rows/columns at a time). */ \ - /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ - for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ - { \ - /* Calculate the thread's current JC block dimension. */ \ - const dim_t nc_cur = ( NC <= jc_end - jj ? 
NC : jc_left ); \ -\ - ctype* a_jc = a_00 + jj * jcstep_a; \ - ctype* c_jc = c_00 + jj * jcstep_c; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ -\ - /* Compute the PC loop thread range for the current thread. */ \ - const dim_t pc_start = 0, pc_end = k; \ - const dim_t k_local = k; \ -\ - /* Compute number of primary and leftover components of the PC loop. */ \ - /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ - const dim_t pc_left = k_local % KC; \ -\ - /* Loop over the k dimension (KC rows/columns at a time). */ \ - /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ - for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ - { \ - /* Calculate the thread's current PC block dimension. */ \ - const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ -\ - ctype* a_pc = a_jc + pp * pcstep_a; \ - ctype* b_pc = b_00 + pp * pcstep_b; \ -\ - /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ -\ - ctype* a_use; \ - inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing A. If we won't be packing A, we alias to - the _pc variables so that code further down can unconditionally - reference the _pa variables. Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pa; \ - if ( packa ) { bszids_pa = &bszids_pc[1]; \ - thread_pa = bli_thrinfo_sub_node( thread_pc ); } \ - else { bszids_pa = &bszids_pc[0]; \ - thread_pa = thread_pc; } \ -\ + function pointer type. 
*/ + gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); + + const char* a_00 = buf_a; + const char* b_00 = buf_b; + char* c_00 = buf_c; + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + + auxinfo_t aux; + + /* Determine whether we are using more than one thread. */ + const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); + + thrinfo_t* thread_jc = thread; + thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc ); + thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pa ); + thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pb ); + + /* Compute the JC loop thread range for the current thread. */ + dim_t jc_start, jc_end; + bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); + const dim_t m_local = jc_end - jc_start; + + /* Compute number of primary and leftover components of the JC loop. */ + /*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ + const dim_t jc_left = m_local % NC; + + /* Loop over the m dimension (NC rows/columns at a time). */ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) + { + /* Calculate the thread's current JC block dimension. */ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); + + const char* a_jc = a_00 + jj * jcstep_a; + char* c_jc = c_00 + jj * jcstep_c; + + /* Compute the PC loop thread range for the current thread. */ + const dim_t pc_start = 0, pc_end = k; + const dim_t k_local = k; + + /* Compute number of primary and leftover components of the PC loop. */ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ + const dim_t pc_left = k_local % KC; + + /* Loop over the k dimension (KC rows/columns at a time). */ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) + { + /* Calculate the thread's current PC block dimension. 
*/ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); + + const char* a_pc = a_jc + pp * pcstep_a; + const char* b_pc = b_00 + pp * pcstep_b; + + /* Only apply beta to the first iteration of the pc loop. */ + const void* beta_use = ( pp == 0 ? buf_beta : one ); + + char* a_use; + inc_t rs_a_use, cs_a_use, ps_a_use; + /* Determine the packing buffer and related parameters for matrix A. (If A will not be packed, then a_use will be set to point to a and the _a_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate implementation based on the schema deduced from the stor_id. NOTE: packing matrix A in this panel-block algorithm corresponds - to packing matrix B in the block-panel algorithm. */ \ - PASTEMAC(ch,packm_sup_a) \ - ( \ - packa, \ - BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \ - stor_id, /* a "panel of B". */ \ - BLIS_NO_TRANSPOSE, \ - NC, KC, /* This "panel of B" is (at most) NC x KC. */ \ - nc_cur, kc_cur, MR, \ - &one_local, \ - a_pc, rs_a, cs_a, \ - &a_use, &rs_a_use, &cs_a_use, \ - &ps_a_use, \ - cntx, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ -\ + to packing matrix B in the block-panel algorithm. */ + bli_packm_sup + ( + packa, + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ + stor_id, /* a "panel of B". */ + BLIS_NO_TRANSPOSE, + dt, + NC, KC, /* This "panel of B" is (at most) NC x KC. */ + nc_cur, kc_cur, MR, + one, + a_pc, rs_a, cs_a, + ( void** )&a_use, &rs_a_use, &cs_a_use, + &ps_a_use, + cntx, + thread_pa + ); + /* Alias a_use so that it's clear this is our current block of - matrix A. */ \ - ctype* a_pc_use = a_use; \ -\ + matrix A. */ + const char* a_pc_use = a_use; + /* We don't need to embed the panel stride of A within the auxinfo_t object because this variant iterates through A in the jr loop, which occurs here, within the macrokernel, not within the - millikernel. 
*/ \ - /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_ic = &bszids_pa[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ -\ - /* Compute the IC loop thread range for the current thread. */ \ - dim_t ic_start, ic_end; \ - bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \ - const dim_t n_local = ic_end - ic_start; \ -\ - /* Compute number of primary and leftover components of the IC loop. */ \ - /*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \ - const dim_t ic_left = n_local % MC; \ -\ - /* Loop over the n dimension (MC rows at a time). */ \ - /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ - for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ - { \ - /* Calculate the thread's current IC block dimension. */ \ - const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ -\ - ctype* b_ic = b_pc + ii * icstep_b; \ - ctype* c_ic = c_jc + ii * icstep_c; \ -\ - ctype* b_use; \ - inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing A. If we won't be packing A, we alias to - the _pc variables so that code further down can unconditionally - reference the _pa variables. Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pb; \ - if ( packb ) { bszids_pb = &bszids_ic[1]; \ - thread_pb = bli_thrinfo_sub_node( thread_ic ); } \ - else { bszids_pb = &bszids_ic[0]; \ - thread_pb = thread_ic; } \ -\ + millikernel. */ + /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ + + /* Compute the IC loop thread range for the current thread. 
*/ + dim_t ic_start, ic_end; + bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); + const dim_t n_local = ic_end - ic_start; + + /* Compute number of primary and leftover components of the IC loop. */ + /*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ + const dim_t ic_left = n_local % MC; + + /* Loop over the n dimension (MC rows at a time). */ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) + { + /* Calculate the thread's current IC block dimension. */ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); + + const char* b_ic = b_pc + ii * icstep_b; + char* c_ic = c_jc + ii * icstep_c; + + char* b_use; + inc_t rs_b_use, cs_b_use, ps_b_use; + /* Determine the packing buffer and related parameters for matrix B. (If B will not be packed, then b_use will be set to point to b and the _b_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate implementation based on the schema deduced from the stor_id. NOTE: packing matrix B in this panel-block algorithm corresponds - to packing matrix A in the block-panel algorithm. */ \ - PASTEMAC(ch,packm_sup_b) \ - ( \ - packb, \ - BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \ - stor_id, /* a "block of A". */ \ - BLIS_NO_TRANSPOSE, \ - KC, MC, /* This "block of A" is (at most) KC x MC. */ \ - kc_cur, mc_cur, NR, \ - &one_local, \ - b_ic, rs_b, cs_b, \ - &b_use, &rs_b_use, &cs_b_use, \ - &ps_b_use, \ - cntx, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ + to packing matrix A in the block-panel algorithm. */ + bli_packm_sup + ( + packb, + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ + stor_id, /* a "block of A". */ + BLIS_NO_TRANSPOSE, + dt, + MC, KC, /* This "block of A" is (at most) KC x MC. 
*/ + mc_cur, kc_cur, NR, + one, + b_ic, cs_b, rs_b, + ( void** )&b_use, &cs_b_use, &rs_b_use, + &ps_b_use, + cntx, + thread_pb + ); + /* Alias b_use so that it's clear this is our current block of - matrix B. */ \ - ctype* b_ic_use = b_use; \ -\ + matrix B. */ + const char* b_ic_use = b_use; + /* Embed the panel stride of B within the auxinfo_t object. The millikernel will query and use this to iterate through - micropanels of B. */ \ - bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jr = &bszids_pb[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ -\ - /* Compute number of primary and leftover components of the JR loop. */ \ - dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \ - dim_t jr_left = nc_cur % MR; \ -\ + micropanels of B. */ + bli_auxinfo_set_ps_b( ps_b_use, &aux ); + + /* Compute number of primary and leftover components of the JR loop. */ + dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; + dim_t jr_left = nc_cur % MR; + /* An optimization: allow the last jr iteration to contain up to MRE rows of C and A. (If MRE > MR, the mkernel has agreed to handle these cases.) Note that this prevents us from declaring jr_iter and jr_left as const. NOTE: We forgo this optimization when packing A - since packing an extended edge case is not yet supported. */ \ - if ( !packa && !is_mt ) \ - if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \ - { \ - jr_iter--; jr_left += MR; \ - } \ -\ - /* Compute the JR loop thread range for the current thread. */ \ - dim_t jr_start, jr_end; \ - bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ -\ - /* Loop over the m dimension (NR columns at a time). */ \ - /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ - for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ - { \ - const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? 
MR : jr_left ); \ -\ + since packing an extended edge case is not yet supported. */ + if ( !packa && !is_mt ) + if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) + { + jr_iter--; jr_left += MR; + } + + /* Compute the JR loop thread range for the current thread. */ + dim_t jr_start, jr_end; + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); + + /* Loop over the m dimension (NR columns at a time). */ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) + { + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); + /* - ctype* a_jr = a_pc + j * jrstep_a; \ - */ \ - ctype* a_jr = a_pc_use + j * ps_a_use; \ - ctype* c_jr = c_ic + j * jrstep_c; \ -\ + ctype* a_jr = a_pc + j * jrstep_a; + */ + const char* a_jr = a_pc_use + j * ps_a_use; + char* c_jr = c_ic + j * jrstep_c; + /* - const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ - const dim_t ir_left = mc_cur % NR; \ - */ \ -\ - /* Loop over the n dimension (MR rows at a time). */ \ - { \ - /* Invoke the gemmsup millikernel. */ \ - gemmsup_ker \ - ( \ - conja, \ - conjb, \ - nr_cur, /* Notice: nr_cur <= MR. */ \ - mc_cur, /* Recall: mc_cur partitions the n dimension! */ \ - kc_cur, \ - alpha_cast, \ - a_jr, rs_a_use, cs_a_use, \ - b_ic_use, rs_b_use, cs_b_use, \ - beta_use, \ - c_jr, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ + const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; + const dim_t ir_left = mc_cur % NR; + */ + + /* Loop over the n dimension (MR rows at a time). */ + { + /* Invoke the gemmsup millikernel. */ + gemmsup_ker + ( + conja, + conjb, + nr_cur, /* Notice: nr_cur <= MR. */ + mc_cur, /* Recall: mc_cur partitions the n dimension! 
*/ + kc_cur, + ( char* )buf_alpha, + ( char* )a_jr, rs_a_use, cs_a_use, + ( char* )b_ic_use, rs_b_use, cs_b_use, + ( char* )beta_use, + c_jr, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + /* NOTE: This barrier is only needed if we are packing A (since - that matrix is packed within the pc loop of this variant). */ \ - if ( packa ) bli_thread_barrier( thread_pa ); \ - } \ - } \ -\ - /* Release any memory that was acquired for packing matrices A and B. */ \ - PASTEMAC(ch,packm_sup_finalize_mem_a) \ - ( \ - packa, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTEMAC(ch,packm_sup_finalize_mem_b) \ - ( \ - packb, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ + that matrix is packed within the pc loop of this variant). */ + if ( packa ) bli_thread_barrier( thread_pa ); + } + } + + /* Release any memory that was acquired for packing matrices A and B. */ + bli_packm_sup_finalize_mem + ( + packa, + thread_pa + ); + bli_packm_sup_finalize_mem + ( + packb, + thread_pb + ); + /* -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n ) - // // -- var2m -------------------------------------------------------------------- // -static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m); - void bli_gemmsup_ref_var2m ( trans_t trans, @@ -718,67 +489,30 @@ void bli_gemmsup_ref_var2m const obj_t* b, const obj_t* beta, const obj_t* c, - stor3_t eff_id, + 
stor3_t stor_id, const cntx_t* cntx, - rntm_t* rntm, + const rntm_t* rntm, thrinfo_t* thread ) { -#if 0 - obj_t at, bt; - - bli_obj_alias_to( a, &at ); - bli_obj_alias_to( b, &bt ); - - // Induce transpositions on A and/or B if either object is marked for - // transposition. We can induce "fast" transpositions since they objects - // are guaranteed to not have structure or be packed. - if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } - if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } - - const num_t dt = bli_obj_dt( c ); - - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); - - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - - const dim_t k = bli_obj_width( &at ); - - void* buf_a = bli_obj_buffer_at_off( &at ); - const inc_t rs_a = bli_obj_row_stride( &at ); - const inc_t cs_a = bli_obj_col_stride( &at ); - - void* buf_b = bli_obj_buffer_at_off( &bt ); - const inc_t rs_b = bli_obj_row_stride( &bt ); - const inc_t cs_b = bli_obj_col_stride( &bt ); - - void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); + const num_t dt = bli_obj_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); -#else - const num_t dt = bli_obj_dt( c ); + bool packa = bli_rntm_pack_a( rntm ); + bool packb = bli_rntm_pack_b( rntm ); - const bool packa = bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); + conj_t conja = bli_obj_conj_status( a ); + conj_t conjb = bli_obj_conj_status( b ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); - - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); dim_t k; - const void* buf_a = 
bli_obj_buffer_at_off( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - const void* buf_b = bli_obj_buffer_at_off( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -811,513 +545,357 @@ void bli_gemmsup_ref_var2m } void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); -#endif + // These checks should happen WAY earlier +#if 0 - // Index into the type combination array to extract the correct - // function pointer. - FUNCPTR_T f = ftypes_var2m[dt]; + /* If m or n is zero, return immediately. */ + if ( bli_zero_dim2( m, n ) ) return; + + /* If k < 1 or alpha is zero, scale by beta and return. */ + if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) + { + if ( bli_thread_am_chief( thread ) ) + { + PASTEMAC(ch,scalm) + ( + BLIS_NO_CONJUGATE, + 0, + BLIS_NONUNIT_DIAG, + BLIS_DENSE, + m, n, + beta, + c, rs_c, cs_c + ); + } + return; + } + +#endif #if 1 // Optimize some storage/packing cases by transforming them into others. - // These optimizations are expressed by changing trans and/or eff_id. - bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); + // These optimizations are expressed by changing trans and/or stor_id. 
+ bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &stor_id, cntx ); #endif - if ( bli_is_notrans( trans ) ) + if ( bli_is_trans( trans ) ) + { + bool packtmp = packa; packa = packb; packb = packtmp; + conj_t conjtmp = conja; conja = conjb; conjb = conjtmp; + dim_t len_tmp = m; m = n; n = len_tmp; + const void* buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp; + inc_t str_tmp = rs_a; rs_a = cs_b; cs_b = str_tmp; + str_tmp = cs_a; cs_a = rs_b; rs_b = str_tmp; + str_tmp = rs_c; rs_c = cs_c; cs_c = str_tmp; + + stor_id = bli_stor3_trans( stor_id ); + } + + /* Query the context for various blocksizes. */ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); + const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); + + dim_t KC; + if ( packa && packb ) { - // Invoke the function. - f - ( - packa, - packb, - conja, - conjb, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, rs_a, cs_a, - ( void* )buf_b, rs_b, cs_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - eff_id, - ( cntx_t* )cntx, - rntm, - thread - ); + KC = KC0; } - else + else if ( packb ) { - // Invoke the function (transposing the operation). - f - ( - packb, // swap the pack values. - packa, - conjb, // swap the conj values. - conja, - n, // swap the m and n dimensions. - m, - k, - ( void* )buf_alpha, - ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. - ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. - ( void* )buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. - bli_stor3_trans( eff_id ), // transpose the stor3_t id. 
- ( cntx_t* )cntx, - rntm, - thread - ); + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else if ( packa ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else /* if ( !packa && !packb ) */ + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( m <= MR && n <= NR ) KC = KC0; + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; + else KC = (( KC0 / 5 ) / 4 ) * 4; } -} - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* If m or n is zero, return immediately. */ \ - if ( bli_zero_dim2( m, n ) ) return; \ -\ - /* If k < 1 or alpha is zero, scale by beta and return. */ \ - if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - PASTEMAC(ch,scalm) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m, n, \ - beta, \ - c, rs_c, cs_c \ - ); \ - } \ - return; \ - } \ -\ - /* Query the context for various blocksizes. 
*/ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ -\ - dim_t KC; \ - if ( packa && packb ) \ - { \ - KC = KC0; \ - } \ - else if ( packb ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else if ( packa ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else /* if ( !packa && !packb ) */ \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( m <= MR && n <= NR ) KC = KC0; \ - else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ - else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ - else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ - else KC = (( KC0 / 5 ) / 4 ) * 4; \ - } \ -\ /* Query the maximum blocksize for NR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ - const dim_t NRE = NRM - NR; \ -\ - /* Compute partitioning step values for each matrix of each loop. 
*/ \ - const inc_t jcstep_c = cs_c; \ - const inc_t jcstep_b = cs_b; \ -\ - const inc_t pcstep_a = cs_a; \ - const inc_t pcstep_b = rs_b; \ -\ - const inc_t icstep_c = rs_c; \ - const inc_t icstep_a = rs_a; \ -\ - const inc_t jrstep_c = cs_c * NR; \ -\ + extension for the final iteration. */ + const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); + const dim_t NRE = NRM - NR; + + /* Compute partitioning step values for each matrix of each loop. */ + const inc_t jcstep_c = cs_c * dt_size; + const inc_t jcstep_b = cs_b * dt_size; + + const inc_t pcstep_a = cs_a * dt_size; + const inc_t pcstep_b = rs_b * dt_size; + + const inc_t icstep_c = rs_c * dt_size; + const inc_t icstep_a = rs_a * dt_size; + + const inc_t jrstep_c = cs_c * NR * dt_size; + /* - const inc_t jrstep_b = cs_b * NR; \ - ( void )jrstep_b; \ -\ - const inc_t irstep_c = rs_c * MR; \ - const inc_t irstep_a = rs_a * MR; \ - */ \ -\ + const inc_t jrstep_b = cs_b * NR; + ( void )jrstep_b; + + const inc_t irstep_c = rs_c * MR; + const inc_t irstep_a = rs_a * MR; + */ + /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ -\ - ctype* a_00 = a; \ - ctype* b_00 = b; \ - ctype* c_00 = c; \ - ctype* alpha_cast = alpha; \ - ctype* beta_cast = beta; \ -\ - /* Make local copies of beta and one scalars to prevent any unnecessary - sharing of cache lines between the cores' caches. */ \ - ctype beta_local = *beta_cast; \ - ctype one_local = *PASTEMAC(ch,1); \ -\ - auxinfo_t aux; \ -\ - /* Parse and interpret the contents of the rntm_t object to properly - set the ways of parallelism for each loop. */ \ - /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ -\ - /* Initialize a mem_t entry for A and B. Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. 
An alternative way of initializing the - mem_t entries is: - - bli_mem_clear( &mem_a ); \ - bli_mem_clear( &mem_b ); \ - */ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ -\ - /* Define an array of bszid_t ids, which will act as our substitute for - the cntl_t tree. */ \ - /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \ - bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* bszids; \ -\ - /* Set the bszids pointer to the correct bszids array above based on which - matrices (if any) are being packed. */ \ - if ( packa ) { if ( packb ) bszids = bszids_packab; \ - else bszids = bszids_packa; } \ - else { if ( packb ) bszids = bszids_packb; \ - else bszids = bszids_nopack; } \ -\ - /* Determine whether we are using more than one thread. */ \ - const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ -\ - thrinfo_t* thread_jc = NULL; \ - thrinfo_t* thread_pc = NULL; \ - thrinfo_t* thread_pb = NULL; \ - thrinfo_t* thread_ic = NULL; \ - thrinfo_t* thread_pa = NULL; \ - thrinfo_t* thread_jr = NULL; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jc = bszids; \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ -\ - /* Compute the JC loop thread range for the current thread. */ \ - dim_t jc_start, jc_end; \ - bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ - const dim_t n_local = jc_end - jc_start; \ -\ - /* Compute number of primary and leftover components of the JC loop. 
*/ \ - /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = n_local % NC; \ -\ - /* Loop over the n dimension (NC rows/columns at a time). */ \ - /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ - for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ - { \ - /* Calculate the thread's current JC block dimension. */ \ - const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); \ -\ - ctype* b_jc = b_00 + jj * jcstep_b; \ - ctype* c_jc = c_00 + jj * jcstep_c; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ -\ - /* Compute the PC loop thread range for the current thread. */ \ - const dim_t pc_start = 0, pc_end = k; \ - const dim_t k_local = k; \ -\ - /* Compute number of primary and leftover components of the PC loop. */ \ - /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ - const dim_t pc_left = k_local % KC; \ -\ - /* Loop over the k dimension (KC rows/columns at a time). */ \ - /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ - for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ - { \ - /* Calculate the thread's current PC block dimension. */ \ - const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ -\ - ctype* a_pc = a_00 + pp * pcstep_a; \ - ctype* b_pc = b_jc + pp * pcstep_b; \ -\ - /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ -\ - ctype* b_use; \ - inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing B. If we won't be packing B, we alias to - the _pc variables so that code further down can unconditionally - reference the _pb variables. 
Note that *if* we will be packing - B, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pb; \ - if ( packb ) { bszids_pb = &bszids_pc[1]; \ - thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ - else { bszids_pb = &bszids_pc[0]; \ - thread_pb = thread_pc; } \ -\ + function pointer type. */ + gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); + + const char* a_00 = buf_a; + const char* b_00 = buf_b; + char* c_00 = buf_c; + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + + auxinfo_t aux; + + /* Determine whether we are using more than one thread. */ + const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); + + thrinfo_t* thread_jc = thread; + thrinfo_t* thread_pc = bli_thrinfo_sub_node( thread_jc ); + thrinfo_t* thread_pb = bli_thrinfo_sub_node( thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_sub_node( thread_pb ); + thrinfo_t* thread_pa = bli_thrinfo_sub_node( thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_sub_node( thread_pa ); + + /* Compute the JC loop thread range for the current thread. */ + dim_t jc_start, jc_end; + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); + const dim_t n_local = jc_end - jc_start; + + /* Compute number of primary and leftover components of the JC loop. */ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ + const dim_t jc_left = n_local % NC; + + /* Loop over the n dimension (NC rows/columns at a time). */ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) + { + /* Calculate the thread's current JC block dimension. */ + const dim_t nc_cur = ( NC <= jc_end - jj ? 
NC : jc_left ); + + const char* b_jc = b_00 + jj * jcstep_b; + char* c_jc = c_00 + jj * jcstep_c; + + /* Compute the PC loop thread range for the current thread. */ + const dim_t pc_start = 0, pc_end = k; + const dim_t k_local = k; + + /* Compute number of primary and leftover components of the PC loop. */ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ + const dim_t pc_left = k_local % KC; + + /* Loop over the k dimension (KC rows/columns at a time). */ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) + { + /* Calculate the thread's current PC block dimension. */ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); + + const char* a_pc = a_00 + pp * pcstep_a; + const char* b_pc = b_jc + pp * pcstep_b; + + /* Only apply beta to the first iteration of the pc loop. */ + const void* beta_use = ( pp == 0 ? buf_beta : one ); + + char* b_use; + inc_t rs_b_use, cs_b_use, ps_b_use; + /* Determine the packing buffer and related parameters for matrix B. (If B will not be packed, then a_use will be set to point to b and the _b_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. */ \ - PASTEMAC(ch,packm_sup_b) \ - ( \ - packb, \ - BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \ - stor_id, /* a "panel of B." */ \ - BLIS_NO_TRANSPOSE, \ - KC, NC, /* This "panel of B" is (at most) KC x NC. */ \ - kc_cur, nc_cur, NR, \ - &one_local, \ - b_pc, rs_b, cs_b, \ - &b_use, &rs_b_use, &cs_b_use, \ - &ps_b_use, \ - cntx, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ + implementation based on the schema deduced from the stor_id. */ + bli_packm_sup + ( + packb, + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ + stor_id, /* a "panel of B." */ + BLIS_NO_TRANSPOSE, + dt, + NC, KC, /* This "panel of B" is (at most) KC x NC. 
*/ + nc_cur, kc_cur, NR, + one, + b_pc, cs_b, rs_b, + ( void** )&b_use, &cs_b_use, &rs_b_use, + &ps_b_use, + cntx, + thread_pb + ); + /* Alias b_use so that it's clear this is our current block of - matrix B. */ \ - ctype* b_pc_use = b_use; \ -\ + matrix B. */ + char* b_pc_use = b_use; + /* We don't need to embed the panel stride of B within the auxinfo_t object because this variant iterates through B in the jr loop, which occurs here, within the macrokernel, not within the - millikernel. */ \ - /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_ic = &bszids_pb[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ -\ - /* Compute the IC loop thread range for the current thread. */ \ - dim_t ic_start, ic_end; \ - bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ - const dim_t m_local = ic_end - ic_start; \ -\ - /* Compute number of primary and leftover components of the IC loop. */ \ - /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ - const dim_t ic_left = m_local % MC; \ -\ - /* Loop over the m dimension (MC rows at a time). */ \ - /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ - for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ - { \ - /* Calculate the thread's current IC block dimension. */ \ - const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ -\ - ctype* a_ic = a_pc + ii * icstep_a; \ - ctype* c_ic = c_jc + ii * icstep_c; \ -\ - ctype* a_use; \ - inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing B. If we won't be packing A, we alias to - the _ic variables so that code further down can unconditionally - reference the _pa variables. Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. 
to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pa; \ - if ( packa ) { bszids_pa = &bszids_ic[1]; \ - thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ - else { bszids_pa = &bszids_ic[0]; \ - thread_pa = thread_ic; } \ -\ + millikernel. */ + /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ + + /* Compute the IC loop thread range for the current thread. */ + dim_t ic_start, ic_end; + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); + const dim_t m_local = ic_end - ic_start; + + /* Compute number of primary and leftover components of the IC loop. */ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ + const dim_t ic_left = m_local % MC; + + /* Loop over the m dimension (MC rows at a time). */ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) + { + /* Calculate the thread's current IC block dimension. */ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); + + const char* a_ic = a_pc + ii * icstep_a; + char* c_ic = c_jc + ii * icstep_c; + + char* a_use; + inc_t rs_a_use, cs_a_use, ps_a_use; + /* Determine the packing buffer and related parameters for matrix A. (If A will not be packed, then a_use will be set to point to a and the _a_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. */ \ - PASTEMAC(ch,packm_sup_a) \ - ( \ - packa, \ - BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \ - stor_id, /* a "block of A." */ \ - BLIS_NO_TRANSPOSE, \ - MC, KC, /* This "block of A" is (at most) MC x KC. */ \ - mc_cur, kc_cur, MR, \ - &one_local, \ - a_ic, rs_a, cs_a, \ - &a_use, &rs_a_use, &cs_a_use, \ - &ps_a_use, \ - cntx, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ -\ + implementation based on the schema deduced from the stor_id. 
*/ + bli_packm_sup + ( + packa, + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ + stor_id, /* a "block of A." */ + BLIS_NO_TRANSPOSE, + dt, + MC, KC, /* This "block of A" is (at most) MC x KC. */ + mc_cur, kc_cur, MR, + one, + a_ic, rs_a, cs_a, + ( void** )&a_use, &rs_a_use, &cs_a_use, + &ps_a_use, + cntx, + thread_pa + ); + /* Alias a_use so that it's clear this is our current block of - matrix A. */ \ - ctype* a_ic_use = a_use; \ -\ + matrix A. */ + char* a_ic_use = a_use; + /* Embed the panel stride of A within the auxinfo_t object. The millikernel will query and use this to iterate through - micropanels of A (if needed). */ \ - bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jr = &bszids_pa[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ -\ - /* Compute number of primary and leftover components of the JR loop. */ \ - dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ - dim_t jr_left = nc_cur % NR; \ -\ + micropanels of A (if needed). */ + bli_auxinfo_set_ps_a( ps_a_use, &aux ); + + /* Compute number of primary and leftover components of the JR loop. */ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; + dim_t jr_left = nc_cur % NR; + /* An optimization: allow the last jr iteration to contain up to NRE columns of C and B. (If NRE > NR, the mkernel has agreed to handle these cases.) Note that this prevents us from declaring jr_iter and jr_left as const. NOTE: We forgo this optimization when packing B - since packing an extended edge case is not yet supported. */ \ - if ( !packb && !is_mt ) \ - if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \ - { \ - jr_iter--; jr_left += NR; \ - } \ -\ - /* Compute the JR loop thread range for the current thread. */ \ - dim_t jr_start, jr_end; \ - bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ - for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ - { \ - const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ -\ + since packing an extended edge case is not yet supported. */ + if ( !packb && !is_mt ) + if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) + { + jr_iter--; jr_left += NR; + } + + /* Compute the JR loop thread range for the current thread. */ + dim_t jr_start, jr_end; + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); + + /* Loop over the n dimension (NR columns at a time). */ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) + { + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); + /* - ctype* b_jr = b_pc_use + j * jrstep_b; \ - */ \ - ctype* b_jr = b_pc_use + j * ps_b_use; \ - ctype* c_jr = c_ic + j * jrstep_c; \ -\ + ctype* b_jr = b_pc_use + j * jrstep_b; + */ + const char* b_jr = b_pc_use + j * ps_b_use; + char* c_jr = c_ic + j * jrstep_c; + /* - const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ - const dim_t ir_left = mc_cur % MR; \ - */ \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - { \ - /* Invoke the gemmsup millikernel. */ \ - gemmsup_ker \ - ( \ - conja, \ - conjb, \ - mc_cur, \ - nr_cur, \ - kc_cur, \ - alpha_cast, \ - a_ic_use, rs_a_use, cs_a_use, \ - b_jr, rs_b_use, cs_b_use, \ - beta_use, \ - c_jr, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ + const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; + const dim_t ir_left = mc_cur % MR; + */ + + /* Loop over the m dimension (MR rows at a time). */ + { + /* Invoke the gemmsup millikernel. 
*/ + gemmsup_ker + ( + conja, + conjb, + mc_cur, + nr_cur, + kc_cur, + ( void* )buf_alpha, + a_ic_use, rs_a_use, cs_a_use, + ( void* )b_jr, rs_b_use, cs_b_use, + ( void* )beta_use, + c_jr, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + /* NOTE: This barrier is only needed if we are packing B (since - that matrix is packed within the pc loop of this variant). */ \ - if ( packb ) bli_thread_barrier( thread_pb ); \ - } \ - } \ -\ - /* Release any memory that was acquired for packing matrices A and B. */ \ - PASTEMAC(ch,packm_sup_finalize_mem_a) \ - ( \ - packa, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTEMAC(ch,packm_sup_finalize_mem_b) \ - ( \ - packb, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ + that matrix is packed within the pc loop of this variant). */ + if ( packb ) bli_thread_barrier( thread_pb ); + } + } + + /* Release any memory that was acquired for packing matrices A and B. */ + bli_packm_sup_finalize_mem + ( + packa, + thread_pa + ); + bli_packm_sup_finalize_mem + ( + packb, + thread_pb + ); + /* -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m ) - diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index df9a747abd..8bbb73ca94 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -50,7 +50,7 @@ void PASTEMAC0(opname) \ const obj_t* c, \ stor3_t eff_id, \ const cntx_t* cntx, 
\ - rntm_t* rntm, \ + const rntm_t* rntm, \ thrinfo_t* thread \ ); @@ -89,32 +89,6 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) -INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) - // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases diff --git a/frame/3/bli_l3_tapi_ex.c b/frame/3/bli_l3_tapi_ex.c index c934ba9493..130237ee4d 100644 --- a/frame/3/bli_l3_tapi_ex.c +++ b/frame/3/bli_l3_tapi_ex.c @@ -55,7 +55,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -115,7 +115,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -178,7 +178,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -236,7 +236,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -298,7 +298,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const 
cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -355,7 +355,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -418,7 +418,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -481,7 +481,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ @@ -545,7 +545,7 @@ void PASTEMAC2(ch,opname,BLIS_OAPI_EX_SUF) \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ) \ { \ bli_init_once(); \ diff --git a/frame/3/bli_l3_tapi_ex.h b/frame/3/bli_l3_tapi_ex.h index eb142af05d..d8610dee82 100644 --- a/frame/3/bli_l3_tapi_ex.h +++ b/frame/3/bli_l3_tapi_ex.h @@ -54,7 +54,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemm ) @@ -76,7 +76,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( hemm ) @@ -97,7 +97,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( herk ) @@ -119,7 +119,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype_r* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + 
const rntm_t* rntm \ ); INSERT_GENTPROTR_BASIC0( her2k ) @@ -139,7 +139,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( syrk ) @@ -161,7 +161,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( gemmt ) @@ -186,7 +186,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* beta, \ ctype* c, inc_t rs_c, inc_t cs_c, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm3 ) @@ -207,7 +207,7 @@ BLIS_EXPORT_BLIS void PASTEMAC2(ch,opname,BLIS_TAPI_EX_SUF) \ const ctype* a, inc_t rs_a, inc_t cs_a, \ ctype* b, inc_t rs_b, inc_t cs_b, \ const cntx_t* cntx, \ - rntm_t* rntm \ + const rntm_t* rntm \ ); INSERT_GENTPROT_BASIC0( trmm ) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index f866cfd4c5..cbf5895f68 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -36,137 +36,101 @@ #include "blis.h" #include "assert.h" -void bli_l3_thrinfo_init_single +thrinfo_t* bli_l3_thrinfo_create ( - thrinfo_t* thread + dim_t id, + thrcomm_t* gl_comm, + array_t* array, + const rntm_t* rntm, + const cntl_t* cntl ) { - bli_thrinfo_init_single( thread ); -} + // Create the root thrinfo_t node. 
+ thrinfo_t* root = bli_thrinfo_create_root + ( + gl_comm, + id, + bli_apool_array_elem( id, array ), + bli_pba_query() + ); -void bli_l3_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ) -{ - bli_thrinfo_free( rntm, thread ); -} + thrinfo_t* thread = bli_l3_thrinfo_grow( root, rntm, cntl ); + bli_thrinfo_set_sub_node( root, thread ); -void bli_l3_sup_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ) -{ - bli_thrinfo_free( rntm, thread ); + return root; } -// ----------------------------------------------------------------------------- - -void bli_l3_thrinfo_create_root +thrinfo_t* bli_l3_thrinfo_grow ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t** thread + thrinfo_t* thread_par, + const rntm_t* rntm, + const cntl_t* cntl ) { - // Query the global communicator for the total number of threads to use. - dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); - - // Use the thread id passed in as the global communicator id. - dim_t gl_comm_id = id; - - // Use the blocksize id of the current (root) control tree node to - // query the top-most ways of parallelism to obtain. - bszid_t bszid = bli_cntl_bszid( cntl ); - dim_t xx_way = bli_rntm_ways_for( bszid, rntm ); - - // Determine the work id for this thrinfo_t node. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); - - // Create the root thrinfo_t node. 
- *thread = bli_thrinfo_create - ( - rntm, - gl_comm, - gl_comm_id, - xx_way, - work_id, - TRUE, - bszid, - NULL - ); + const cntl_t* sub_prenode = bli_cntl_sub_prenode( cntl ); + const cntl_t* sub_node = bli_cntl_sub_node( cntl ); + const bszid_t bszid = bli_cntl_part( cntl ); + const dim_t n_way = bli_rntm_ways_for( bszid, rntm ); + + thrinfo_t* thread_cur = bli_thrinfo_split( n_way, thread_par ); + + if ( sub_prenode != NULL ) + { + thrinfo_t* thread_chl = bli_l3_thrinfo_grow( thread_cur, rntm, sub_prenode ); + bli_thrinfo_set_sub_prenode( thread_chl, thread_cur ); + } + + if ( sub_node != NULL ) + { + thrinfo_t* thread_chl = bli_l3_thrinfo_grow( thread_cur, rntm, sub_node ); + bli_thrinfo_set_sub_node( thread_chl, thread_cur ); + } + + return thread_cur; } // ----------------------------------------------------------------------------- -void bli_l3_sup_thrinfo_create_root +thrinfo_t* bli_l3_sup_thrinfo_create ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - thrinfo_t** thread + dim_t id, + thrcomm_t* gl_comm, + array_t* array, + const rntm_t* rntm ) { - // Query the global communicator for the total number of threads to use. - dim_t n_threads = bli_thrcomm_num_threads( gl_comm ); - - // Use the thread id passed in as the global communicator id. - dim_t gl_comm_id = id; - - // Use the BLIS_NC blocksize id to query the top-most ways of parallelism - // to obtain. Note that hard-coding BLIS_NC like this is a little bit of a - // hack, but it works fine since both of the sup algorithms (bp and pb) use - // the cache blocksizes down to the 3rd loop. (See the definitions of - // bli_rntm_calc_num_threads_bp() and bli_rntm_calc_num_threads_pb() for - // a concise enumeration of these bszid_t ids.) - const bszid_t bszid = BLIS_NC; - dim_t xx_way = bli_rntm_ways_for( BLIS_NC, rntm ); - - // Determine the work id for this thrinfo_t node. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); - // Create the root thrinfo_t node. 
- *thread = bli_thrinfo_create + thrinfo_t* root = bli_thrinfo_create_root ( - rntm, gl_comm, - gl_comm_id, - xx_way, - work_id, - TRUE, - bszid, - NULL + id, + bli_apool_array_elem( id, array ), + bli_pba_query() ); -} - -// ----------------------------------------------------------------------------- - -void bli_l3_sup_thrinfo_update_root - ( - rntm_t* rntm, - thrinfo_t* thread - ) -{ - // Query the current root for the total number of threads to use. - const dim_t n_threads = bli_thread_num_threads( thread ); - - // Query the current root for the (global) comm id. - const dim_t gl_comm_id = bli_thread_ocomm_id( thread ); - - // Query the rntm_t for the updated number of ways of parallelism. - const dim_t xx_way = bli_rntm_ways_for( BLIS_NC, rntm ); - - // Recompute the work id for this thrinfo_t node using the updated - // number of ways of parallelism. - dim_t work_id = gl_comm_id / ( n_threads / xx_way ); - // Save the updated ways of parallelism and work id to the thrinfo_t node. - bli_thrinfo_set_n_way( xx_way, thread ); - bli_thrinfo_set_work_id( work_id, thread ); + const dim_t n_way_jc = bli_rntm_ways_for( BLIS_NC, rntm ); + const dim_t n_way_pc = bli_rntm_ways_for( BLIS_KC, rntm ); + const dim_t n_way_ic = bli_rntm_ways_for( BLIS_MC, rntm ); + const dim_t n_way_jr = bli_rntm_ways_for( BLIS_NR, rntm ); + const dim_t n_way_ir = bli_rntm_ways_for( BLIS_MR, rntm ); + + thrinfo_t* thread_jc = bli_thrinfo_split( n_way_jc, root ); + thrinfo_t* thread_pc = bli_thrinfo_split( n_way_pc, thread_jc ); + thrinfo_t* thread_pb = bli_thrinfo_split( 1, thread_pc ); + thrinfo_t* thread_ic = bli_thrinfo_split( n_way_ic, thread_pb ); + thrinfo_t* thread_pa = bli_thrinfo_split( 1, thread_ic ); + thrinfo_t* thread_jr = bli_thrinfo_split( n_way_jr, thread_pa ); + thrinfo_t* thread_ir = bli_thrinfo_split( n_way_ir, thread_jr ); + + bli_thrinfo_set_sub_node( thread_jc, root ); + bli_thrinfo_set_sub_node( thread_pc, thread_jc ); + bli_thrinfo_set_sub_node( thread_pb, thread_pc ); 
+ bli_thrinfo_set_sub_node( thread_ic, thread_pb ); + bli_thrinfo_set_sub_node( thread_pa, thread_ic ); + bli_thrinfo_set_sub_node( thread_jr, thread_pa ); + bli_thrinfo_set_sub_node( thread_ir, thread_jr ); + + return root; } // ----------------------------------------------------------------------------- @@ -283,43 +247,43 @@ void bli_l3_thrinfo_print_gemm_paths if ( !jc_info ) goto print_thrinfo; - jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_comm_id = bli_thread_thread_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_thrinfo; - pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_comm_id = bli_thread_thread_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_thrinfo; - pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_comm_id = bli_thread_thread_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_thrinfo; - ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_comm_id = bli_thread_thread_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); if ( !pa_info ) goto print_thrinfo; - pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_comm_id = bli_thread_thread_id( pa_info ); pa_work_id = bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_thrinfo; - jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_comm_id = bli_thread_thread_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_thrinfo; - ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_comm_id = bli_thread_thread_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); print_thrinfo: @@ -493,25 +457,25 @@ void bli_l3_thrinfo_print_trsm_paths if ( !jc_info ) goto print_thrinfo; - jc_comm_id = bli_thread_ocomm_id( 
jc_info ); + jc_comm_id = bli_thread_thread_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); if ( !pc_info ) goto print_thrinfo; - pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_comm_id = bli_thread_thread_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); if ( !pb_info ) goto print_thrinfo; - pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_comm_id = bli_thread_thread_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); if ( !ic_info ) goto print_thrinfo; - ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_comm_id = bli_thread_thread_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); @@ -520,38 +484,38 @@ void bli_l3_thrinfo_print_trsm_paths if ( !pa_info0 ) goto check_thrinfo_node; - pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); + pa_comm_id0 = bli_thread_thread_id( pa_info0 ); pa_work_id0 = bli_thread_work_id( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); if ( !jr_info0 ) goto check_thrinfo_node; - jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); + jr_comm_id0 = bli_thread_thread_id( jr_info0 ); jr_work_id0 = bli_thread_work_id( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); if ( !ir_info0 ) goto check_thrinfo_node; - ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); + ir_comm_id0 = bli_thread_thread_id( ir_info0 ); ir_work_id0 = bli_thread_work_id( ir_info0 ); check_thrinfo_node: if ( !pa_info ) goto print_thrinfo; - pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_comm_id = bli_thread_thread_id( pa_info ); pa_work_id = bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); if ( !jr_info ) goto print_thrinfo; - jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_comm_id = bli_thread_thread_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = 
bli_thrinfo_sub_node( jr_info ); if ( !ir_info ) goto print_thrinfo; - ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_comm_id = bli_thread_thread_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); print_thrinfo: @@ -584,7 +548,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jc_comm_id = bli_thread_ocomm_id( jc_info ); + jc_comm_id = bli_thread_thread_id( jc_info ); jc_work_id = bli_thread_work_id( jc_info ); pc_info = bli_thrinfo_sub_node( jc_info ); @@ -595,7 +559,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pc_comm_id = bli_thread_ocomm_id( pc_info ); + pc_comm_id = bli_thread_thread_id( pc_info ); pc_work_id = bli_thread_work_id( pc_info ); pb_info = bli_thrinfo_sub_node( pc_info ); @@ -606,7 +570,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pb_comm_id = bli_thread_ocomm_id( pb_info ); + pb_comm_id = bli_thread_thread_id( pb_info ); pb_work_id = bli_thread_work_id( pb_info ); ic_info = bli_thrinfo_sub_node( pb_info ); @@ -617,7 +581,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ic_comm_id = bli_thread_ocomm_id( ic_info ); + ic_comm_id = bli_thread_thread_id( ic_info ); ic_work_id = bli_thread_work_id( ic_info ); pa_info0 = bli_thrinfo_sub_prenode( ic_info ); pa_info = bli_thrinfo_sub_node( ic_info ); @@ -630,7 +594,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pa_comm_id0 = bli_thread_ocomm_id( pa_info0 ); + pa_comm_id0 = bli_thread_thread_id( pa_info0 ); pa_work_id0 = bli_thread_work_id( pa_info0 ); jr_info0 = bli_thrinfo_sub_node( pa_info0 ); @@ -641,7 +605,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jr_comm_id0 = bli_thread_ocomm_id( jr_info0 ); + jr_comm_id0 = bli_thread_thread_id( jr_info0 ); jr_work_id0 = bli_thread_work_id( jr_info0 ); ir_info0 = bli_thrinfo_sub_node( jr_info0 ); @@ -652,7 +616,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ir_comm_id0 = bli_thread_ocomm_id( ir_info0 ); + ir_comm_id0 = bli_thread_thread_id( ir_info0 ); ir_work_id0 = bli_thread_work_id( ir_info0 ); } } @@ -666,7 
+630,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - pa_comm_id = bli_thread_ocomm_id( pa_info ); + pa_comm_id = bli_thread_thread_id( pa_info ); pa_work_id = bli_thread_work_id( pa_info ); jr_info = bli_thrinfo_sub_node( pa_info ); @@ -677,7 +641,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - jr_comm_id = bli_thread_ocomm_id( jr_info ); + jr_comm_id = bli_thread_thread_id( jr_info ); jr_work_id = bli_thread_work_id( jr_info ); ir_info = bli_thrinfo_sub_node( jr_info ); @@ -688,7 +652,7 @@ void bli_l3_thrinfo_print_trsm_paths } else { - ir_comm_id = bli_thread_ocomm_id( ir_info ); + ir_comm_id = bli_thread_thread_id( ir_info ); ir_work_id = bli_thread_work_id( ir_info ); } } @@ -732,7 +696,7 @@ void bli_l3_thrinfo_free_paths dim_t i; for ( i = 0; i < n_threads; ++i ) - bli_l3_thrinfo_free( rntm, threads[i] ); + bli_thrinfo_free( threads[i] ); bli_free_intl( threads ); } diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 37a3909fd6..b8f199a996 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -68,60 +68,30 @@ \ ( index % thread->n_way == thread->work_id % thread->n_way ) -// -// thrinfo_t APIs specific to level-3 operations. 
-// - -void bli_l3_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - thrinfo_t* sub_node - ); - -void bli_l3_thrinfo_init_single - ( - thrinfo_t* thread - ); - -void bli_l3_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ); - -void bli_l3_sup_thrinfo_free - ( - rntm_t* rntm, - thrinfo_t* thread - ); - // ----------------------------------------------------------------------------- -void bli_l3_thrinfo_create_root +thrinfo_t* bli_l3_thrinfo_create ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t** thread + dim_t id, + thrcomm_t* gl_comm, + array_t* array, + const rntm_t* rntm, + const cntl_t* cntl ); -void bli_l3_sup_thrinfo_create_root +thrinfo_t* bli_l3_thrinfo_grow ( - dim_t id, - thrcomm_t* gl_comm, - rntm_t* rntm, - thrinfo_t** thread + thrinfo_t* thread_par, + const rntm_t* rntm, + const cntl_t* cntl ); -void bli_l3_sup_thrinfo_update_root +thrinfo_t* bli_l3_sup_thrinfo_create ( - rntm_t* rntm, - thrinfo_t* thread + dim_t id, + thrcomm_t* gl_comm, + array_t* array, + const rntm_t* rntm ); void bli_l3_thrinfo_print_gemm_paths diff --git a/frame/3/gemm/bli_gemm.h b/frame/3/gemm/bli_gemm.h index ddd88e1633..1018f119d3 100644 --- a/frame/3/gemm/bli_gemm.h +++ b/frame/3/gemm/bli_gemm.h @@ -32,11 +32,12 @@ */ +// bli_gemm_var.h must be included before bli_gemm_cntl.h +#include "bli_gemm_var.h" + #include "bli_gemm_cntl.h" #include "bli_gemm_front.h" -#include "bli_gemm_var.h" - #include "bli_gemm_ind_opt.h" // Mixed datatype support. 
diff --git a/frame/3/gemm/bli_gemm_blk_var1.c b/frame/3/gemm/bli_gemm_blk_var1.c index 485779a902..8590991903 100644 --- a/frame/3/gemm/bli_gemm_blk_var1.c +++ b/frame/3/gemm/bli_gemm_blk_var1.c @@ -41,8 +41,7 @@ void bli_gemm_blk_var1 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -60,7 +59,7 @@ void bli_gemm_blk_var1 dim_t my_start, my_end; bli_thread_range_mdim ( - direct, thread, &ap, b, &cp, cntl, cntx, + direct, thread, &ap, b, &cp, cntl, &my_start, &my_end ); @@ -70,7 +69,7 @@ void bli_gemm_blk_var1 { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, &ap, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_part_params_blksz( cntl ) ); // Acquire partitions for A1 and C1. obj_t a1, c1; @@ -88,7 +87,6 @@ void bli_gemm_blk_var1 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/gemm/bli_gemm_blk_var2.c b/frame/3/gemm/bli_gemm_blk_var2.c index 254a310648..b58600f536 100644 --- a/frame/3/gemm/bli_gemm_blk_var2.c +++ b/frame/3/gemm/bli_gemm_blk_var2.c @@ -41,8 +41,7 @@ void bli_gemm_blk_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -60,7 +59,7 @@ void bli_gemm_blk_var2 dim_t my_start, my_end; bli_thread_range_ndim ( - direct, thread, a, &bp, &cp, cntl, cntx, + direct, thread, a, &bp, &cp, cntl, &my_start, &my_end ); @@ -70,7 +69,7 @@ void bli_gemm_blk_var2 { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, &bp, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_part_params_blksz( cntl ) ); // Acquire partitions for B1 and C1. 
obj_t b1, c1; @@ -88,7 +87,6 @@ void bli_gemm_blk_var2 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/gemm/bli_gemm_blk_var3.c b/frame/3/gemm/bli_gemm_blk_var3.c index 1bbec1d957..f84e4b8544 100644 --- a/frame/3/gemm/bli_gemm_blk_var3.c +++ b/frame/3/gemm/bli_gemm_blk_var3.c @@ -40,8 +40,7 @@ void bli_gemm_blk_var3 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -64,8 +63,8 @@ void bli_gemm_blk_var3 for ( dim_t i = 0; i < k_trans; i += b_alg ) { // Determine the current algorithmic blocksize. - b_alg = bli_l3_determine_kc( direct, i, k_trans, &ap, &bp, - bli_cntl_bszid( cntl ), cntx, cntl ); + b_alg = bli_determine_blocksize( direct, i, k_trans, &bp, + bli_cntl_part_params_blksz( cntl ) ); // Acquire partitions for A1 and B1. obj_t a1, b1; @@ -83,7 +82,6 @@ void bli_gemm_blk_var3 &BLIS_ONE, &cs, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/gemm/bli_gemm_cntl.c b/frame/3/gemm/bli_gemm_cntl.c index 052c812a33..41a21fd7cf 100644 --- a/frame/3/gemm/bli_gemm_cntl.c +++ b/frame/3/gemm/bli_gemm_cntl.c @@ -33,123 +33,174 @@ */ +#include "bli_cntx.h" +#include "bli_type_defs.h" #include "blis.h" cntl_t* bli_gemm_cntl_create ( - rntm_t* rntm, - opid_t family, - pack_t schema_a, - pack_t schema_b, - void_fp ker + goto_cntl_t* cntl, + opid_t family, + num_t dt_comp, + obj_t* a, + obj_t* b, + obj_t* c, + const cntx_t* cntx ) { - return bli_gemmbp_cntl_create( rntm, family, schema_a, schema_b, ker ); -} - -// ----------------------------------------------------------------------------- - -cntl_t* bli_gemmbp_cntl_create - ( - rntm_t* rntm, - opid_t family, - pack_t schema_a, - pack_t schema_b, - void_fp ker - ) -{ - void_fp macro_kernel_fp; - - // Choose the default macrokernel based on the operation family... 
- if ( family == BLIS_GEMM ) macro_kernel_fp = bli_gemm_ker_var2; - else if ( family == BLIS_GEMMT ) macro_kernel_fp = bli_gemmt_x_ker_var2; - else if ( family == BLIS_TRMM ) macro_kernel_fp = bli_trmm_xx_ker_var2; - else /* should never execute */ macro_kernel_fp = NULL; - - // ...unless a non-NULL kernel function pointer is passed in, in which - // case we use that instead. - if ( ker ) macro_kernel_fp = ker; + void_fp ker_fp = bli_obj_ker_fn( c ); + void_fp packa_fp = bli_obj_pack_fn( a ); + void_fp packb_fp = bli_obj_pack_fn( b ); + const gemm_params_t* ker_params = bli_obj_ker_params( c ); + const packm_params_t* packa_params = bli_obj_pack_params( a ); + const packm_params_t* packb_params = bli_obj_pack_params( b ); + + // Choose the default macrokernels based on the operation family unless a + // non-NULL kernel function pointer is passed in, in which case we use that instead. + if ( ker_fp == NULL ) + { + if ( family == BLIS_GEMM ) ker_fp = bli_gemm_ker_var2; + else if ( family == BLIS_GEMMT ) ker_fp = bli_gemmt_x_ker_var2; + else if ( family == BLIS_TRMM ) ker_fp = bli_trmm_xx_ker_var2; + } + + if ( packa_fp == NULL ) + { + packa_fp = bli_l3_packa; + } + + if ( packb_fp == NULL ) + { + packb_fp = bli_l3_packb; + } + + cntl->ker_params.params = ker_params; + cntl->ker_params.dt_comp = dt_comp; + cntl->ker_params.ukr_row_pref = bli_cntx_get_ukr_prefs_dt( dt_comp, BLIS_GEMM_UKR_ROW_PREF, cntx); + + cntl->mc.blksz = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_MC, cntx ); + cntl->nc.blksz = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_NC, cntx ); + cntl->kc.blksz = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_KC, cntx ); + cntl->mc.blksz_max = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_MC, cntx ); + cntl->nc.blksz_max = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_NC, cntx ); + cntl->kc.blksz_max = bli_cntx_get_blksz_max_dt( dt_comp, BLIS_KC, cntx ); + cntl->mc.bmult = bli_cntx_get_blksz_def_dt( dt_comp, BLIS_MR, cntx ); + cntl->nc.bmult = bli_cntx_get_blksz_def_dt( dt_comp, 
BLIS_NR, cntx ); + cntl->kc.bmult = 1; + + if ( ker_params != NULL ) + { + cntl->ker_params.ukr_row_pref = bli_mbool_get_dt( dt_comp, &ker_params->ukr_row_pref ); + + cntl->mc.bmult = bli_blksz_get_def( dt_comp, &ker_params->mr ); + cntl->nc.bmult = bli_blksz_get_def( dt_comp, &ker_params->nr ); + } + + cntl->mc.blksz = bli_align_dim_to_mult( cntl->mc.blksz, cntl->mc.bmult ); + cntl->nc.blksz = bli_align_dim_to_mult( cntl->nc.blksz, cntl->nc.bmult ); + cntl->mc.blksz_max = bli_align_dim_to_mult( cntl->mc.blksz_max, cntl->mc.bmult ); + cntl->nc.blksz_max = bli_align_dim_to_mult( cntl->nc.blksz_max, cntl->nc.bmult ); + + bli_l3_adjust_kc( a, b, cntl->mc.bmult, cntl->nc.bmult, &cntl->kc.blksz, &cntl->kc.blksz_max, family ); + + cntl->packa_params.params = packa_params; + cntl->packa_params.does_invert_diag = FALSE; + cntl->packa_params.rev_iter_if_upper = FALSE; + cntl->packa_params.rev_iter_if_lower = FALSE; + cntl->packa_params.pack_schema = BLIS_PACKED_ROW_PANELS; + cntl->packa_params.pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; + + cntl->packb_params.params = packb_params; + cntl->packb_params.does_invert_diag = FALSE; + cntl->packb_params.rev_iter_if_upper = FALSE; + cntl->packb_params.rev_iter_if_lower = FALSE; + cntl->packb_params.pack_schema = BLIS_PACKED_COL_PANELS; + cntl->packb_params.pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; // Create two nodes for the macro-kernel. - cntl_t* gemm_cntl_bu_ke = bli_gemm_cntl_create_node - ( - rntm, // the thread's runtime structure - family, // the operation family - BLIS_MR, // needed for bli_thrinfo_rgrow() - NULL, // variant function pointer not used - NULL // no sub-node; this is the leaf of the tree. 
- ); - - cntl_t* gemm_cntl_bp_bu = bli_gemm_cntl_create_node - ( - rntm, // the thread's runtime structure + bli_cntl_initialize_node + ( + &cntl->loop1, + family, // the operation family + BLIS_MR, // used for thread partitioning + NULL, // variant function pointer not used + NULL, // not used + NULL, // no sub-prenode; this is the leaf of the tree. + NULL // no sub-node; this is the leaf of the tree. + ); + + bli_cntl_initialize_node + ( + &cntl->loop2, family, - BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_fp, - gemm_cntl_bu_ke + BLIS_NR, + ker_fp, + &cntl->ker_params, + NULL, + &cntl->loop1 ); // Create a node for packing matrix A. - cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node - ( - rntm, - bli_l3_packa, // pack the left-hand operand - BLIS_MR, - BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_a, // normally BLIS_PACKED_ROW_PANELS - BLIS_BUFFER_FOR_A_BLOCK, - gemm_cntl_bp_bu + bli_cntl_initialize_node + ( + &cntl->packa, + family, + BLIS_NO_PART, + packa_fp, + &cntl->packa_params, + NULL, + &cntl->loop2 ); // Create a node for partitioning the m dimension by MC. - cntl_t* gemm_cntl_op_bp = bli_gemm_cntl_create_node - ( - rntm, + bli_cntl_initialize_node + ( + &cntl->loop3, family, - BLIS_MC, + BLIS_MC, bli_gemm_blk_var1, - gemm_cntl_packa + &cntl->mc, + NULL, + &cntl->packa ); // Create a node for packing matrix B. - cntl_t* gemm_cntl_packb = bli_packm_cntl_create_node - ( - rntm, - bli_l3_packb, // pack the right-hand operand - BLIS_NR, - BLIS_KR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? 
- schema_b, // normally BLIS_PACKED_COL_PANELS - BLIS_BUFFER_FOR_B_PANEL, - gemm_cntl_op_bp + bli_cntl_initialize_node + ( + &cntl->packb, + family, + BLIS_NO_PART, + packb_fp, + &cntl->packb_params, + NULL, + &cntl->loop3 ); // Create a node for partitioning the k dimension by KC. - cntl_t* gemm_cntl_mm_op = bli_gemm_cntl_create_node - ( - rntm, + bli_cntl_initialize_node + ( + &cntl->loop4, family, - BLIS_KC, + BLIS_KC, bli_gemm_blk_var3, - gemm_cntl_packb + &cntl->kc, + NULL, + &cntl->packb ); // Create a node for partitioning the n dimension by NC. - cntl_t* gemm_cntl_vl_mm = bli_gemm_cntl_create_node - ( - rntm, + bli_cntl_initialize_node + ( + &cntl->loop5, family, - BLIS_NC, + BLIS_NC, bli_gemm_blk_var2, - gemm_cntl_mm_op + &cntl->nc, + NULL, + &cntl->loop4 ); - return gemm_cntl_vl_mm; + return &cntl->loop5; } // ----------------------------------------------------------------------------- @@ -248,30 +299,3 @@ cntl_t* bli_gemmpb_cntl_create return gemm_cntl_vl_mm; } #endif - -// ----------------------------------------------------------------------------- - -void bli_gemm_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - bli_cntl_free( rntm, cntl, thread ); -} - -// ----------------------------------------------------------------------------- - -cntl_t* bli_gemm_cntl_create_node - ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - cntl_t* sub_node - ) -{ - return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); -} - diff --git a/frame/3/gemm/bli_gemm_cntl.h b/frame/3/gemm/bli_gemm_cntl.h index 5fa213ac41..fc427f8a1e 100644 --- a/frame/3/gemm/bli_gemm_cntl.h +++ b/frame/3/gemm/bli_gemm_cntl.h @@ -33,50 +33,69 @@ */ -cntl_t* bli_gemm_cntl_create - ( - rntm_t* rntm, - opid_t family, - pack_t schema_a, - pack_t schema_b, - void_fp ker - ); +typedef struct +{ + dim_t blksz; + dim_t blksz_max; + dim_t bmult; +} part_cntl_t; -// 
----------------------------------------------------------------------------- +typedef struct +{ + blksz_t mr, nr; + func_t ukr; + mbool_t ukr_row_pref; +} gemm_params_t; -cntl_t* bli_gemmbp_cntl_create - ( - rntm_t* rntm, - opid_t family, - pack_t schema_a, - pack_t schema_b, - void_fp ker - ); +typedef struct +{ + const gemm_params_t* params; + num_t dt_comp; + bool ukr_row_pref; +} gemm_ker_cntl_t; -#if 0 -cntl_t* bli_gemmpb_cntl_create - ( - opid_t family, - ); -#endif +typedef struct +{ + cntl_t loop5; + cntl_t loop4; + cntl_t loop3; + cntl_t loop2; + cntl_t loop1; + cntl_t packa; + cntl_t packb; + part_cntl_t mc; + part_cntl_t nc; + part_cntl_t kc; + packm_cntl_t packa_params; + packm_cntl_t packb_params; + gemm_ker_cntl_t ker_params; +} goto_cntl_t; -// ----------------------------------------------------------------------------- +BLIS_INLINE dim_t bli_cntl_part_blksz_def( const cntl_t* cntl ) +{ + part_cntl_t* ppp = ( part_cntl_t* )cntl->params; return ppp->blksz; +} -void bli_gemm_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); +BLIS_INLINE dim_t bli_cntl_part_blksz_max( const cntl_t* cntl ) +{ + part_cntl_t* ppp = ( part_cntl_t* )cntl->params; return ppp->blksz_max; +} + +BLIS_INLINE dim_t bli_cntl_part_bmult( const cntl_t* cntl ) +{ + part_cntl_t* ppp = ( part_cntl_t* )cntl->params; return ppp->bmult; +} // ----------------------------------------------------------------------------- -cntl_t* bli_gemm_cntl_create_node +cntl_t* bli_gemm_cntl_create ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - cntl_t* sub_node + goto_cntl_t* cntl, + opid_t family, + num_t dt_comp, + obj_t* a, + obj_t* b, + obj_t* c, + const cntx_t* cntx ); diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 428e2079f8..9863b70e11 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ -43,8 +43,7 @@ void bli_gemm_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* 
rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -95,55 +94,77 @@ void bli_gemm_front bli_obj_reset_origin( &b_local ); bli_obj_reset_origin( &c_local ); + num_t dt_comp = bli_gemm_md_comp_dt + ( + &a_local, + &b_local, + beta, + &c_local, + rntm + ); + + goto_cntl_t goto_cntl; + cntl_t* cntl = bli_gemm_cntl_create + ( + &goto_cntl, + BLIS_GEMM, + dt_comp, + &a_local, + &b_local, + &c_local, + cntx + ); + + // Adjust the perceived row storage of C based on which operand is complex + // in an CCR/CRC operation. This ensures that the real microkernel will output + // complex elements with unit imaginary stride. + bool row_stored = bli_obj_is_col_stored( c ); + if ( bli_gemm_md_is_ccr( a, b, c ) ) row_stored = FALSE; + if ( bli_gemm_md_is_crc( a, b, c ) ) row_stored = TRUE; + // An optimization: If C is stored by rows and the micro-kernel prefers // contiguous columns, or if C is stored by columns and the micro-kernel // prefers contiguous rows, transpose the entire operation to allow the // micro-kernel to access elements of C in its preferred manner. - if ( bli_cntx_dislikes_storage_of( &c_local, BLIS_GEMM_VIR_UKR, cntx ) ) + if ( row_stored != goto_cntl.ker_params.ukr_row_pref ) { bli_obj_swap( &a_local, &b_local ); bli_obj_induce_trans( &a_local ); bli_obj_induce_trans( &b_local ); bli_obj_induce_trans( &c_local ); - } - - // Set the pack schemas within the objects. - bli_l3_set_schemas( &a_local, &b_local, &c_local, cntx ); -#ifdef BLIS_ENABLE_GEMM_MD - cntx_t cntx_local; - - // If any of the storage datatypes differ, or if the computation precision - // differs from the storage precision of C, utilize the mixed datatype - // code path. - // NOTE: If we ever want to support the caller setting the computation - // domain explicitly, we will need to check the computation dt against the - // storage dt of C (instead of the computation precision against the - // storage precision of C). 
- if ( bli_obj_dt( &c_local ) != bli_obj_dt( &a_local ) || - bli_obj_dt( &c_local ) != bli_obj_dt( &b_local ) || - bli_obj_comp_prec( &c_local ) != bli_obj_prec( &c_local ) ) - { - // Handle mixed datatype cases in bli_gemm_md(), which may modify - // the objects or the context. (If the context is modified, cntx - // is adjusted to point to cntx_local.) - bli_gemm_md( &a_local, &b_local, beta, &c_local, &cntx_local, &cntx ); + const packm_params_t* tmp = goto_cntl.packa_params.params; + goto_cntl.packa_params.params = goto_cntl.packb_params.params; + goto_cntl.packb_params.params = tmp; } -#endif + + // Handle mixed datatype cases in bli_gemm_md(), which may modify + // the objects. + bli_gemm_md( &a_local, &b_local, beta, &c_local, rntm, cntx, &goto_cntl ); // Next, we handle the possibility of needing to typecast alpha to the // computation datatype and/or beta to the storage datatype of C. - // Attach alpha to B, and in the process typecast alpha to the target + // Attach alpha to A or B, and in the process typecast alpha to the target // datatype of the matrix (which in this case is equal to the computation // datatype). - bli_obj_scalar_attach( BLIS_NO_CONJUGATE, alpha, &b_local ); + // Attach to A if it is the only complex input matrix, just in case + // the scalar is also complex. + if ( bli_obj_is_complex( &a_local ) && + !bli_obj_is_complex( &b_local ) ) + { + bli_obj_scalar_attach( dt_comp | bli_obj_domain( a ), BLIS_NO_CONJUGATE, alpha, &a_local ); + } + else + { + bli_obj_scalar_attach( dt_comp | bli_obj_domain( b ), BLIS_NO_CONJUGATE, alpha, &b_local ); + } // Attach beta to C, and in the process typecast beta to the target // datatype of the matrix (which in this case is equal to the storage // datatype of C). 
- bli_obj_scalar_attach( BLIS_NO_CONJUGATE, beta, &c_local ); + bli_obj_scalar_attach( bli_obj_dt( c ), BLIS_NO_CONJUGATE, beta, &c_local ); // Change the alpha and beta pointers to BLIS_ONE since the values have // now been typecast and attached to the matrices above. @@ -191,7 +212,7 @@ void bli_gemm_front // constant like 2. And don't forget to use the same conditional for the // castm() and free() at the end. if ( - bli_obj_prec( &c_local ) != bli_obj_comp_prec( &c_local ) || + bli_obj_prec( &c_local ) != bli_dt_prec( dt_comp ) || bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) || is_ccr_mismatch || is_crc_mismatch @@ -209,14 +230,13 @@ void bli_gemm_front inc_t rs = bli_obj_row_stride( &c_local ); inc_t cs = bli_obj_col_stride( &c_local ); - num_t dt_ct = bli_obj_domain( &c_local ) | - bli_obj_comp_prec( &c_local ); + num_t dt_ct = bli_obj_domain( &c_local ) | dt_comp; // When performing the crr case, accumulate to a contiguously-stored // real matrix so we do not have to repeatedly update C with general // stride. 
if ( bli_gemm_md_is_crr( &a_local, &b_local, &c_local ) ) - dt_ct = BLIS_REAL | bli_obj_comp_prec( &c_local ); + dt_ct = dt_comp; // When performing the mismatched ccr or crc cases, now is the time // to specify the appropriate storage so the gemm_md_c2r_ref() virtual @@ -227,9 +247,6 @@ void bli_gemm_front bli_obj_create( dt_ct, m, n, rs, cs, &ct ); - const num_t dt_exec = bli_obj_exec_dt( &c_local ); - const num_t dt_comp = bli_obj_comp_dt( &c_local ); - bli_obj_set_target_dt( dt_ct, &ct ); bli_obj_set_exec_dt( dt_exec, &ct ); bli_obj_set_comp_dt( dt_comp, &ct ); diff --git a/frame/3/gemm/bli_gemm_front.h b/frame/3/gemm/bli_gemm_front.h index 744f88d1b2..9465c37d91 100644 --- a/frame/3/gemm/bli_gemm_front.h +++ b/frame/3/gemm/bli_gemm_front.h @@ -40,8 +40,7 @@ void bli_gemm_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); #ifdef BLIS_ENABLE_SMALL_MATRIX @@ -53,7 +52,7 @@ err_t bli_gemm_small const obj_t* beta, const obj_t* c, const cntx_t* cntx, - cntl_t* cntl + const cntl_t* cntl ); #endif diff --git a/frame/3/gemm/bli_gemm_ker_var2.c b/frame/3/gemm/bli_gemm_ker_var2.c index 814b47c0cf..2a00e7b46e 100644 --- a/frame/3/gemm/bli_gemm_ker_var2.c +++ b/frame/3/gemm/bli_gemm_ker_var2.c @@ -69,10 +69,10 @@ void PASTEMAC2(chx,chy,op) \ ); \ } -INSERT_GENTFUNC2_BASIC0(xbpys_mxn_fn); -INSERT_GENTFUNC2_MIXDP0(xbpys_mxn_fn); +INSERT_GENTFUNC2_BASIC0(xpbys_mxn_fn); +INSERT_GENTFUNC2_MIXDP0(xpbys_mxn_fn); -static xpbys_mxn_vft GENARRAY2_ALL(xbpys_mxn, xbpys_mxn_fn); +static xpbys_mxn_vft GENARRAY2_ALL(xpbys_mxn, xpbys_mxn_fn); void bli_gemm_ker_var2 @@ -81,8 +81,7 @@ void bli_gemm_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -189,7 +188,7 @@ void bli_gemm_ker_var2 // field of the params struct. 
If that function pointer is non-NULL, use it // as our microkernel instead of the default microkernel queried from the // cntx above. - const gemm_ker_params_t* params = bli_obj_ker_params( c ); + const gemm_params_t* params = bli_obj_ker_params( c ); gemm_ukr_vft user_ukr = params ? params->ukr : NULL; if ( user_ukr ) gemm_ukr = user_ukr; @@ -345,7 +344,7 @@ void bli_gemm_ker_var2 ); // Accumulate to C with type-casting. - xbpys_mxn[ dt_exec ][ dt_c ] + xpbys_mxn[ dt_exec ][ dt_c ] ( m_cur, n_cur, &ct, rs_ct, cs_ct, diff --git a/frame/3/gemm/bli_gemm_md.c b/frame/3/gemm/bli_gemm_md.c index a283c12354..efa67ec6c0 100644 --- a/frame/3/gemm/bli_gemm_md.c +++ b/frame/3/gemm/bli_gemm_md.c @@ -37,18 +37,59 @@ #ifdef BLIS_ENABLE_GEMM_MD -void bli_gemm_md +num_t bli_gemm_md_comp_dt ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm ) { - mddm_t doms; + //TODO: get comp dt from combination of A, B, C + num_t comp_dt = bli_obj_dt( c ); + + if ( bli_gemm_md_is_crr( a, b, c ) || + bli_gemm_md_is_ccr( a, b, c ) || + bli_gemm_md_is_crc( a, b, c ) ) + { + comp_dt = bli_dt_proj_to_real( comp_dt ); + } + + if ( bli_gemm_md_is_rcc( a, b, c ) && + beta is real ) + { + comp_dt = bli_dt_proj_to_real( comp_dt ); + } + + if ( bli_gemm_md_is_ccc( a, b, c ) ) + { + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) 
+ ind_t im = bli_gemmind_find_avail( bli_obj_dt( c ) ); + + if ( im == BLIS_1M && bli_rntm_ind( BLIS_1M, rntm ) ) + { + comp_dt = bli_dt_proj_to_real( comp_dt ); + } + } + + return comp_dt; +} +void bli_gemm_md + ( + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl + ) +{ const bool a_is_real = bli_obj_is_real( a ); const bool a_is_comp = bli_obj_is_complex( a ); const bool b_is_real = bli_obj_is_real( b ); @@ -59,105 +100,59 @@ if ( c_is_real && a_is_real && b_is_real ) { // C_real += A_real * B_real - doms = bli_gemm_md_rrr( a, b, beta, c, cntx_local, cntx ); + bli_gemm_md_rrr( a, b, beta, c, rntm, cntx, cntl ); } else if ( c_is_comp && a_is_comp && b_is_comp ) { // C_complex += A_complex * B_complex - doms = bli_gemm_md_ccc( a, b, beta, c, cntx_local, cntx ); + bli_gemm_md_ccc( a, b, beta, c, rntm, cntx, cntl ); } else if ( c_is_comp && a_is_comp && b_is_real ) { // C_complex += A_complex * B_real - doms = bli_gemm_md_ccr( a, b, beta, c, cntx_local, cntx ); + bli_gemm_md_ccr( a, b, beta, c, rntm, cntx, cntl ); } else if ( c_is_comp && a_is_real && b_is_comp ) { // C_complex += A_real * B_complex - doms = bli_gemm_md_crc( a, b, beta, c, cntx_local, cntx ); + bli_gemm_md_crc( a, b, beta, c, rntm, cntx, cntl ); } else if ( c_is_real && a_is_comp && b_is_comp ) { // C_real += A_complex * B_complex - doms = bli_gemm_md_rcc( a, b, beta, c, cntx_local, cntx ); + bli_gemm_md_rcc( a, b, beta, c, rntm, cntx, cntl ); } else if ( c_is_comp && a_is_real && b_is_real ) { // C_complex += A_real * B_real - doms = bli_gemm_md_crr( a, b, beta, c, cntx_local, cntx ); + bli_gemm_md_crr( a, b, beta, c, rntm, cntx, cntl ); } else if ( c_is_real && a_is_comp && b_is_real ) { // C_real += A_complex * B_real - doms = bli_gemm_md_rcr( a, b, beta, c, cntx_local, cntx ); + bli_gemm_md_rcr( a, b, beta, c, rntm, cntx, cntl ); } else if ( c_is_real && a_is_real && b_is_comp ) { // C_real += A_real *
B_complex - doms = bli_gemm_md_rrc( a, b, beta, c, cntx_local, cntx ); - } - else - { - doms.comp = BLIS_REAL; - doms.exec = BLIS_REAL; - - // This should never execute. - bli_abort(); + bli_gemm_md_rrc( a, b, beta, c, rntm, cntx, cntl ); } - - // Extract the computation and execution domains from the struct - // returned above. - dom_t dom_comp = doms.comp; - dom_t dom_exec = doms.exec; - - // Inspect the computation precision of C. (The user may have set - // this explicitly to request the precision in which the computation - // should take place.) - prec_t prec_comp = bli_obj_comp_prec( c ); - - // The computation precision tells us the target precision of A and B. - // NOTE: We don't set the target domain here. The target domain would - // either be unchanged, or would have been changed in one of the eight - // domain cases above. - bli_obj_set_target_prec( prec_comp, a ); - bli_obj_set_target_prec( prec_comp, b ); - - // Combine the execution domain with the computation precision to form - // the execution datatype. (The computation precision and execution - // precision are always equal.) - num_t dt_exec = dom_exec | prec_comp; - - // Set the execution datatypes of A, B, and C. - bli_obj_set_exec_dt( dt_exec, a ); - bli_obj_set_exec_dt( dt_exec, b ); - bli_obj_set_exec_dt( dt_exec, c ); - - // Combine the computation precision and computation domain to form the - // computation datatype. - num_t dt_comp = dom_comp | prec_comp; - - // Set the computation datatypes of A, B, and C. 
- bli_obj_set_comp_dt( dt_comp, a ); - bli_obj_set_comp_dt( dt_comp, b ); - bli_obj_set_comp_dt( dt_comp, c ); - } // ----------------------------------------------------------------------------- -// cab -mddm_t bli_gemm_md_ccr +// cab +void bli_gemm_md_ccr ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl ) { - mddm_t doms; - // We assume that the requested computation domain is complex. //dom_t dom_comp_in = bli_obj_comp_domain( c ); //dom_t dom_comp_in = BLIS_COMPLEX; @@ -253,19 +248,18 @@ mddm_t bli_gemm_md_ccr // ----------------------------------------------------------------------------- -// cab -mddm_t bli_gemm_md_crc +// cab +void bli_gemm_md_crc ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl ) { - mddm_t doms; - // We assume that the requested computation domain is complex. //dom_t dom_comp_in = bli_obj_comp_domain( c ); //dom_t dom_comp_in = BLIS_COMPLEX; @@ -361,19 +355,18 @@ mddm_t bli_gemm_md_crc // ----------------------------------------------------------------------------- -// cab -mddm_t bli_gemm_md_rcc +// cab +void bli_gemm_md_rcc ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl ) { - mddm_t doms; - // We assume that the requested computation domain is complex. 
//dom_t dom_comp_in = bli_obj_comp_domain( c ); //dom_t dom_comp_in = BLIS_COMPLEX; @@ -454,18 +447,19 @@ mddm_t bli_gemm_md_rcc // ----------------------------------------------------------------------------- -// cab -mddm_t bli_gemm_md_crr +// cab +void bli_gemm_md_crr ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl ) { - mddm_t doms; + void doms; #ifndef BLIS_ENABLE_GEMM_MD_EXTRA_MEM obj_t c_real; #endif @@ -511,18 +505,19 @@ mddm_t bli_gemm_md_crr // ----------------------------------------------------------------------------- -// cab -mddm_t bli_gemm_md_rcr +// cab +void bli_gemm_md_rcr ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl ) { - mddm_t doms; + void doms; obj_t a_real; // We assume that the requested computation domain is real. @@ -549,18 +544,19 @@ mddm_t bli_gemm_md_rcr // ----------------------------------------------------------------------------- -// cab -mddm_t bli_gemm_md_rrc +// cab +void bli_gemm_md_rrc ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl ) { - mddm_t doms; + void doms; obj_t b_real; // We assume that the requested computation domain is real. 
@@ -587,57 +583,56 @@ mddm_t bli_gemm_md_rrc // ----------------------------------------------------------------------------- -// cab -mddm_t bli_gemm_md_rrr +// cab +void bli_gemm_md_rrr ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl ) { - mddm_t doms; - - // We assume that the requested computation domain is real. - //dom_t dom_comp_in = bli_obj_comp_domain( c ); - //dom_t dom_comp_in = BLIS_REAL; - - // For rrr, the computation (ukernel) and execution domains are both - // real. - doms.comp = BLIS_REAL; - doms.exec = BLIS_REAL; - - // Use the default pack schemas in the objects. - - // Return the computation and execution domains. - return doms; + // Nothing to do. } // ----------------------------------------------------------------------------- -// cab -mddm_t bli_gemm_md_ccc +// cab +void bli_gemm_md_ccc ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx + obj_t* a, + obj_t* b, + const obj_t* beta, + obj_t* c, + const rntm_t* rntm, + const cntx_t* cntx, + goto_cntl_t* cntl ) { - mddm_t doms; - - // We assume that the requested computation domain is complex. - //dom_t dom_comp_in = bli_obj_comp_domain( c ); - //dom_t dom_comp_in = BLIS_COMPLEX; - - // For ccc, the computation (ukernel) and execution domains are both - // complex. - doms.comp = BLIS_COMPLEX; - doms.exec = BLIS_COMPLEX; + // Find the highest priority induced method that is both enabled and + // available for the current operation. (If an induced method is + // available but not enabled, or simply unavailable, BLIS_NAT will + // be returned here.) 
+ ind_t im = bli_gemmind_find_avail( dt ); + + if ( im == BLIS_1M && bli_rntm_ind( BLIS_1M, rntm ) ) + { + // Set the pack schemas based on the row preference of the real-domain + // microkernel + if ( cntl->ker_params.row_pref ) + { + cntl->packa_params.pack_schema = BLIS_PACKED_ROW_PANELS_1R; + cntl->packb_params.pack_schema = BLIS_PACKED_COL_PANELS_1E; + } + else + { + cntl->packa_params.pack_schema = BLIS_PACKED_ROW_PANELS_1E; + cntl->packb_params.pack_schema = BLIS_PACKED_COL_PANELS_1R; + } + } // Use the default pack schemas in the objects. diff --git a/frame/3/gemm/bli_gemm_md.h b/frame/3/gemm/bli_gemm_md.h index d71d97987a..601c18b5ef 100644 --- a/frame/3/gemm/bli_gemm_md.h +++ b/frame/3/gemm/bli_gemm_md.h @@ -32,59 +32,39 @@ */ -#include "bli_gemm_md_c2r_ref.h" - -// Define a local struct type that makes returning two values easier. -typedef struct mddm_s -{ - dom_t comp; - dom_t exec; -} mddm_t; - -void bli_gemm_md +num_t bli_gemm_md_comp_dt ( - obj_t* a, - obj_t* b, - const obj_t* beta, - obj_t* c, - cntx_t* cntx_local, - const cntx_t** cntx - ); -mddm_t bli_gemm_md_ccc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); -mddm_t bli_gemm_md_ccr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); -mddm_t bli_gemm_md_crc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); -mddm_t bli_gemm_md_rcc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); -mddm_t bli_gemm_md_rrc( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); -mddm_t bli_gemm_md_rcr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); -mddm_t bli_gemm_md_crr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); -mddm_t bli_gemm_md_rrr( obj_t* a, obj_t* b, const obj_t* beta, obj_t* c, cntx_t* cntx_l, const cntx_t** cntx ); - -// 
----------------------------------------------------------------------------- - -void bli_gemm_md_front - ( - const obj_t* alpha, - const obj_t* a, - const obj_t* b, + obj_t* a, + obj_t* b, const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + obj_t* c, + const rntm_t* rntm ); -void bli_gemm_md_zgemm - ( - const obj_t* alpha, - const obj_t* a, - const obj_t* b, - const obj_t* beta, - const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl +#undef GENTFUNC +#define GENTFUNC( opname ) \ +\ +void PASTEMAC0( opname ) \ + ( \ + obj_t* a, \ + obj_t* b, \ + const obj_t* beta, \ + obj_t* c, \ + const rntm_t* rntm, \ + const cntx_t* cntx, \ + goto_cntl_t* cntl \ ); +GENTFUNC( gemm_md ); +GENTFUNC( gemm_md_rrr ); +GENTFUNC( gemm_md_rrc ); +GENTFUNC( gemm_md_rcr ); +GENTFUNC( gemm_md_rcc ); +GENTFUNC( gemm_md_crr ); +GENTFUNC( gemm_md_crc ); +GENTFUNC( gemm_md_ccr ); +GENTFUNC( gemm_md_ccc ); + // ----------------------------------------------------------------------------- BLIS_INLINE bool bli_gemm_md_is_crr( const obj_t* a, const obj_t* b, const obj_t* c ) @@ -100,8 +80,7 @@ BLIS_INLINE bool bli_gemm_md_is_crr( const obj_t* a, const obj_t* b, const obj_t // execution domain of BLIS_REAL.) if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && - bli_obj_is_real( b ) && - bli_obj_exec_domain( c ) == BLIS_REAL ) + bli_obj_is_real( b ) ) r_val = TRUE; return r_val; @@ -120,8 +99,7 @@ BLIS_INLINE bool bli_gemm_md_is_ccr( const obj_t* a, const obj_t* b, const obj_t // execution domain of BLIS_COMPLEX.) if ( bli_obj_is_complex( c ) && bli_obj_is_complex( a ) && - bli_obj_is_real( b ) && - bli_obj_exec_domain( c ) == BLIS_COMPLEX ) + bli_obj_is_real( b ) ) r_val = TRUE; return r_val; @@ -140,8 +118,7 @@ BLIS_INLINE bool bli_gemm_md_is_crc( const obj_t* a, const obj_t* b, const obj_t // execution domain of BLIS_COMPLEX.) 
if ( bli_obj_is_complex( c ) && bli_obj_is_real( a ) && - bli_obj_is_complex( b ) && - bli_obj_exec_domain( c ) == BLIS_COMPLEX ) + bli_obj_is_complex( b ) ) r_val = TRUE; return r_val; diff --git a/frame/3/gemm/bli_gemm_var.h b/frame/3/gemm/bli_gemm_var.h index d3109e6003..63060a0993 100644 --- a/frame/3/gemm/bli_gemm_var.h +++ b/frame/3/gemm/bli_gemm_var.h @@ -34,16 +34,6 @@ */ -// -// gemm kernel parameter struct. -// - -typedef struct -{ - gemm_ukr_vft ukr; -} gemm_ker_params_t; - - // // Prototype object-based interfaces. // @@ -57,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index e291b5f275..5526151dc2 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -43,8 +43,7 @@ void bli_gemmt_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ) { bli_init_once(); @@ -69,6 +68,10 @@ void bli_gemmt_front return; } + // Create an initial control tree, assuming native execution. + goto_cntl_t gemm_cntl; + cntl_t* cntl = bli_gemm_cntl_create( &gemm_cntl, BLIS_GEMMT, dt_comp, a, b, c, cntx ); + // Alias A, B, and C in case we need to apply transformations. 
bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); diff --git a/frame/3/gemmt/bli_gemmt_front.h b/frame/3/gemmt/bli_gemmt_front.h index 0f2a9ada2b..4a7cd7abe9 100644 --- a/frame/3/gemmt/bli_gemmt_front.h +++ b/frame/3/gemmt/bli_gemmt_front.h @@ -41,6 +41,5 @@ void bli_gemmt_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); diff --git a/frame/3/gemmt/bli_gemmt_l_ker_var2.c b/frame/3/gemmt/bli_gemmt_l_ker_var2.c index c6fc045b45..f11aa0d022 100644 --- a/frame/3/gemmt/bli_gemmt_l_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_l_ker_var2.c @@ -35,30 +35,46 @@ #include "blis.h" -#define FUNCPTR_T gemmt_fp +typedef void (*xpbys_mxn_l_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,gemmt_l_ker_var2); +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_l) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} +INSERT_GENTFUNC_BASIC0(xpbys_mxn_l_fn); + +static xpbys_mxn_l_vft GENARRAY(xpbys_mxn_l, xpbys_mxn_l_fn); void bli_gemmt_l_ker_var2 ( @@ -66,21 +82,21 @@ void bli_gemmt_l_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* 
cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffc = bli_obj_diag_offset( c ); + doff_t diagoffc = bli_obj_diag_offset( c ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -109,97 +125,34 @@ void bli_gemmt_l_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffc, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, is_a, - pd_a, ps_a, - ( void* )buf_b, rs_b, is_b, - pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} - + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + /*const dim_t PACKMR = cs_a;*/ + /*const dim_t PACKNR = rs_b;*/ -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ + function pointer type. */ + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_l_vft xpbys_mxn_l_ukr = xpbys_mxn_l[ dt ]; + /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, ip; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ + column-stored as well. */ + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); + + const void* zero = bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + /* Assumptions/assertions: rs_a == 1 @@ -212,100 +165,97 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + */ + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current panel of C is entirely above the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; \ -\ + it is not stored. So we do nothing. */ + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) return; + /* If there is a zero region above where the diagonal of C intersects the left edge of the panel, adjust the pointer to C and A and treat - this case as if the diagonal offset were zero. */ \ - if ( diagoffc < 0 ) \ - { \ - ip = -diagoffc / MR; \ - i = ip * MR; \ - m = m - i; \ - diagoffc = -diagoffc % MR; \ - c_cast = c_cast + (i )*rs_c; \ - a_cast = a_cast + (ip )*ps_a; \ - } \ -\ + this case as if the diagonal offset were zero. */ + if ( diagoffc < 0 ) + { + dim_t ip = -diagoffc / MR; + dim_t i = ip * MR; + m = m - i; + diagoffc = -diagoffc % MR; + c_cast = c_cast + (i )*rs_c; + a_cast = a_cast + (ip )*ps_a; + } + /* If there is a zero region to the right of where the diagonal of C intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffc + m < n ) \ - { \ - n = diagoffc + m; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. */ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ + "no-op" iterations from executing. 
*/ + if ( diagoffc + m < n ) + { + n = diagoffc + m; + } + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* Save the desired output datatype (indicating no typecasting). */ \ - /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a; + + inc_t cstep_b = ps_b; + + inc_t rstep_c = rs_c * MR; + inc_t cstep_c = cs_c * NR; + + /* Save the pack schemas of A and B to the auxinfo_t object. */ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + /* Save the imaginary stride of A and B to the auxinfo_t object. */ + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + /* Save the desired output datatype (indicating no typecasting). */ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. 
*/ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ + 1st (ir) loop around the microkernel. */ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + /* Query the number of threads and thread ids for each loop. */ + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + dim_t ir_nt = bli_thread_n_way( caucus ); + dim_t ir_tid = bli_thread_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + /* Note that we partition the 2nd loop into two regions: the rectangular - part of C, and the triangular portion. */ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) \ - { \ + part of C, and the triangular portion. */ + dim_t n_iter_rct; + dim_t n_iter_tri; + + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) + { /* If the entire panel of C does not intersect the diagonal, there is no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ + loops. */ + n_iter_rct = n_iter; + n_iter_tri = 0; + } + else + { /* If the panel of C does intersect the diagonal, compute the number of iterations in the rectangular region by dividing NR into the diagonal offset. Any remainder from this integer division is discarded, which @@ -313,195 +263,181 @@ void PASTEMAC(ch,varname) \ as many columns of whole microtiles as possible without including any microtiles that intersect the diagonal. 
The number of iterations in the triangular (or trapezoidal) region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_rct = diagoffc / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ + number of iterations in the n dimension. */ + n_iter_rct = diagoffc / NR; + n_iter_tri = n_iter - n_iter_rct; + } + /* Determine the thread range and increment for the 2nd and 1st loops for the initial rectangular region of C (if it exists). NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ + slab or round-robin partitioning was requested at configure-time. */ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. 
*/ + const char* b2 = b1; + + /* Interior loop over the m dimension (MR rows at a time). */ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + region. */ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements on or below the diagonal. Otherwise, if the submatrix is strictly below the diagonal, we compute and store as we normally would. And if we're strictly above the diagonal, we do nothing and - continue. */ \ - { \ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ + continue. */ + { + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + /* If there is no triangular region, then we're done. */ + if ( n_iter_tri == 0 ) return; + /* Use round-robin assignment of micropanels to threads in the 2nd loop and the default (slab or rr) partitioning in the 1st loop for the - remaining triangular region of C. */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ + remaining triangular region of C. */ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + /* Advance the start and end iteration offsets for the triangular region - by the number of iterations used for the rectangular region. */ \ - jr_start += n_iter_rct; \ - jr_end += n_iter_rct; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). 
*/ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + by the number of iterations used for the rectangular region. */ + jr_start += n_iter_rct; + jr_end += n_iter_rct; + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + + /* Interior loop over the m dimension (MR rows at a time). */ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + /* Compute the diagonal offset for the submatrix at (i,j). */ + doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ + object. 
*/ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements on or below the diagonal. Otherwise, if the submatrix is strictly below the diagonal, we compute and store as we normally would. And if we're strictly above the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - MR, \ - NR, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_l)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -} + continue. */ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); -INSERT_GENTFUNC_BASIC0( gemmt_l_ker_var2 ) + /* Scale C and add the result to only the stored part. */ + xpbys_mxn_l_ukr( diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c ); + } + else if ( bli_is_strictly_below_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + /* Invoke the gemm micro-kernel. 
*/ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } +} diff --git a/frame/3/gemmt/bli_gemmt_u_ker_var2.c b/frame/3/gemmt/bli_gemmt_u_ker_var2.c index f64a84ef15..9010b36a2b 100644 --- a/frame/3/gemmt/bli_gemmt_u_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_u_ker_var2.c @@ -35,30 +35,46 @@ #include "blis.h" -#define FUNCPTR_T gemmt_fp +typedef void (*xpbys_mxn_u_vft) + ( + doff_t diagoff, + dim_t m, + dim_t n, + void* x, inc_t rs_x, inc_t cs_x, + void* b, + void* y, inc_t rs_y, inc_t cs_y + ); -typedef void (*FUNCPTR_T) - ( - doff_t diagoffc, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, inc_t is_a, - dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, inc_t is_b, - dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,gemmt_u_ker_var2); +#undef GENTFUNC +#define GENTFUNC(ctype,ch,op) \ +\ +void PASTEMAC(ch,op) \ + ( \ + doff_t diagoff, \ + dim_t m, \ + dim_t n, \ + void* x, inc_t rs_x, inc_t cs_x, \ + void* b, \ + void* y, inc_t rs_y, inc_t cs_y \ + ) \ +{ \ + ctype* restrict x_cast = x; \ + ctype* restrict b_cast = b; \ + ctype* restrict y_cast = y; \ +\ + PASTEMAC3(ch,ch,ch,xpbys_mxn_u) \ + ( \ + diagoff, \ + m, n, \ + x_cast, rs_x, cs_x, \ + b_cast, \ + y_cast, rs_y, cs_y \ + ); \ +} +INSERT_GENTFUNC_BASIC0(xpbys_mxn_u_fn); + +static xpbys_mxn_u_vft GENARRAY(xpbys_mxn_u, xpbys_mxn_u_fn); void bli_gemmt_u_ker_var2 ( @@ -66,21 +82,21 @@ void bli_gemmt_u_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffc = bli_obj_diag_offset( c ); + doff_t 
diagoffc = bli_obj_diag_offset( c ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -109,97 +125,34 @@ void bli_gemmt_u_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffc, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, is_a, - pd_a, ps_a, - ( void* )buf_b, rs_b, is_b, - pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} - + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + /*const dim_t PACKMR = cs_a;*/ + /*const dim_t PACKNR = rs_b;*/ -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffc, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, inc_t is_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, inc_t is_b, \ - dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - /*const dim_t PACKMR = cs_a;*/ \ - /*const dim_t PACKNR = rs_b;*/ \ -\ /* Query the context for the micro-kernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ + function pointer type. */ + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + xpbys_mxn_u_vft xpbys_mxn_u_ukr = xpbys_mxn_u[ dt ]; + /* Temporary C buffer for edge cases. Note that the strides of this temporary buffer are set so that they match the storage of the original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -\ - ctype* restrict zero = PASTEMAC(ch,0); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffc_ij; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t i, j, jp; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - auxinfo_t aux; \ -\ + column-stored as well. */ + char ct[ BLIS_STACK_BUF_MAX_SIZE ] + __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); + const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); + const inc_t rs_ct = ( col_pref ? 1 : NR ); + const inc_t cs_ct = ( col_pref ? 
MR : 1 ); + + const void* zero = bli_obj_buffer_for_const( dt, &BLIS_ZERO ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + /* Assumptions/assertions: rs_a == 1 @@ -212,102 +165,99 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + */ + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current panel of C is entirely below the diagonal, - it is not stored. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; \ -\ + it is not stored. So we do nothing. */ + if ( bli_is_strictly_below_diag_n( diagoffc, m, n ) ) return; + /* If there is a zero region to the left of where the diagonal of C intersects the top edge of the panel, adjust the pointer to C and B and treat this case as if the diagonal offset were zero. NOTE: It's possible that after this pruning that the diagonal offset - is still positive (though it is guaranteed to be less than NR). */ \ - if ( diagoffc > 0 ) \ - { \ - jp = diagoffc / NR; \ - j = jp * NR; \ - n = n - j; \ - diagoffc = diagoffc % NR; \ - c_cast = c_cast + (j )*cs_c; \ - b_cast = b_cast + (jp )*ps_b; \ - } \ -\ + is still positive (though it is guaranteed to be less than NR). */ + if ( diagoffc > 0 ) + { + dim_t jp = diagoffc / NR; + dim_t j = jp * NR; + n = n - j; + diagoffc = diagoffc % NR; + c_cast = c_cast + (j )*cs_c; + b_cast = b_cast + (jp )*ps_b; + } + /* If there is a zero region below where the diagonal of C intersects the right edge of the panel, shrink it to prevent "no-op" iterations - from executing. */ \ - if ( -diagoffc + n < m ) \ - { \ - m = -diagoffc + n; \ - } \ -\ - /* Clear the temporary C buffer in case it has any infs or NaNs. 
*/ \ - PASTEMAC(ch,set0s_mxn)( MR, NR, \ - ct, rs_ct, cs_ct ); \ -\ + from executing. */ + if ( -diagoffc + n < m ) + { + m = -diagoffc + n; + } + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( is_a, &aux ); \ - bli_auxinfo_set_is_b( is_b, &aux ); \ -\ - /* Save the desired output datatype (indicating no typecasting). */ \ - /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a; + + inc_t cstep_b = ps_b; + + inc_t rstep_c = rs_c * MR; + inc_t cstep_c = cs_c * NR; + + /* Save the pack schemas of A and B to the auxinfo_t object. */ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + /* Save the imaginary stride of A and B to the auxinfo_t object. */ + bli_auxinfo_set_is_a( is_a, &aux ); + bli_auxinfo_set_is_b( is_b, &aux ); + + /* Save the desired output datatype (indicating no typecasting). */ + /*bli_auxinfo_set_dt_on_output( dt, &aux );*/ + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. 
*/ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ + 1st (ir) loop around the microkernel. */ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + /* Query the number of threads and thread ids for each loop. */ + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + dim_t ir_nt = bli_thread_n_way( caucus ); + dim_t ir_tid = bli_thread_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) \ - { \ + part of C, and the rectangular portion. */ + dim_t n_iter_tri; + dim_t n_iter_rct; + + if ( bli_is_strictly_above_diag_n( diagoffc, m, n ) ) + { /* If the entire panel of C does not intersect the diagonal, there is no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ + loops. */ + n_iter_tri = 0; + n_iter_rct = n_iter; + } + else + { /* If the panel of C does intersect the diagonal, compute the number of iterations in the triangular (or trapezoidal) region by dividing NR into the number of rows in C. A non-zero remainder means we need to @@ -315,196 +265,182 @@ void PASTEMAC(ch,varname) \ to contain as few columns of whole microtiles as possible while still including all microtiles that intersect the diagonal. 
The number of iterations in the rectangular region is computed as the remaining - number of iterations in the n dimension. */ \ - n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ + number of iterations in the n dimension. */ + n_iter_tri = ( m + diagoffc ) / NR + ( ( m + diagoffc ) % NR ? 1 : 0 ); + n_iter_rct = n_iter - n_iter_tri; + } + /* Use round-robin assignment of micropanels to threads in the 2nd loop and the default (slab or rr) partitioning in the 1st loop for the - initial triangular region of C (if it exists). */ \ - bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - /* Compute the diagonal offset for the submatrix at (i,j). */ \ - diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. 
*/ \ - a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + initial triangular region of C (if it exists). */ + bli_thread_range_jrir_rr( thread, n_iter_tri, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir ( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + + /* Interior loop over the m dimension (MR rows at a time). */ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + /* Compute the diagonal offset for the submatrix at (i,j). */ + doff_t diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ + object. 
*/ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements on or below the diagonal. Otherwise, if the submatrix is strictly above the diagonal, we compute and store as we normally would. And if we're strictly below the diagonal, we do nothing and - continue. */ \ - if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - MR, \ - NR, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - zero, \ - ct, rs_ct, cs_ct, \ - &aux, \ - cntx \ - ); \ -\ - /* Scale C and add the result to only the stored part. */ \ - PASTEMAC(ch,xpbys_mxn_u)( diagoffc_ij, \ - m_cur, n_cur, \ - ct, rs_ct, cs_ct, \ - beta_cast, \ - c11, rs_c, cs_c ); \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ + continue. */ + if ( bli_intersects_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + MR, + NR, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )zero, + ct, rs_ct, cs_ct, + &aux, + ( cntx_t* )cntx + ); + + /* Scale C and add the result to only the stored part. */ + xpbys_mxn_u_ukr( diagoffc_ij, + m_cur, n_cur, + ct, rs_ct, cs_ct, + ( void* )beta_cast, + c11, rs_c, cs_c ); + } + else if ( bli_is_strictly_above_diag_n( diagoffc_ij, m_cur, n_cur ) ) + { + /* Invoke the gemm micro-kernel. 
*/ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + /* If there is no rectangular region, then we're done. */ + if ( n_iter_rct == 0 ) return; + /* Determine the thread range and increment for the 2nd loop of the remaining rectangular region of C (and also use default partitioning for the 1st loop). NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. */ \ - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ + slab or round-robin partitioning was requested at configure-time. */ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - /* Interior loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ + by the number of iterations used for the triangular region. */ + jr_start += n_iter_tri; + jr_end += n_iter_tri; + + /* Loop over the n dimension (NR columns at a time). 
*/ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + + /* Interior loop over the m dimension (MR rows at a time). */ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + /* No need to compute the diagonal offset for the rectangular - region. */ \ - /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + region. */ + /*diagoffc_ij = diagoffc - (doff_t)j*NR + (doff_t)i*MR;*/ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = bli_gemmt_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_gemmt_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + /* If the diagonal intersects the current MR x NR submatrix, we compute it the temporary buffer and then add in the elements on or below the diagonal. 
Otherwise, if the submatrix is strictly above the diagonal, we compute and store as we normally would. And if we're strictly below the diagonal, we do nothing and - continue. */ \ - { \ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ + continue. */ + { + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } } -INSERT_GENTFUNC_BASIC0( gemmt_u_ker_var2 ) - diff --git a/frame/3/gemmt/bli_gemmt_var.h b/frame/3/gemmt/bli_gemmt_var.h index 98d8f55633..eb6e160180 100644 --- a/frame/3/gemmt/bli_gemmt_var.h +++ b/frame/3/gemmt/bli_gemmt_var.h @@ -47,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* ah, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -81,7 +80,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/gemmt/bli_gemmt_x_ker_var2.c b/frame/3/gemmt/bli_gemmt_x_ker_var2.c index 76fe106b08..0ccefc2498 100644 --- a/frame/3/gemmt/bli_gemmt_x_ker_var2.c +++ b/frame/3/gemmt/bli_gemmt_x_ker_var2.c @@ -46,8 +46,7 @@ void bli_gemmt_x_ker_var2 const obj_t* ah, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -68,7 +67,6 @@ void bli_gemmt_x_ker_var2 ah, c, cntx, - rntm, cntl, thread ); diff --git a/frame/3/hemm/bli_hemm_front.c b/frame/3/hemm/bli_hemm_front.c index c39703503d..22ed8a51f4 100644 --- a/frame/3/hemm/bli_hemm_front.c +++ b/frame/3/hemm/bli_hemm_front.c @@ -44,7 +44,7 @@ void bli_hemm_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git 
a/frame/3/hemm/bli_hemm_front.h b/frame/3/hemm/bli_hemm_front.h index 63eb91cd3a..766be00898 100644 --- a/frame/3/hemm/bli_hemm_front.h +++ b/frame/3/hemm/bli_hemm_front.h @@ -40,7 +40,5 @@ void bli_hemm_front const obj_t* b, const obj_t* beta, const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const cntx_t* cntx ); diff --git a/frame/3/symm/bli_symm_front.c b/frame/3/symm/bli_symm_front.c index c9aada9893..ab32d3b386 100644 --- a/frame/3/symm/bli_symm_front.c +++ b/frame/3/symm/bli_symm_front.c @@ -44,7 +44,7 @@ void bli_symm_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/symm/bli_symm_front.h b/frame/3/symm/bli_symm_front.h index 417cb9acb2..cc5f3fc431 100644 --- a/frame/3/symm/bli_symm_front.h +++ b/frame/3/symm/bli_symm_front.h @@ -40,7 +40,5 @@ void bli_symm_front const obj_t* b, const obj_t* beta, const obj_t* c, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const cntx_t* cntx ); diff --git a/frame/3/trmm/bli_trmm_front.c b/frame/3/trmm/bli_trmm_front.c index edd4ce1efb..00f6d99279 100644 --- a/frame/3/trmm/bli_trmm_front.c +++ b/frame/3/trmm/bli_trmm_front.c @@ -43,7 +43,7 @@ void bli_trmm_front const obj_t* b, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trmm/bli_trmm_front.h b/frame/3/trmm/bli_trmm_front.h index cfefdd39bc..75e7396d18 100644 --- a/frame/3/trmm/bli_trmm_front.h +++ b/frame/3/trmm/bli_trmm_front.h @@ -38,7 +38,5 @@ void bli_trmm_front const obj_t* alpha, const obj_t* a, const obj_t* b, - const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const cntx_t* cntx ); diff --git a/frame/3/trmm/bli_trmm_ll_ker_var2.c b/frame/3/trmm/bli_trmm_ll_ker_var2.c index f5476b2cad..ac161e3dfc 100644 --- a/frame/3/trmm/bli_trmm_ll_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ll_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - 
doff_t diagoffa, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trmm_ll_ker_var2); - - void bli_trmm_ll_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffa = bli_obj_diag_offset( a ); + doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -105,89 +82,23 @@ void bli_trmm_ll_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffa, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} - + /* Alias some constants to simpler names. 
*/ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ /* Query the context for the micro-kernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffa_i; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_a1011; \ - dim_t off_a1011; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_a_cur; \ - inc_t is_a_cur; \ - auxinfo_t aux; \ -\ + function pointer type. 
*/ + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + /* Assumptions/assertions: rs_a == 1 @@ -200,227 +111,199 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. */ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current block of A is entirely above the diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ -\ - /* Compute k_full. For all trmm, k_full is simply k. This is - needed because some parameter combinations of trmm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = k; \ -\ + it is implicitly zero. So we do nothing. */ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; + /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as if the diagonal offset were zero. 
This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a - multiple of MR and NR.) */ \ - if ( diagoffa < 0 ) \ - { \ - i = -diagoffa; \ - m = m - i; \ - diagoffa = 0; \ - c_cast = c_cast + (i )*rs_c; \ - } \ -\ + multiple of MR and NR.) */ + if ( diagoffa < 0 ) + { + m += diagoffa; + c_cast -= diagoffa * rs_c * dt_size; + diagoffa = 0; + } + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k; \ - istep_b = PACKNR * k_full; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + /* Save the pack schemas of A and B to the auxinfo_t object. */ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. 
Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ - dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ -\ - dim_t jr_start, jr_end; \ - /*dim_t ir_start, ir_end;*/ \ - dim_t jr_inc; \ -\ + 1st (ir) loop around the microkernel. */ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ + + /* Query the number of threads and thread ids for each loop. */ + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ + + dim_t jr_start, jr_end; + /*dim_t ir_start, ir_end;*/ + dim_t jr_inc; + /* Determine the thread range and increment for the 2nd loop. NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - a1 = a_cast; \ - c11 = c1; \ -\ - /* Loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - diagoffa_i = diagoffa + ( doff_t )i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ + slab or round-robin partitioning was requested at configure-time. + NOTE: Parallelism in the 1st loop is disabled for now. */ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + + const char* a1 = a_cast; + char* c11 = c1; + + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + /* If the current panel of A intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict b1_i; \ - ctype* restrict a2; \ -\ + and trmm3. */ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in - b1. */ \ - off_a1011 = 0; \ - k_a1011 = bli_min( diagoffa_i + MR, k ); \ -\ + b1. */ + dim_t off_a1011 = 0; + dim_t k_a1011 = bli_min( diagoffa_i + MR, k ); + /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_a_cur = k_a1011 * PACKMR; \ - is_a_cur += ( bli_is_odd( is_a_cur ) ? 
1 : 0 ); \ - ps_a_cur = is_a_cur; \ -\ - /* NOTE: ir loop parallelism disabled for now. */ \ - /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ -\ - b1_i = b1 + off_a1011 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + intersecting micro-panel. */ + inc_t ps_a_cur = k_a1011 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + /* NOTE: ir loop parallelism disabled for now. */ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ + + const char* b1_i = b1 + off_a1011 * PACKNR; + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_a1011, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - /*}*/ \ -\ - a1 += ps_a_cur; \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ - { \ - /* NOTE: ir loop parallelism disabled for now. */ \ - /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ -\ - ctype* restrict a2; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. 
*/ + gemm_ukr + ( + m_cur, + n_cur, + k_a1011, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1_i, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + /*}*/ + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) + { + /* NOTE: ir loop parallelism disabled for now. */ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - /*}*/ \ -\ - a1 += rstep_a; \ - } \ -\ - c11 += rstep_c; \ - } \ - } \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ -} + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. 
*/ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + /*}*/ -INSERT_GENTFUNC_BASIC0( trmm_ll_ker_var2 ) + a1 += rstep_a; + } + + c11 += rstep_c; + } + } +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: a1", MR, k_a1011, a1, 1, MR, "%4.1f", "" );*/ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ll_ker_var2: b1", k_a1011, NR, b1_i, NR, 1, "%4.1f", "" );*/ +} diff --git a/frame/3/trmm/bli_trmm_lu_ker_var2.c b/frame/3/trmm/bli_trmm_lu_ker_var2.c index df5b2dac55..a7bc862006 100644 --- a/frame/3/trmm/bli_trmm_lu_ker_var2.c +++ b/frame/3/trmm/bli_trmm_lu_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffa, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trmm_lu_ker_var2); - - void bli_trmm_lu_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffa = bli_obj_diag_offset( a ); + doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -105,89 +82,23 @@ void bli_trmm_lu_ker_var2 const void* 
buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffa, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} - + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ /* Query the context for the micro-kernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffa_i; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_a1112; \ - dim_t off_a1112; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_a_cur; \ - inc_t is_a_cur; \ - auxinfo_t aux; \ -\ + function pointer type. */ + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + /* Assumptions/assertions: rs_a == 1 @@ -200,235 +111,207 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. */ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current block of A is entirely below the diagonal, - it is implicitly zero. So we do nothing. 
*/ \ - if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ -\ - /* Compute k_full. For all trmm, k_full is simply k. This is - needed because some parameter combinations of trmm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = k; \ -\ + it is implicitly zero. So we do nothing. */ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; + /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to A since packm would have simply - skipped over the region that was not stored. */ \ - if ( diagoffa > 0 ) \ - { \ - i = diagoffa; \ - k = k - i; \ - diagoffa = 0; \ - b_cast = b_cast + i * PACKNR; \ - } \ -\ + skipped over the region that was not stored. */ + if ( diagoffa > 0 ) + { + k -= diagoffa; + b_cast += diagoffa * PACKNR * dt_size; + diagoffa = 0; + } + /* If there is a zero region below where the diagonal of A intersects the right side of the block, shrink it to prevent "no-op" iterations from - executing. */ \ - if ( -diagoffa + k < m ) \ - { \ - m = -diagoffa + k; \ - } \ -\ + executing. */ + if ( -diagoffa + k < m ) + { + m = -diagoffa + k; + } + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. 
*/ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k; \ - istep_b = PACKNR * k_full; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + /* Save the pack schemas of A and B to the auxinfo_t object. */ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - /*dim_t ir_nt = bli_thread_n_way( ir_thread ); \ - dim_t ir_tid = bli_thread_work_id( ir_thread );*/ \ -\ - dim_t jr_start, jr_end; \ - /*dim_t ir_start, ir_end;*/ \ - dim_t jr_inc; \ -\ + 1st (ir) loop around the microkernel. */ + /*thrinfo_t* ir_thread = bli_thrinfo_sub_node( thread );*/ + + /* Query the number of threads and thread ids for each loop. 
*/ + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + /*dim_t ir_nt = bli_thread_n_way( ir_thread ); + dim_t ir_tid = bli_thread_work_id( ir_thread );*/ + + dim_t jr_start, jr_end; + /*dim_t ir_start, ir_end;*/ + dim_t jr_inc; + /* Determine the thread range and increment for the 2nd loop. NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - a1 = a_cast; \ - c11 = c1; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - diagoffa_i = diagoffa + ( doff_t )i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ + slab or round-robin partitioning was requested at configure-time. + NOTE: Parallelism in the 1st loop is disabled for now. */ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + /*bli_thread_range_jrir_rr( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc );*/ + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + + const char* a1 = a_cast; + char* c11 = c1; + + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + /* If the current panel of A intersects the diagonal, scale C by beta. If it is strictly above the diagonal, scale by one. This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict b1_i; \ - ctype* restrict a2; \ -\ + and trmm3. */ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { /* Determine the offset to and length of the panel that was packed so we can index into the corresponding location in - b1. */ \ - off_a1112 = diagoffa_i; \ - k_a1112 = k - off_a1112; \ -\ + b1. */ + dim_t off_a1112 = diagoffa_i; + dim_t k_a1112 = k - off_a1112; + /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_a_cur = k_a1112 * PACKMR; \ - is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ - ps_a_cur = is_a_cur; \ -\ - /* NOTE: ir loop parallelism disabled for now. */ \ - /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ -\ - b1_i = b1 + off_a1112 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + intersecting micro-panel. */ + inc_t ps_a_cur = k_a1112 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 1 : 0 ); + ps_a_cur *= dt_size; + + /* NOTE: ir loop parallelism disabled for now. */ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ + + const char* b1_i = b1 + off_a1112 * PACKNR; + + /* Compute the addresses of the next panels of A and B. 
*/ + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_a1112, \ - alpha_cast, \ - a1, \ - b1_i, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - /*}*/ \ -\ - a1 += ps_a_cur; \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ - { \ - /* NOTE: ir loop parallelism disabled for now. */ \ - /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ \ -\ - ctype* restrict a2; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k_a1112, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1_i, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + /*}*/ + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) + { + /* NOTE: ir loop parallelism disabled for now. */ + /*if ( bli_trmm_my_iter( i, ir_thread ) ) {*/ + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. 
*/ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - /*}*/ \ -\ - a1 += rstep_a; \ - } \ -\ - c11 += rstep_c; \ - } \ - } \ -\ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ -} + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + /*}*/ + + a1 += rstep_a; + } -INSERT_GENTFUNC_BASIC0( trmm_lu_ker_var2 ) + c11 += rstep_c; + } + } + +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: a1", MR, k_a1112, a1, 1, MR, "%4.1f", "" );*/ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_lu_ker_var2: b1", k_a1112, NR, b1_i, NR, 1, "%4.1f", "" );*/ +} diff --git a/frame/3/trmm/bli_trmm_rl_ker_var2.c b/frame/3/trmm/bli_trmm_rl_ker_var2.c index 89f86aa3a8..94ed71d503 100644 --- a/frame/3/trmm/bli_trmm_rl_ker_var2.c +++ b/frame/3/trmm/bli_trmm_rl_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffb, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trmm_rl_ker_var2); - - void bli_trmm_rl_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t 
diagoffb = bli_obj_diag_offset( b ); + doff_t diagoffb = bli_obj_diag_offset( b ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -105,89 +82,23 @@ void bli_trmm_rl_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffb, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} - + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ /* Query the context for the micro-kernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffb_j; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_b1121; \ - dim_t off_b1121; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_b_cur; \ - inc_t is_b_cur; \ - auxinfo_t aux; \ -\ + function pointer type. */ + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + /* Assumptions/assertions: rs_a == 1 @@ -200,292 +111,268 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. */ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current panel of B is entirely above the diagonal, - it is implicitly zero. So we do nothing. 
*/ \ - if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ -\ + it is implicitly zero. So we do nothing. */ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = k; \ -\ + this unreduced value of k. */ + dim_t k_full = k; + /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to B since packm would have simply skipped over - the region that was not stored. */ \ - if ( diagoffb < 0 ) \ - { \ - j = -diagoffb; \ - k = k - j; \ - diagoffb = 0; \ - a_cast = a_cast + j * PACKMR; \ - } \ -\ + the region that was not stored. */ + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + /* If there is a zero region to the right of where the diagonal of B intersects the bottom of the panel, shrink it to prevent - "no-op" iterations from executing. */ \ - if ( diagoffb + k < n ) \ - { \ - n = diagoffb + k; \ - } \ -\ + "no-op" iterations from executing. */ + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. 
*/ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k_full; \ - istep_b = PACKNR * k; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( istep_a, &aux ); \ -\ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + /* Save the pack schemas of A and B to the auxinfo_t object. */ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + dim_t ir_nt = bli_thread_n_way( caucus ); + dim_t ir_tid = bli_thread_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + /* Note that we partition the 2nd loop into two regions: the rectangular - part of B, and the triangular portion. 
*/ \ - dim_t n_iter_rct; \ - dim_t n_iter_tri; \ -\ - if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) \ - { \ + part of B, and the triangular portion. */ + dim_t n_iter_rct; + dim_t n_iter_tri; + + if ( bli_is_strictly_below_diag_n( diagoffb, m, n ) ) + { /* If the entire panel of B does not intersect the diagonal, there is no triangular region, and therefore we can skip the second set of - loops. */ \ - n_iter_rct = n_iter; \ - n_iter_tri = 0; \ - } \ - else \ - { \ + loops. */ + n_iter_rct = n_iter; + n_iter_tri = 0; + } + else + { /* If the panel of B does intersect the diagonal, compute the number of iterations in the rectangular region by dividing NR into the diagonal offset. (There should never be any remainder in this division.) The number of iterations in the triangular (or trapezoidal) region is - computed as the remaining number of iterations in the n dimension. */ \ - n_iter_rct = diagoffb / NR; \ - n_iter_tri = n_iter - n_iter_rct; \ - } \ -\ + computed as the remaining number of iterations in the n dimension. */ + n_iter_rct = diagoffb / NR; + n_iter_tri = n_iter - n_iter_rct; + } + /* Determine the thread range and increment for the 2nd and 1st loops for the initial rectangular region of B (if it exists). NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? 
NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - { \ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + slab or round-robin partitioning was requested at configure-time. + NOTE: Parallelism in the 1st loop is disabled for now. */ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + + { + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. 
*/ + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ - /* If there is no triangular region, then we're done. */ \ - if ( n_iter_tri == 0 ) return; \ -\ + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + + /* If there is no triangular region, then we're done. */ + if ( n_iter_tri == 0 ) return; + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st loops for the remaining triangular region of B (if it exists). NOTE: We don't need to call bli_thread_range_jrir_rr() here since we employ a hack that calls for each thread to execute every iteration of the jr and ir loops but skip all but the pointer increment for - iterations that are not assigned to it. */ \ -\ + iterations that are not assigned to it. */ + /* Advance the starting b1 and c1 pointers to the positions corresponding - to the start of the triangular region of B. */ \ - jr_start = n_iter_rct; \ - b1 = b_cast + jr_start * cstep_b; \ - c1 = c_cast + jr_start * cstep_c; \ -\ - /* Loop over the n dimension (NR columns at a time). 
*/ \ - for ( j = jr_start; j < n_iter; ++j ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - diagoffb_j = diagoffb - ( doff_t )j*NR; \ -\ + to the start of the triangular region of B. */ + jr_start = n_iter_rct; + const char* b1 = b_cast + jr_start * cstep_b; + char* c1 = c_cast + jr_start * cstep_c; + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = jr_start; j < n_iter; ++j ) + { + doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + /* Determine the offset to the beginning of the panel that was packed so we can index into the corresponding location - in A. Then compute the length of that panel. */ \ - off_b1121 = bli_max( -diagoffb_j, 0 ); \ - k_b1121 = k - off_b1121; \ -\ - a1 = a_cast; \ - c11 = c1; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ + in A. Then compute the length of that panel. */ + dim_t off_b1121 = bli_max( -diagoffb_j, 0 ); + dim_t k_b1121 = k - off_b1121; + + const char* a1 = a_cast; + char* c11 = c1; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - { \ + and trmm3. */ + { /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_b_cur = k_b1121 * PACKNR; \ - is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ - ps_b_cur = is_b_cur; \ -\ - if ( bli_trmm_my_iter_rr( j, thread ) ) { \ -\ - /* Loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ -\ - ctype* restrict a1_i; \ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - a1_i = a1 + off_b1121 * PACKMR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + intersecting micro-panel. */ + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + if ( bli_trmm_my_iter_rr( j, thread ) ) { + + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trmm_my_iter_rr( i, caucus ) ) { + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + const char* a1_i = a1 + off_b1121 * PACKMR; + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_b1121, \ - alpha_cast, \ - a1_i, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ - } \ -\ - b1 += ps_b_cur; \ - } \ -\ - c1 += cstep_c; \ - } \ -\ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ -} + object. 
*/ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k_b1121, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + + a1 += rstep_a; + c11 += rstep_c; + } + } -INSERT_GENTFUNC_BASIC0( trmm_rl_ker_var2 ) + b1 += ps_b_cur; + } + + c1 += cstep_c; + } + +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: a1", MR, k_b1121, a1, 1, MR, "%4.1f", "" );*/ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_rl_ker_var2: b1", k_b1121, NR, b1_i, NR, 1, "%4.1f", "" );*/ +} diff --git a/frame/3/trmm/bli_trmm_ru_ker_var2.c b/frame/3/trmm/bli_trmm_ru_ker_var2.c index 4ed38e7610..d4dbe1d5a6 100644 --- a/frame/3/trmm/bli_trmm_ru_ker_var2.c +++ b/frame/3/trmm/bli_trmm_ru_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffb, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trmm_ru_ker_var2); - - void bli_trmm_ru_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + doff_t diagoffb = bli_obj_diag_offset( b ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = 
bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -105,89 +82,23 @@ void bli_trmm_ru_ker_var2 const void* buf_alpha = bli_obj_internal_scalar_buffer( &scalar_b ); const void* buf_beta = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffb, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} - + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ /* Query the context for the micro-kernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - ctype* restrict one = PASTEMAC(ch,1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha_cast = alpha; \ - ctype* restrict beta_cast = beta; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffb_j; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_b0111; \ - dim_t off_b0111; \ - dim_t i, j, jb0; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_b_cur; \ - inc_t is_b_cur; \ - auxinfo_t aux; \ -\ + function pointer type. */ + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha_cast = buf_alpha; + const char* beta_cast = buf_beta; + /* Assumptions/assertions: rs_a == 1 @@ -200,312 +111,288 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. */ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current panel of B is entirely below its diagonal, - it is implicitly zero. So we do nothing. 
*/ \ - if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ -\ + it is implicitly zero. So we do nothing. */ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; + /* Compute k_full. For all trmm, k_full is simply k. This is needed because some parameter combinations of trmm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of A (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = k; \ -\ + this unreduced value of k. */ + dim_t k_full = k; + /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache - blocksizes are each a multiple of MR and NR.) */ \ - if ( diagoffb > 0 ) \ - { \ - j = diagoffb; \ - n = n - j; \ - diagoffb = 0; \ - c_cast = c_cast + (j )*cs_c; \ - } \ -\ + blocksizes are each a multiple of MR and NR.) */ + if ( diagoffb > 0 ) + { + n -= diagoffb; + c_cast += diagoffb * cs_c * dt_size; + diagoffb = 0; + } + /* If there is a zero region below where the diagonal of B intersects the right side of the block, shrink it to prevent "no-op" iterations from - executing. */ \ - if ( -diagoffb + n < k ) \ - { \ - k = -diagoffb + n; \ - } \ -\ + executing. */ + if ( -diagoffb + n < k ) + { + k = -diagoffb + n; + } + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. 
*/ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k_full; \ - istep_b = PACKNR * k; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of A to the auxinfo_t object. */ \ - bli_auxinfo_set_is_a( istep_a, &aux ); \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + /* Save the pack schemas of A and B to the auxinfo_t object. */ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + /* The 'thread' argument points to the thrinfo_t node for the 2nd (jr) loop around the microkernel. Here we query the thrinfo_t node for the - 1st (ir) loop around the microkernel. */ \ - thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ - dim_t ir_nt = bli_thread_n_way( caucus ); \ - dim_t ir_tid = bli_thread_work_id( caucus ); \ -\ - dim_t jr_start, jr_end; \ - dim_t ir_start, ir_end; \ - dim_t jr_inc, ir_inc; \ -\ + 1st (ir) loop around the microkernel. */ + thrinfo_t* caucus = bli_thrinfo_sub_node( thread ); + + /* Query the number of threads and thread ids for each loop. 
*/ + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + dim_t ir_nt = bli_thread_n_way( caucus ); + dim_t ir_tid = bli_thread_work_id( caucus ); + + dim_t jr_start, jr_end; + dim_t ir_start, ir_end; + dim_t jr_inc, ir_inc; + /* Note that we partition the 2nd loop into two regions: the triangular - part of C, and the rectangular portion. */ \ - dim_t n_iter_tri; \ - dim_t n_iter_rct; \ -\ - if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) \ - { \ + part of C, and the rectangular portion. */ + dim_t n_iter_tri; + dim_t n_iter_rct; + + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) + { /* If the entire panel of B does not intersect the diagonal, there is no triangular region, and therefore we can skip the first set of - loops. */ \ - n_iter_tri = 0; \ - n_iter_rct = n_iter; \ - } \ - else \ - { \ + loops. */ + n_iter_tri = 0; + n_iter_rct = n_iter; + } + else + { /* If the panel of B does intersect the diagonal, compute the number of iterations in the triangular (or trapezoidal) region by dividing NR into the number of rows in B. (There should never be any remainder in this division.) The number of iterations in the rectangular region - is computed as the remaining number of iterations in the n dimension. */ \ - n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); \ - n_iter_rct = n_iter - n_iter_tri; \ - } \ -\ + is computed as the remaining number of iterations in the n dimension. */ + n_iter_tri = ( k + diagoffb ) / NR + ( ( k + diagoffb ) % NR ? 1 : 0 ); + n_iter_rct = n_iter - n_iter_tri; + } + /* Use round-robin assignment of micropanels to threads in the 2nd and 1st loops for the initial triangular region of B (if it exists). NOTE: We don't need to call bli_thread_range_jrir_rr() here since we employ a hack that calls for each thread to execute every iteration of the jr and ir loops but skip all but the pointer increment for - iterations that are not assigned to it. 
*/ \ -\ - b1 = b_cast; \ - c1 = c_cast; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter_tri; ++j ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - diagoffb_j = diagoffb - ( doff_t )j*NR; \ -\ + iterations that are not assigned to it. */ + + const char* b1 = b_cast; + char* c1 = c_cast; + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = 0; j < n_iter_tri; ++j ) + { + doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + /* Determine the offset to and length of the panel that was packed - so we can index into the corresponding location in A. */ \ - off_b0111 = 0; \ - k_b0111 = bli_min( k, -diagoffb_j + NR ); \ -\ - a1 = a_cast; \ - c11 = c1; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ + so we can index into the corresponding location in A. */ + dim_t off_b0111 = 0; + dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); + + const char* a1 = a_cast; + char* c11 = c1; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - { \ + and trmm3. */ + { /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_b_cur = k_b0111 * PACKNR; \ - is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ - ps_b_cur = is_b_cur; \ -\ - if ( bli_trmm_my_iter_rr( j, thread ) ) { \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trmm_my_iter_rr( i, caucus ) ) { \ -\ - ctype* restrict a1_i; \ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? 
MR : m_left ); \ -\ - a1_i = a1 + off_b0111 * PACKMR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + intersecting micro-panel. */ + inc_t ps_b_cur = k_b0111 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + if ( bli_trmm_my_iter_rr( j, thread ) ) { + + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trmm_my_iter_rr( i, caucus ) ) { + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + const char* a1_i = a1 + off_b0111 * PACKMR; + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter_rr( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_b0111, \ - alpha_cast, \ - a1_i, \ - b1, \ - beta_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ - } \ -\ - b1 += ps_b_cur; \ - } \ -\ - c1 += cstep_c; \ - } \ -\ - /* If there is no rectangular region, then we're done. */ \ - if ( n_iter_rct == 0 ) return; \ -\ + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. 
*/ + gemm_ukr + ( + m_cur, + n_cur, + k_b0111, + ( void* )alpha_cast, + ( void* )a1_i, + ( void* )b1, + ( void* )beta_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + + a1 += rstep_a; + c11 += rstep_c; + } + } + + b1 += ps_b_cur; + } + + c1 += cstep_c; + } + + /* If there is no rectangular region, then we're done. */ + if ( n_iter_rct == 0 ) return; + /* Determine the thread range and increment for the 2nd and 1st loops for the remaining rectangular region of B. NOTE: The definition of bli_thread_range_jrir() will depend on whether - slab or round-robin partitioning was requested at configure-time. \ - NOTE: Parallelism in the 1st loop is disabled for now. */ \ - bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ - bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); \ -\ + slab or round-robin partitioning was requested at configure-time. + NOTE: Parallelism in the 1st loop is disabled for now. */ + bli_thread_range_jrir( thread, n_iter_rct, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + bli_thread_range_jrir( caucus, m_iter, 1, FALSE, &ir_start, &ir_end, &ir_inc ); + /* Advance the start and end iteration offsets for the rectangular region - by the number of iterations used for the triangular region. */ \ - jr_start += n_iter_tri; \ - jr_end += n_iter_tri; \ - jb0 = n_iter_tri; \ -\ + by the number of iterations used for the triangular region. */ + jr_start += n_iter_tri; + jr_end += n_iter_tri; + dim_t jb0 = n_iter_tri; + /* Save the resulting value of b1 from the previous loop since it represents - the starting point for the rectangular region. */ \ - b_cast = b1; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ + the starting point for the rectangular region. */ + b_cast = b1; + + /* Loop over the n dimension (NR columns at a time). 
*/ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { /* NOTE: We must index through b_cast differently since it contains the starting address of the rectangular region (which is already - n_iter_tri logical iterations through B). */ \ - b1 = b_cast + (j-jb0) * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ + n_iter_tri logical iterations through B). */ + b1 = b_cast + (j-jb0) * cstep_b; + c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + /* If the current panel of B intersects the diagonal, scale C by beta. If it is strictly below the diagonal, scale by one. This allows the current macro-kernel to work for both trmm - and trmm3. */ \ - { \ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = ir_start; i < ir_end; i += ir_inc ) \ - { \ - ctype* restrict a2; \ -\ - a1 = a_cast + i * rstep_a; \ - c11 = c1 + i * rstep_c; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); \ - if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) \ - { \ - a2 = a_cast; \ - b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + and trmm3. */ + { + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = ir_start; i < ir_end; i += ir_inc ) + { + const char* a1 = a_cast + i * rstep_a; + char* c11 = c1 + i * rstep_c; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. 
*/ + const char* a2 = bli_trmm_get_next_a_upanel( a1, rstep_a, ir_inc ); + if ( bli_is_last_iter( i, m_iter, ir_tid, ir_nt ) ) + { + a2 = a_cast; + b2 = bli_trmm_get_next_b_upanel( b1, cstep_b, jr_inc ); + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - alpha_cast, \ - a1, \ - b1, \ - one, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ -\ -\ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ \ -/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ \ -} + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )alpha_cast, + ( void* )a1, + ( void* )b1, + ( void* )one, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } -INSERT_GENTFUNC_BASIC0( trmm_ru_ker_var2 ) + + +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: a1", MR, k_b0111, a1, 1, MR, "%4.1f", "" );*/ +/*PASTEMAC(ch,fprintm)( stdout, "trmm_ru_ker_var2: b1", k_b0111, NR, b1_i, NR, 1, "%4.1f", "" );*/ +} diff --git a/frame/3/trmm/bli_trmm_var.h b/frame/3/trmm/bli_trmm_var.h index 2f0642ca8f..f8c3d7ee20 100644 --- a/frame/3/trmm/bli_trmm_var.h +++ b/frame/3/trmm/bli_trmm_var.h @@ -47,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -87,7 +86,6 @@ void PASTEMAC(ch,varname) \ void* beta, \ void* c, inc_t rs_c, inc_t cs_c, \ cntx_t* cntx, \ - rntm_t* rntm, \ thrinfo_t* thread \ ); diff --git a/frame/3/trmm/bli_trmm_xx_ker_var2.c b/frame/3/trmm/bli_trmm_xx_ker_var2.c index 
d42bc88c2d..efbd67dc72 100644 --- a/frame/3/trmm/bli_trmm_xx_ker_var2.c +++ b/frame/3/trmm/bli_trmm_xx_ker_var2.c @@ -47,8 +47,7 @@ void bli_trmm_xx_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -82,7 +81,6 @@ void bli_trmm_xx_ker_var2 b, c, cntx, - rntm, cntl, thread ); diff --git a/frame/3/trmm3/bli_trmm3_front.c b/frame/3/trmm3/bli_trmm3_front.c index 9681eb6406..022e3da354 100644 --- a/frame/3/trmm3/bli_trmm3_front.c +++ b/frame/3/trmm3/bli_trmm3_front.c @@ -44,7 +44,7 @@ void bli_trmm3_front const obj_t* c, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trmm3/bli_trmm3_front.h b/frame/3/trmm3/bli_trmm3_front.h index b5dde34cd0..dcaa4d0ee3 100644 --- a/frame/3/trmm3/bli_trmm3_front.h +++ b/frame/3/trmm3/bli_trmm3_front.h @@ -41,6 +41,5 @@ void bli_trmm3_front const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); diff --git a/frame/3/trsm/bli_trsm_blk_var1.c b/frame/3/trsm/bli_trsm_blk_var1.c index 915fe3e59a..fd65076450 100644 --- a/frame/3/trsm/bli_trsm_blk_var1.c +++ b/frame/3/trsm/bli_trsm_blk_var1.c @@ -43,8 +43,7 @@ void bli_trsm_blk_var1 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -81,7 +80,7 @@ void bli_trsm_blk_var1 for ( dim_t i = my_start; i < my_end; i += b_alg ) { b_alg = bli_determine_blocksize( direct, i, my_end, &a11, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_part( cntl ), cntx ); // Acquire partitions for A1 and C1. obj_t a11_1, c1_1; @@ -105,7 +104,6 @@ void bli_trsm_blk_var1 &BLIS_ONE, &c1_1, cntx, - rntm, bli_cntl_sub_prenode( cntl ), bli_thrinfo_sub_prenode( thread ) ); @@ -152,7 +150,7 @@ void bli_trsm_blk_var1 { // Determine the current algorithmic blocksize. 
b_alg = bli_determine_blocksize( direct, i, my_end, &ax1, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_part( cntl ), cntx ); // Acquire partitions for A1 and C1. obj_t a11, c1; @@ -177,7 +175,6 @@ void bli_trsm_blk_var1 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/trsm/bli_trsm_blk_var2.c b/frame/3/trsm/bli_trsm_blk_var2.c index 88db57e519..4a90dd54d8 100644 --- a/frame/3/trsm/bli_trsm_blk_var2.c +++ b/frame/3/trsm/bli_trsm_blk_var2.c @@ -41,8 +41,7 @@ void bli_trsm_blk_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -70,7 +69,7 @@ void bli_trsm_blk_var2 { // Determine the current algorithmic blocksize. b_alg = bli_determine_blocksize( direct, i, my_end, &bp, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_part( cntl ), cntx ); // Acquire partitions for B1 and C1. obj_t b1, c1; @@ -88,7 +87,6 @@ void bli_trsm_blk_var2 &BLIS_ONE, &c1, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/trsm/bli_trsm_blk_var3.c b/frame/3/trsm/bli_trsm_blk_var3.c index 2ff3db6f1d..9e145263b5 100644 --- a/frame/3/trsm/bli_trsm_blk_var3.c +++ b/frame/3/trsm/bli_trsm_blk_var3.c @@ -40,8 +40,7 @@ void bli_trsm_blk_var3 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -65,7 +64,7 @@ void bli_trsm_blk_var3 { // Determine the current algorithmic blocksize. b_alg = bli_trsm_determine_kc( direct, i, k_trans, &ap, &bp, - bli_cntl_bszid( cntl ), cntx ); + bli_cntl_part( cntl ), cntx ); // Acquire partitions for A1 and B1. 
obj_t a1, b1; @@ -83,7 +82,6 @@ void bli_trsm_blk_var3 &BLIS_ONE, &cs, cntx, - rntm, bli_cntl_sub_node( cntl ), bli_thrinfo_sub_node( thread ) ); diff --git a/frame/3/trsm/bli_trsm_cntl.c b/frame/3/trsm/bli_trsm_cntl.c index 0a3be87f74..1f5c4cac51 100644 --- a/frame/3/trsm/bli_trsm_cntl.c +++ b/frame/3/trsm/bli_trsm_cntl.c @@ -33,293 +33,378 @@ */ +#include "bli_type_defs.h" #include "blis.h" cntl_t* bli_trsm_cntl_create ( - rntm_t* rntm, - side_t side, - pack_t schema_a, - pack_t schema_b, - void_fp ker + trsm_cntl_t* cntl, + side_t side, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx ) { if ( bli_is_left( side ) ) - return bli_trsm_l_cntl_create( rntm, schema_a, schema_b, ker ); + return bli_trsm_l_cntl_create( cntl, a, b, c, cntx ); else - return bli_trsm_r_cntl_create( rntm, schema_a, schema_b, ker ); + return bli_trsm_r_cntl_create( cntl, a, b, c, cntx ); } cntl_t* bli_trsm_l_cntl_create ( - rntm_t* rntm, - pack_t schema_a, - pack_t schema_b, - void_fp ker + trsm_cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx ) { - void_fp macro_kernel_p; - - // Set the default macrokernel. If a non-NULL kernel function pointer is - // passed in, we use that instead. - macro_kernel_p = bli_trsm_xx_ker_var2; - if ( ker ) macro_kernel_p = ker; - - const opid_t family = BLIS_TRSM; - - // - // Create nodes for packing A and the macro-kernel (gemm branch). - // - - cntl_t* gemm_cntl_bu_ke = bli_trsm_cntl_create_node - ( - rntm, // the thread's runtime structure - family, // the operation family - BLIS_MR, // needed for bli_thrinfo_rgrow() - NULL, // variant function pointer not used - NULL // no sub-node; this is the leaf of the tree. 
- ); + num_t dt_comp = bli_obj_comp_dt( c ); + void_fp ker_fp = bli_obj_ker_fn( c ); + void_fp packa_fp = bli_obj_ker_fn( a ); + void_fp packb_fp = bli_obj_ker_fn( b ); + const gemm_params_t* ker_params = bli_obj_ker_params( c ); + const packm_params_t* packa_params = bli_obj_ker_params( a ); + const packm_params_t* packb_params = bli_obj_ker_params( b ); + + // TODO: enable customization + if ( ker_fp != NULL || + packa_fp != NULL || + packb_fp != NULL || + ker_params != NULL || + packa_params != NULL || + packb_params != NULL ) + { + bli_abort(); + } + + ker_fp = bli_trsm_xx_ker_var2; + packa_fp = bli_l3_packa; + packb_fp = bli_l3_packb; + + cntl->ker_params.mr = *bli_cntx_get_blksz( BLIS_MR, cntx ); + cntl->ker_params.nr = *bli_cntx_get_blksz( BLIS_NR, cntx ); + cntl->ker_params.ukr = bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMM_UKR, cntx ); + ker_params = &cntl->ker_params; + + cntl->gemm_packa_params.mr = ker_params->mr; + cntl->gemm_packa_params.kr = ker_params->mr; + cntl->gemm_packa_params.does_invert_diag = FALSE; + cntl->gemm_packa_params.rev_iter_if_upper = TRUE; + cntl->gemm_packa_params.rev_iter_if_lower = FALSE; + cntl->gemm_packa_params.pack_schema = BLIS_PACKED_ROW_PANELS; + cntl->gemm_packa_params.pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; + void* gemm_packa_params = &cntl->gemm_packa_params; + + cntl->trsm_packa_params.mr = ker_params->mr; + cntl->trsm_packa_params.kr = ker_params->mr; +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + cntl->trsm_packa_params.does_invert_diag = TRUE; +#else + cntl->trsm_packa_params.does_invert_diag = FALSE; +#endif + cntl->trsm_packa_params.rev_iter_if_upper = TRUE; + cntl->trsm_packa_params.rev_iter_if_lower = FALSE; + cntl->trsm_packa_params.pack_schema = BLIS_PACKED_ROW_PANELS; + cntl->trsm_packa_params.pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; + void* trsm_packa_params = &cntl->gemm_packa_params; + + cntl->packb_params.mr = ker_params->nr; + cntl->packb_params.kr = ker_params->mr; + cntl->packb_params.does_invert_diag = FALSE; 
+ cntl->packb_params.rev_iter_if_upper = FALSE; + cntl->packb_params.rev_iter_if_lower = FALSE; + cntl->packb_params.pack_schema = BLIS_PACKED_COL_PANELS; + cntl->packb_params.pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; + packb_params = &cntl->packb_params; + + cntl->mc.blksz = *bli_cntx_get_blksz( BLIS_MC, cntx ); + cntl->nc.blksz = *bli_cntx_get_blksz( BLIS_NC, cntx ); + cntl->kc.blksz = *bli_cntx_get_blksz( BLIS_KC, cntx ); + cntl->mc.bmult = ker_params->mr; + cntl->nc.bmult = ker_params->nr; + cntl->kc.bmult = packa_params->kr; + + bli_align_blksz_to_mult( &cntl->mc.blksz, &cntl->mc.bmult ); + bli_align_blksz_to_mult( &cntl->nc.blksz, &cntl->nc.bmult ); + bli_l3_adjust_kc( a, b, &ker_params->mr, &ker_params->nr, &cntl->kc.blksz, BLIS_TRSM ); - cntl_t* gemm_cntl_bp_bu = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_p, - gemm_cntl_bu_ke + // Create two nodes for the macro-kernel. + bli_cntl_initialize_node + ( + &cntl->gemm_loop1, + BLIS_TRSM, // the operation family + BLIS_MR, // used for thread partitioning + NULL, // variant function pointer not used + NULL, // not used + NULL, // no sub-prenode; this is the leaf of the tree. + NULL // no sub-node; this is the leaf of the tree. + ); + + bli_cntl_initialize_node + ( + &cntl->gemm_loop2, + BLIS_TRSM, + BLIS_NR, + ker_fp, + ker_params, + NULL, + &cntl->gemm_loop1 ); // Create a node for packing matrix A. - cntl_t* gemm_cntl_packa = bli_packm_cntl_create_node - ( - rntm, - bli_l3_packa, // trsm operation's packm function for A. - BLIS_MR, - BLIS_MR, - FALSE, // do NOT invert diagonal - TRUE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_a, // normally BLIS_PACKED_ROW_PANELS - BLIS_BUFFER_FOR_A_BLOCK, - gemm_cntl_bp_bu - ); - - // - // Create nodes for packing A and the macro-kernel (trsm branch). 
- // - - cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node - ( - rntm, // the thread's runtime structure - family, // the operation family - BLIS_MR, // needed for bli_thrinfo_rgrow() - NULL, // variant function pointer not used - NULL // no sub-node; this is the leaf of the tree. + bli_cntl_initialize_node + ( + &cntl->gemm_packa, + BLIS_TRSM, + BLIS_NO_PART, + packa_fp, + gemm_packa_params, + NULL, + &cntl->gemm_loop2 ); - cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_p, - trsm_cntl_bu_ke + // Create two nodes for the macro-kernel. + bli_cntl_initialize_node + ( + &cntl->trsm_loop1, + BLIS_TRSM, // the operation family + BLIS_MR, // used for thread partitioning + NULL, // variant function pointer not used + NULL, // not used + NULL, // no sub-prenode; this is the leaf of the tree. + NULL // no sub-node; this is the leaf of the tree. + ); + + bli_cntl_initialize_node + ( + &cntl->trsm_loop2, + BLIS_TRSM, + BLIS_NR, + ker_fp, + ker_params, + NULL, + &cntl->trsm_loop1 ); // Create a node for packing matrix A. - cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node - ( - rntm, - bli_l3_packa, // trsm operation's packm function for A. - BLIS_MR, - BLIS_MR, -#ifdef BLIS_ENABLE_TRSM_PREINVERSION - TRUE, // invert diagonal -#else - FALSE, // do NOT invert diagonal -#endif - TRUE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_a, // normally BLIS_PACKED_ROW_PANELS - BLIS_BUFFER_FOR_A_BLOCK, - trsm_cntl_bp_bu + bli_cntl_initialize_node + ( + &cntl->trsm_packa, + BLIS_TRSM, + BLIS_NO_PART, + packa_fp, + trsm_packa_params, + NULL, + &cntl->trsm_loop2 ); - // ------------------------------------------------------------------------- - // Create a node for partitioning the m dimension by MC. - // NOTE: We attach the gemm sub-tree as the main branch. 
- cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_MC, - bli_trsm_blk_var1, - gemm_cntl_packa + bli_cntl_initialize_node + ( + &cntl->loop3, + BLIS_TRSM, + BLIS_MC, + bli_gemm_blk_var1, + &cntl->mc, + &cntl->trsm_packa, + &cntl->gemm_packa ); - // Attach the trsm sub-tree as the auxiliary "prenode" branch. - bli_cntl_set_sub_prenode( trsm_cntl_packa, trsm_cntl_op_bp ); - - // ------------------------------------------------------------------------- - // Create a node for packing matrix B. - cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node - ( - rntm, - bli_l3_packb, - BLIS_NR, - BLIS_MR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_b, // normally BLIS_PACKED_COL_PANELS - BLIS_BUFFER_FOR_B_PANEL, - trsm_cntl_op_bp + bli_cntl_initialize_node + ( + &cntl->packb, + BLIS_TRSM, + BLIS_NO_PART, + packb_fp, + packb_params, + NULL, + &cntl->loop3 ); // Create a node for partitioning the k dimension by KC. - cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_KC, - bli_trsm_blk_var3, - trsm_cntl_packb + bli_cntl_initialize_node + ( + &cntl->loop4, + BLIS_TRSM, + BLIS_KC, + bli_gemm_blk_var3, + &cntl->kc, + NULL, + &cntl->packb ); // Create a node for partitioning the n dimension by NC. - cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_NC, - bli_trsm_blk_var2, - trsm_cntl_mm_op + bli_cntl_initialize_node + ( + &cntl->loop5, + BLIS_TRSM, + BLIS_NC, + bli_gemm_blk_var2, + &cntl->nc, + NULL, + &cntl->loop4 ); - return trsm_cntl_vl_mm; + return &cntl->loop5; } cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, - pack_t schema_a, - pack_t schema_b, - void_fp ker + trsm_cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx ) { - // NOTE: trsm macrokernels are presently disabled for right-side execution. - // Set the default macrokernel. 
If a non-NULL kernel function pointer is - // passed in, we use that instead. - void_fp macro_kernel_p = bli_trsm_xx_ker_var2; - if ( ker ) macro_kernel_p = ker; - - const opid_t family = BLIS_TRSM; + num_t dt_comp = bli_obj_comp_dt( c ); + void_fp ker_fp = bli_obj_ker_fn( c ); + void_fp packa_fp = bli_obj_ker_fn( a ); + void_fp packb_fp = bli_obj_ker_fn( b ); + const gemm_params_t* ker_params = bli_obj_ker_params( c ); + const packm_params_t* packa_params = bli_obj_ker_params( a ); + const packm_params_t* packb_params = bli_obj_ker_params( b ); + + // TODO: enable customization + if ( ker_fp != NULL || + packa_fp != NULL || + packb_fp != NULL || + ker_params != NULL || + packa_params != NULL || + packb_params != NULL ) + { + bli_abort(); + } + + ker_fp = bli_trsm_xx_ker_var2; + packa_fp = bli_l3_packa; + packb_fp = bli_l3_packb; + + cntl->ker_params.mr = *bli_cntx_get_blksz( BLIS_MR, cntx ); + cntl->ker_params.nr = *bli_cntx_get_blksz( BLIS_NR, cntx ); + cntl->ker_params.ukr = bli_cntx_get_ukr_dt( dt_comp, BLIS_GEMM_UKR, cntx ); + ker_params = &cntl->ker_params; + + cntl->trsm_packa_params.mr = ker_params->nr; + cntl->trsm_packa_params.kr = ker_params->mr; + cntl->trsm_packa_params.does_invert_diag = FALSE; + cntl->trsm_packa_params.rev_iter_if_upper = FALSE; + cntl->trsm_packa_params.rev_iter_if_lower = FALSE; + cntl->trsm_packa_params.pack_schema = BLIS_PACKED_ROW_PANELS; + cntl->trsm_packa_params.pack_buf_type = BLIS_BUFFER_FOR_A_BLOCK; + packa_params = &cntl->gemm_packa_params; + + cntl->packb_params.mr = ker_params->mr; + cntl->packb_params.kr = ker_params->mr; +#ifdef BLIS_ENABLE_TRSM_PREINVERSION + cntl->packb_params.does_invert_diag = TRUE; +#else + cntl->trsm_packa_params.does_invert_diag = FALSE; +#endif + cntl->packb_params.rev_iter_if_upper = TRUE; + cntl->packb_params.rev_iter_if_lower = FALSE; + cntl->packb_params.pack_schema = BLIS_PACKED_COL_PANELS; + cntl->packb_params.pack_buf_type = BLIS_BUFFER_FOR_B_PANEL; + packb_params = &cntl->packb_params; 
+ + cntl->mc.blksz = *bli_cntx_get_blksz( BLIS_MC, cntx ); + cntl->nc.blksz = *bli_cntx_get_blksz( BLIS_NC, cntx ); + cntl->kc.blksz = *bli_cntx_get_blksz( BLIS_KC, cntx ); + cntl->mc.bmult = ker_params->mr; + cntl->nc.bmult = ker_params->nr; + cntl->kc.bmult = packa_params->kr; + + bli_align_blksz_to_mult( &cntl->mc.blksz, &cntl->mc.bmult ); + bli_align_blksz_to_mult( &cntl->nc.blksz, &cntl->nc.bmult ); + bli_l3_adjust_kc( a, b, &ker_params->mr, &ker_params->nr, &cntl->kc.blksz, BLIS_TRSM ); // Create two nodes for the macro-kernel. - cntl_t* trsm_cntl_bu_ke = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_MR, // needed for bli_thrinfo_rgrow() - NULL, // variant function pointer not used - NULL // no sub-node; this is the leaf of the tree. - ); - - cntl_t* trsm_cntl_bp_bu = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_NR, // not used by macro-kernel, but needed for bli_thrinfo_rgrow() - macro_kernel_p, - trsm_cntl_bu_ke + bli_cntl_initialize_node + ( + &cntl->trsm_loop1, + BLIS_TRSM, // the operation family + BLIS_MR, // used for thread partitioning + NULL, // variant function pointer not used + NULL, // not used + NULL, // no sub-prenode; this is the leaf of the tree. + NULL // no sub-node; this is the leaf of the tree. + ); + + bli_cntl_initialize_node + ( + &cntl->trsm_loop2, + BLIS_TRSM, + BLIS_NR, + ker_fp, + ker_params, + NULL, + &cntl->trsm_loop1 ); // Create a node for packing matrix A. - cntl_t* trsm_cntl_packa = bli_packm_cntl_create_node - ( - rntm, - bli_l3_packa, - BLIS_NR, - BLIS_MR, - FALSE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - FALSE, // reverse iteration if lower? - schema_a, // normally BLIS_PACKED_ROW_PANELS - BLIS_BUFFER_FOR_A_BLOCK, - trsm_cntl_bp_bu + bli_cntl_initialize_node + ( + &cntl->trsm_packa, + BLIS_TRSM, + BLIS_NO_PART, + packa_fp, + packa_params, + NULL, + &cntl->trsm_loop2 ); // Create a node for partitioning the m dimension by MC. 
- cntl_t* trsm_cntl_op_bp = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_MC, - bli_trsm_blk_var1, - trsm_cntl_packa + bli_cntl_initialize_node + ( + &cntl->loop3, + BLIS_TRSM, + BLIS_MC, + bli_gemm_blk_var1, + &cntl->mc, + NULL, + &cntl->trsm_packa ); // Create a node for packing matrix B. - cntl_t* trsm_cntl_packb = bli_packm_cntl_create_node - ( - rntm, - bli_l3_packb, - BLIS_MR, - BLIS_MR, - TRUE, // do NOT invert diagonal - FALSE, // reverse iteration if upper? - TRUE, // reverse iteration if lower? - schema_b, // normally BLIS_PACKED_COL_PANELS - BLIS_BUFFER_FOR_B_PANEL, - trsm_cntl_op_bp + bli_cntl_initialize_node + ( + &cntl->packb, + BLIS_TRSM, + BLIS_NO_PART, + packb_fp, + packb_params, + NULL, + &cntl->loop3 ); // Create a node for partitioning the k dimension by KC. - cntl_t* trsm_cntl_mm_op = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_KC, - bli_trsm_blk_var3, - trsm_cntl_packb + bli_cntl_initialize_node + ( + &cntl->loop4, + BLIS_TRSM, + BLIS_KC, + bli_gemm_blk_var3, + &cntl->kc, + NULL, + &cntl->packb ); // Create a node for partitioning the n dimension by NC. 
- cntl_t* trsm_cntl_vl_mm = bli_trsm_cntl_create_node - ( - rntm, - family, - BLIS_NC, - bli_trsm_blk_var2, - trsm_cntl_mm_op + bli_cntl_initialize_node + ( + &cntl->loop5, + BLIS_TRSM, + BLIS_NC, + bli_gemm_blk_var2, + &cntl->nc, + NULL, + &cntl->loop4 ); - return trsm_cntl_vl_mm; -} - -void bli_trsm_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - bli_cntl_free( rntm, cntl, thread ); -} - -// ----------------------------------------------------------------------------- - -cntl_t* bli_trsm_cntl_create_node - ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - cntl_t* sub_node - ) -{ - return bli_cntl_create_node( rntm, family, bszid, var_func, NULL, sub_node ); + return &cntl->loop5; } diff --git a/frame/3/trsm/bli_trsm_cntl.h b/frame/3/trsm/bli_trsm_cntl.h index 86f4a29b2a..ebf4da50c4 100644 --- a/frame/3/trsm/bli_trsm_cntl.h +++ b/frame/3/trsm/bli_trsm_cntl.h @@ -33,46 +33,52 @@ */ +typedef struct +{ + cntl_t loop5; + cntl_t loop4; + cntl_t packb; + cntl_t loop3; + cntl_t gemm_packa; + cntl_t gemm_loop2; + cntl_t gemm_loop1; + cntl_t trsm_packa; + cntl_t trsm_loop2; + cntl_t trsm_loop1; + part_params_t nc; + part_params_t kc; + part_params_t mc; + packm_params_t gemm_packa_params; + packm_params_t trsm_packa_params; + packm_params_t packb_params; + gemm_params_t ker_params; +} trsm_cntl_t; + cntl_t* bli_trsm_cntl_create ( - rntm_t* rntm, - side_t side, - pack_t schema_a, - pack_t schema_b, - void_fp ker + trsm_cntl_t* cntl, + side_t side, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx ); cntl_t* bli_trsm_l_cntl_create ( - rntm_t* rntm, - pack_t schema_a, - pack_t schema_b, - void_fp ker + trsm_cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx ); cntl_t* bli_trsm_r_cntl_create ( - rntm_t* rntm, - pack_t schema_a, - pack_t schema_b, - void_fp ker - ); - -void bli_trsm_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -// 
----------------------------------------------------------------------------- - -cntl_t* bli_trsm_cntl_create_node - ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - cntl_t* sub_node + trsm_cntl_t* cntl, + const obj_t* a, + const obj_t* b, + const obj_t* c, + const cntx_t* cntx ); diff --git a/frame/3/trsm/bli_trsm_front.c b/frame/3/trsm/bli_trsm_front.c index b94a129d99..9af9b1381b 100644 --- a/frame/3/trsm/bli_trsm_front.c +++ b/frame/3/trsm/bli_trsm_front.c @@ -43,7 +43,7 @@ void bli_trsm_front const obj_t* b, const cntx_t* cntx, rntm_t* rntm, - cntl_t* cntl + const cntl_t* cntl ) { bli_init_once(); diff --git a/frame/3/trsm/bli_trsm_front.h b/frame/3/trsm/bli_trsm_front.h index b31e88b041..dacfd19e95 100644 --- a/frame/3/trsm/bli_trsm_front.h +++ b/frame/3/trsm/bli_trsm_front.h @@ -40,8 +40,7 @@ void bli_trsm_front const obj_t* a, const obj_t* b, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + rntm_t* rntm ); #ifdef BLIS_ENABLE_SMALL_MATRIX diff --git a/frame/3/trsm/bli_trsm_ll_ker_var2.c b/frame/3/trsm/bli_trsm_ll_ker_var2.c index 075b403362..34debefd3b 100644 --- a/frame/3/trsm/bli_trsm_ll_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ll_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffa, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha1, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* alpha2, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trsm_ll_ker_var2); - - void bli_trsm_ll_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffa = 
bli_obj_diag_offset( a ); + doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -110,105 +87,23 @@ void bli_trsm_ll_ker_var2 // packing. const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffa, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha1, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_alpha2, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + /* Cast the micro-kernel address to its function pointer type. 
*/ + gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* minus_one = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha1_cast = buf_alpha1; + const char* alpha2_cast = buf_alpha2; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Cast the micro-kernel address to its function pointer type. */ \ - PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ -/* - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -*/ \ -\ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha1_cast = alpha1; \ - ctype* restrict alpha2_cast = alpha2; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffa_i; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_a1011; \ - dim_t k_a10; \ - dim_t off_a10; \ - dim_t off_a11; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_a_cur; \ - inc_t is_a_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -221,262 +116,227 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. */ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If matrix A is above the diagonal, it is implicitly zero. - So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; \ -\ + So we do nothing. */ + if ( bli_is_strictly_above_diag_n( diagoffa, m, k ) ) return; + /* Compute k_full as k inflated up to a multiple of MR. 
This is needed because some parameter combinations of trsm reduce k to advance past zero regions in the triangular matrix, and when computing the imaginary stride of B (the non-triangular matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ -\ + this unreduced value of k. */ + if ( k % MR != 0 ) k += MR - ( k % MR ); + /* If there is a zero region above where the diagonal of A intersects the left edge of the block, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache blocksizes are each a - multiple of MR and NR.) */ \ - if ( diagoffa < 0 ) \ - { \ - i = -diagoffa; \ - m = m - i; \ - diagoffa = 0; \ - c_cast = c_cast + (i )*rs_c; \ - } \ -\ - /* Check the k dimension, which needs to be a multiple of MR. If k - isn't a multiple of MR, we adjust it higher to satisfy the micro- - kernel, which is expecting to perform an MR x MR triangular solve. - This adjustment of k is consistent with what happened when A was - packed: all of its bottom/right edges were zero-padded, and - furthermore, the panel that stores the bottom-right corner of the - matrix has its diagonal extended into the zero-padded region (as - identity). This allows the trsm of that bottom-right panel to - proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of B. */ \ - if ( k % MR != 0 ) k += MR - ( k % MR ); \ -\ + multiple of MR and NR.) 
*/ + if ( diagoffa < 0 ) + { + m += diagoffa; + c_cast -= diagoffa * rs_c * dt_size; + diagoffa = 0; + } + /* NOTE: We don't need to check that m is a multiple of PACKMR since we know that the underlying buffer was already allocated to have an m dimension that is a multiple of PACKMR, with the region between the - last row and the next multiple of MR zero-padded accordingly. */ \ -\ + last row and the next multiple of MR zero-padded accordingly. */ + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k; \ - istep_b = PACKNR * k_full; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + /* Save the pack schemas of A and B to the auxinfo_t object. 
*/ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + /* We don't bother querying the thrinfo_t node for the 1st loop because we can't parallelize that loop in trsm due to the inter-iteration - dependencies that exist. */ \ - /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ -\ - dim_t jr_start, jr_end; \ - dim_t jr_inc; \ -\ + dependencies that exist. */ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ + + /* Query the number of threads and thread ids for each loop. */ + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + + dim_t jr_start, jr_end; + dim_t jr_inc; + /* Determine the thread range and increment for the 2nd loop. NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested at configure-time. NOTE: Parallelism in the 1st loop is unattainable due to the - inter-iteration dependencies present in trsm. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - a1 = a_cast; \ - c11 = c1 + (0 )*rstep_c; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - diagoffa_i = diagoffa + ( doff_t )i*MR; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ + inter-iteration dependencies present in trsm. 
*/ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + + const char* a1 = a_cast; + char* c11 = c1 + (0 )*rstep_c; + + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + /* If the current panel of A intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of A resides below the diagonal, use a a regular gemm micro-kernel. Otherwise, if it is above the diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ \ - if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict a10; \ - ctype* restrict a11; \ - ctype* restrict b01; \ - ctype* restrict b11; \ - ctype* restrict a2; \ -\ - /* Compute various offsets into and lengths of parts of A. */ \ - off_a10 = 0; \ - k_a1011 = diagoffa_i + MR; \ - k_a10 = k_a1011 - MR; \ - off_a11 = k_a10; \ -\ + and so we do nothing. */ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + /* Compute various offsets into and lengths of parts of A. */ + dim_t off_a10 = 0; + dim_t k_a1011 = diagoffa_i + MR; + dim_t k_a10 = k_a1011 - MR; + dim_t off_a11 = k_a10; + /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_a_cur = k_a1011 * PACKMR; \ - is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ - ps_a_cur = is_a_cur; \ -\ + intersecting micro-panel. */ + inc_t ps_a_cur = k_a1011 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 
1 : 0 ); + ps_a_cur *= dt_size; + /* Compute the addresses of the panel A10 and the triangular - block A11. */ \ - a10 = a1; \ - a11 = a1 + k_a10 * PACKMR; \ - /*a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );*/ \ -\ + block A11. */ + const char* a10 = a1; + const char* a11 = a1 + k_a10 * PACKMR; + /*a11 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a10 * PACKMR, 1 );*/ + /* Compute the addresses of the panel B01 and the block - B11. */ \ - b01 = b1 + off_a10 * PACKNR; \ - b11 = b1 + off_a11 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + ps_a_cur; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + B11. */ + const char* b01 = b1 + off_a10 * PACKNR; + const char* b11 = b1 + off_a11 * PACKNR; + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1 + ps_a_cur; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - gemmtrsm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_a10, \ - alpha1_cast, \ - a10, \ - a11, \ - b01, \ - b11, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ -\ - a1 += ps_a_cur; \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict a2; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ - if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + object. 
*/ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + gemmtrsm_ukr + ( + m_cur, + n_cur, + k_a10, + ( void* )alpha1_cast, + ( void* )a10, + ( void* )a11, + ( void* )b01, + ( void* )b11, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffa_i, MR, k ) ) + { + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1 + rstep_a; + if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - minus_one, \ - a1, \ - b1, \ - alpha2_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ -\ - a1 += rstep_a; \ - } \ -\ - c11 += rstep_c; \ - } \ - } \ -\ + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. 
*/ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )minus_one, + ( void* )a1, + ( void* )b1, + ( void* )alpha2_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += rstep_a; + } + + c11 += rstep_c; + } + } + /* -PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, \ - ( double* )a11, 1, PACKMR, "%4.1f", "" ); \ -*/ \ -\ +PASTEMAC(d,fprintm)( stdout, "trsm_ll_ker_var2: a11p_r computed", MR, MR, + ( double* )a11, 1, PACKMR, "%4.1f", "" ); +*/ + /* -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); \ -*/ \ -\ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (diag)", MR, k_a1011, a1, 1, MR, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a11 (diag)", MR, MR, a11, 1, MR, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (diag)", k_a1011, NR, bp_i, NR, 1, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: bp11 (diag)", MR, NR, bp11, NR, 1, "%5.2f", "" ); +*/ + /* -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k, a1, 1, MR, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k, NR, bp, NR, 1, "%5.2f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: a1 (ndiag)", MR, k_full, a1, 1, MR, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_ll_ker_var2: b1 (ndiag)", k_full, NR, bp, NR, 1, "%5.2f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( trsm_ll_ker_var2 ) - diff --git a/frame/3/trsm/bli_trsm_lu_ker_var2.c b/frame/3/trsm/bli_trsm_lu_ker_var2.c index 799fdd1013..267a1e08da 100644 --- a/frame/3/trsm/bli_trsm_lu_ker_var2.c +++ b/frame/3/trsm/bli_trsm_lu_ker_var2.c @@ -35,50 +35,27 @@ 
#include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffa, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha1, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* alpha2, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trsm_lu_ker_var2); - - void bli_trsm_lu_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffa = bli_obj_diag_offset( a ); + doff_t diagoffa = bli_obj_diag_offset( a ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -110,106 +87,23 @@ void bli_trsm_lu_ker_var2 // packing. const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffa, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha1, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_alpha2, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; + /* Cast the micro-kernel address to its function pointer type. 
*/ + gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* minus_one = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha1_cast = buf_alpha1; + const char* alpha2_cast = buf_alpha2; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffa, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Cast the micro-kernel address to its function pointer type. */ \ - PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ -/* - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -*/ \ -\ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha1_cast = alpha1; \ - ctype* restrict alpha2_cast = alpha2; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffa_i; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_a1112; \ - dim_t k_a11; \ - dim_t k_a12; \ - dim_t off_a11; \ - dim_t off_a12; \ - dim_t i, j, ib; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_a_cur; \ - inc_t is_a_cur; \ - auxinfo_t aux; \ -\ /* Assumptions/assertions: rs_a == 1 @@ -222,49 +116,40 @@ void PASTEMAC(ch,varname) \ ps_b == stride to next micro-panel of B rs_c == (no assumptions) cs_c == (no assumptions) - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. */ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If matrix A is below the diagonal, it is implicitly zero. - So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; \ -\ - /* Compute k_full as k inflated up to a multiple of MR. 
This is - needed because some parameter combinations of trsm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = ( k % MR != 0 ? k + MR - ( k % MR ) : k ); \ -\ + So we do nothing. */ + if ( bli_is_strictly_below_diag_n( diagoffa, m, k ) ) return; + /* If there is a zero region to the left of where the diagonal of A intersects the top edge of the block, adjust the pointer to B and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to A since packm would have simply - skipped over the region that was not stored. */ \ - if ( diagoffa > 0 ) \ - { \ - i = diagoffa; \ - k = k - i; \ - diagoffa = 0; \ - b_cast = b_cast + i * PACKNR; \ - } \ -\ + skipped over the region that was not stored. */ + if ( diagoffa > 0 ) + { + k -= diagoffa; + b_cast += diagoffa * PACKNR * dt_size; + diagoffa = 0; + } + /* If there is a zero region below where the diagonal of A intersects the right side of the block, shrink it to prevent "no-op" iterations from - executing. */ \ - if ( -diagoffa + k < m ) \ - { \ - m = -diagoffa + k; \ - } \ -\ + executing. */ + if ( -diagoffa + k < m ) + { + m = -diagoffa + k; + } + /* Check the k dimension, which needs to be a multiple of MR. If k isn't a multiple of MR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an MR x MR triangular solve. @@ -274,223 +159,201 @@ void PASTEMAC(ch,varname) \ matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of B. */ \ - if ( k % MR != 0 ) k += MR - ( k % MR ); \ -\ + "good" values of the corresponding block of B. 
*/ + if ( k % MR != 0 ) k += MR - ( k % MR ); + /* NOTE: We don't need to check that m is a multiple of PACKMR since we know that the underlying buffer was already allocated to have an m dimension that is a multiple of PACKMR, with the region between the - last row and the next multiple of MR zero-padded accordingly. */ \ -\ + last row and the next multiple of MR zero-padded accordingly. */ + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k; \ - istep_b = PACKNR * k_full; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ - /* Save the pack schemas of A and B to the auxinfo_t object. */ \ - bli_auxinfo_set_schema_a( schema_a, &aux ); \ - bli_auxinfo_set_schema_b( schema_b, &aux ); \ -\ - /* Save the imaginary stride of B to the auxinfo_t object. */ \ - bli_auxinfo_set_is_b( istep_b, &aux ); \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + + /* Save the pack schemas of A and B to the auxinfo_t object. */ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_a, &aux ); + bli_auxinfo_set_schema_b( schema_b, &aux ); + /* We don't bother querying the thrinfo_t node for the 1st loop because we can't parallelize that loop in trsm due to the inter-iteration - dependencies that exist. 
*/ \ - /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ \ -\ - /* Query the number of threads and thread ids for each loop. */ \ - dim_t jr_nt = bli_thread_n_way( thread ); \ - dim_t jr_tid = bli_thread_work_id( thread ); \ -\ - dim_t jr_start, jr_end; \ - dim_t jr_inc; \ -\ + dependencies that exist. */ + /*thrinfo_t* caucus = bli_thrinfo_sub_node( thread );*/ + + /* Query the number of threads and thread ids for each loop. */ + dim_t jr_nt = bli_thread_n_way( thread ); + dim_t jr_tid = bli_thread_work_id( thread ); + + dim_t jr_start, jr_end; + dim_t jr_inc; + /* Determine the thread range and increment for the 2nd loop. NOTE: The definition of bli_thread_range_jrir() will depend on whether slab or round-robin partitioning was requested at configure-time. NOTE: Parallelism in the 1st loop is unattainable due to the - inter-iteration dependencies present in trsm. */ \ - bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = jr_start; j < jr_end; j += jr_inc ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b2; \ -\ - b1 = b_cast + j * cstep_b; \ - c1 = c_cast + j * cstep_c; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ - a1 = a_cast; \ - c11 = c1 + (m_iter-1)*rstep_c; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( ib = 0; ib < m_iter; ++ib ) \ - { \ - i = m_iter - 1 - ib; \ - diagoffa_i = diagoffa + ( doff_t )i*MR; \ -\ - m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); \ -\ + inter-iteration dependencies present in trsm. */ + bli_thread_range_jrir( thread, n_iter, 1, FALSE, &jr_start, &jr_end, &jr_inc ); + + /* Loop over the n dimension (NR columns at a time). 
*/ + for ( dim_t j = jr_start; j < jr_end; j += jr_inc ) + { + const char* b1 = b_cast + j * cstep_b; + char* c1 = c_cast + j * cstep_c; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + + const char* a1 = a_cast; + char* c11 = c1 + (m_iter-1)*rstep_c; + + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t ib = 0; ib < m_iter; ++ib ) + { + dim_t i = m_iter - 1 - ib; + doff_t diagoffa_i = diagoffa + ( doff_t )i*MR; + + dim_t m_cur = ( bli_is_not_edge_b( ib, m_iter, m_left ) ? MR : m_left ); + /* If the current panel of A intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of A resides above the diagonal, use a a regular gemm micro-kernel. Otherwise, if it is below the diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ \ - if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict a11; \ - ctype* restrict a12; \ - ctype* restrict b11; \ - ctype* restrict b21; \ - ctype* restrict a2; \ -\ - /* Compute various offsets into and lengths of parts of A. */ \ - off_a11 = diagoffa_i; \ - k_a1112 = k - off_a11;; \ - k_a11 = MR; \ - k_a12 = k_a1112 - MR; \ - off_a12 = off_a11 + k_a11; \ -\ + and so we do nothing. */ + if ( bli_intersects_diag_n( diagoffa_i, MR, k ) ) + { + /* Compute various offsets into and lengths of parts of A. */ + dim_t off_a11 = diagoffa_i; + dim_t k_a1112 = k - off_a11;; + dim_t k_a11 = MR; + dim_t k_a12 = k_a1112 - MR; + dim_t off_a12 = off_a11 + k_a11; + /* Compute the panel stride for the current diagonal- - intersecting micro-panel. */ \ - is_a_cur = k_a1112 * PACKMR; \ - is_a_cur += ( bli_is_odd( is_a_cur ) ? 1 : 0 ); \ - ps_a_cur = is_a_cur; \ -\ + intersecting micro-panel. */ + inc_t ps_a_cur = k_a1112 * PACKMR; + ps_a_cur += ( bli_is_odd( ps_a_cur ) ? 
1 : 0 ); + ps_a_cur *= dt_size; + /* Compute the addresses of the triangular block A11 and the - panel A12. */ \ - a11 = a1; \ - a12 = a1 + k_a11 * PACKMR; \ - /*a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );*/ \ -\ + panel A12. */ + const char* a11 = a1; + const char* a12 = a1 + k_a11 * PACKMR; + /*a12 = bli_ptr_inc_by_frac( a1, sizeof( ctype ), k_a11 * PACKMR, 1 );*/ + /* Compute the addresses of the panel B01 and the block - B11. */ \ - b11 = b1 + off_a11 * PACKNR; \ - b21 = b1 + off_a12 * PACKNR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + ps_a_cur; \ - if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + B11. */ + const char* b11 = b1 + off_a11 * PACKNR; + const char* b21 = b1 + off_a12 * PACKNR; + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1 + ps_a_cur; + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - gemmtrsm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_a12, \ - alpha1_cast, \ - a12, \ - a11, \ - b21, \ - b11, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ -\ - a1 += ps_a_cur; \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) \ - { \ - ctype* restrict a2; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1 + rstep_a; \ - if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) \ - { \ - a2 = a_cast; \ - b2 = b1; \ - if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) \ - b2 = b_cast; \ - } \ -\ + object. 
*/ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + gemmtrsm_ukr + ( + m_cur, + n_cur, + k_a12, + ( void* )alpha1_cast, + ( void* )a12, + ( void* )a11, + ( void* )b21, + ( void* )b11, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += ps_a_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffa_i, MR, k ) ) + { + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1 + rstep_a; + if ( bli_is_last_iter_rr( ib, m_iter, 0, 1 ) ) + { + a2 = a_cast; + b2 = b1; + if ( bli_is_last_iter( j, n_iter, jr_tid, jr_nt ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t - object. */ \ - bli_auxinfo_set_next_a( a2, &aux ); \ - bli_auxinfo_set_next_b( b2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - minus_one, \ - a1, \ - b1, \ - alpha2_cast, \ - c11, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ -\ - a1 += rstep_a; \ - } \ -\ - c11 -= rstep_c; \ - } \ - } \ -\ + object. */ + bli_auxinfo_set_next_a( a2, &aux ); + bli_auxinfo_set_next_b( b2, &aux ); + + /* Invoke the gemm micro-kernel. 
*/ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )minus_one, + ( void* )a1, + ( void* )b1, + ( void* )alpha2_cast, + c11, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + + a1 += rstep_a; + } + + c11 -= rstep_c; + } + } + /* -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); \ -printf( "m_iter = %lu\n", m_iter ); \ -printf( "m_cur = %lu\n", m_cur ); \ -printf( "k = %lu\n", k ); \ -printf( "diagoffa_i = %lu\n", diagoffa_i ); \ -printf( "off_a1112 = %lu\n", off_a1112 ); \ -printf( "k_a1112 = %lu\n", k_a1112 ); \ -printf( "k_a12 = %lu\n", k_a12 ); \ -printf( "k_a11 = %lu\n", k_a11 ); \ -printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); \ -printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); \ -*/ \ -\ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: a1 (diag)", MR, k_a1112, a1, 1, MR, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 (diag)", MR, NR, b11, NR, 1, "%6.3f", "" ); +printf( "m_iter = %lu\n", m_iter ); +printf( "m_cur = %lu\n", m_cur ); +printf( "k = %lu\n", k ); +printf( "diagoffa_i = %lu\n", diagoffa_i ); +printf( "off_a1112 = %lu\n", off_a1112 ); +printf( "k_a1112 = %lu\n", k_a1112 ); +printf( "k_a12 = %lu\n", k_a12 ); +printf( "k_a11 = %lu\n", k_a11 ); +printf( "rs_c,cs_c = %lu %lu\n", rs_c, cs_c ); +printf( "rs_ct,cs_ct= %lu %lu\n", rs_ct, cs_ct ); +*/ + /* -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); +PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: b11 after (diag)", MR, NR, b11, NR, 1, "%5.2f", "" ); 
+PASTEMAC(ch,fprintm)( stdout, "trsm_lu_ker_var2: ct after (diag)", m_cur, n_cur, ct, rs_ct, cs_ct, "%5.2f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( trsm_lu_ker_var2 ) - diff --git a/frame/3/trsm/bli_trsm_rl_ker_var2.c b/frame/3/trsm/bli_trsm_rl_ker_var2.c index 721203df72..ed8d0bed66 100644 --- a/frame/3/trsm/bli_trsm_rl_ker_var2.c +++ b/frame/3/trsm/bli_trsm_rl_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffb, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha1, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* alpha2, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trsm_rl_ker_var2); - - void bli_trsm_rl_ker_var2 ( const obj_t* a, const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size ( dt ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + doff_t diagoffb = bli_obj_diag_offset( b ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -110,111 +87,28 @@ void bli_trsm_rl_ker_var2 // packing. const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. 
- ftypes[dt_exec] - ( - diagoffb, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha1, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_alpha2, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} - + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. */ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Cast the micro-kernel address to its function pointer type. */ \ + /* Cast the micro-kernel address to its function pointer type. */ /* NOTE: We use the upper-triangular gemmtrsm ukernel because, while the current macro-kernel targets the "rl" case (right-side/lower- triangular), it becomes upper-triangular after the kernel operation is transposed so that all kernel instances are of the "left" - variety (since those are the only trsm ukernels that exist). */ \ - PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. 
*/ \ -/* - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? MR : 1 ); \ -*/ \ -\ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha1_cast = alpha1; \ - ctype* restrict alpha2_cast = alpha2; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffb_j; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_b1121; \ - dim_t k_b11; \ - dim_t k_b21; \ - dim_t off_b11; \ - dim_t off_b21; \ - dim_t i, j, jb; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_b_cur; \ - inc_t is_b_cur; \ - auxinfo_t aux; \ -\ + variety (since those are the only trsm ukernels that exist). */ + gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_U_UKR, cntx ); + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* minus_one = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha1_cast = buf_alpha1; + const char* alpha2_cast = buf_alpha2; + /* Assumptions/assertions: rs_a == 1 @@ -235,41 +129,32 @@ void PASTEMAC(ch,varname) \ transposing the operation, then A needs to be packed with NR and B needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. 
*/ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. */ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current panel of B is entirely above its diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; \ -\ - /* Compute k_full as k inflated up to a multiple of NR. This is - needed because some parameter combinations of trsm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ -\ + it is implicitly zero. So we do nothing. */ + if ( bli_is_strictly_above_diag_n( diagoffb, k, n ) ) return; + /* If there is a zero region above where the diagonal of B intersects the left edge of the panel, adjust the pointer to A and treat this case as if the diagonal offset were zero. Note that we don't need to adjust the pointer to B since packm would have simply skipped over - the region that was not stored. */ \ - if ( diagoffb < 0 ) \ - { \ - j = -diagoffb; \ - k = k - j; \ - diagoffb = 0; \ - a_cast = a_cast + j * PACKMR; \ - } \ -\ + the region that was not stored. 
*/ + if ( diagoffb < 0 ) + { + k += diagoffb; + a_cast -= diagoffb * PACKMR * dt_size; + diagoffb = 0; + } + /* If there is a zero region to the right of where the diagonal of B intersects the bottom of the panel, shrink it so that we can index to the correct place in C (corresponding to the @@ -277,12 +162,12 @@ void PASTEMAC(ch,varname) \ NOTE: This is NOT being done to skip over "no-op" iterations, as with the trsm_lu macro-kernel. This MUST be done for correct execution because we use n (via n_iter) to compute diagonal and - index offsets for backwards movement through B. */ \ - if ( diagoffb + k < n ) \ - { \ - n = diagoffb + k; \ - } \ -\ + index offsets for backwards movement through B. */ + if ( diagoffb + k < n ) + { + n = diagoffb + k; + } + /* Check the k dimension, which needs to be a multiple of NR. If k isn't a multiple of NR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an NR x NR triangular solve. @@ -292,209 +177,186 @@ void PASTEMAC(ch,varname) \ matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of A. */ \ - if ( k % NR != 0 ) k += NR - ( k % NR ); \ -\ + "good" values of the corresponding block of A. */ + if ( k % NR != 0 ) k += NR - ( k % NR ); + /* NOTE: We don't need to check that n is a multiple of PACKNR since we know that the underlying buffer was already allocated to have an n dimension that is a multiple of PACKNR, with the region between the - last column and the next multiple of NR zero-padded accordingly. */ \ -\ + last column and the next multiple of NR zero-padded accordingly. */ + /* Compute number of primary and leftover components of the m and n - dimensions. 
*/ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k_full; \ - istep_b = PACKNR * k; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + /* Save the pack schemas of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_schema_a( schema_b, &aux ); \ - bli_auxinfo_set_schema_b( schema_a, &aux ); \ -\ - /* Save the imaginary stride of A to the auxinfo_t object. - NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_is_b( istep_a, &aux ); \ -\ - b1 = b_cast; \ - c1 = c_cast; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( jb = 0; jb < n_iter; ++jb ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b11; \ - ctype* restrict b21; \ - ctype* restrict b2; \ -\ - j = n_iter - 1 - jb; \ - diagoffb_j = diagoffb - ( doff_t )j*NR; \ - a1 = a_cast; \ - c11 = c1 + (n_iter-1)*cstep_c; \ -\ - n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ + "A" matrix is actually contained within B. 
*/ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_b, &aux ); + bli_auxinfo_set_schema_b( schema_a, &aux ); + + const char* b1 = b_cast; + char* c1 = c_cast; + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t jb = 0; jb < n_iter; ++jb ) + { + dim_t j = n_iter - 1 - jb; + doff_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + dim_t n_cur = ( bli_is_not_edge_b( jb, n_iter, n_left ) ? NR : n_left ); + + const char* a1 = a_cast; + char* c11 = c1 + (n_iter-1)*cstep_c; + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + /* If the current panel of B intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of B resides below the diagonal, use a a regular gemm micro-kernel. Otherwise, if it is above the diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ - { \ + and so we do nothing. */ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { /* Determine the offset to and length of the panel that was packed - so we can index into the corresponding location in A. */ \ - off_b11 = bli_max( -diagoffb_j, 0 ); \ - k_b1121 = k - off_b11; \ - k_b11 = NR; \ - k_b21 = k_b1121 - NR; \ - off_b21 = off_b11 + k_b11; \ -\ + so we can index into the corresponding location in A. */ + dim_t off_b11 = bli_max( -diagoffb_j, 0 ); + dim_t k_b1121 = k - off_b11; + dim_t k_b11 = NR; + dim_t k_b21 = k_b1121 - NR; + dim_t off_b21 = off_b11 + k_b11; + /* Compute the addresses of the triangular block B11 and the - panel B21. */ \ - b11 = b1; \ - b21 = b1 + k_b11 * PACKNR; \ - /*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ \ -\ - /* Compute the panel stride for the current micro-panel. */ \ - is_b_cur = k_b1121 * PACKNR; \ - is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ - ps_b_cur = is_b_cur; \ -\ - /* Loop over the m dimension (MR rows at a time). 
*/ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trsm_my_iter_rr( i, thread ) ){ \ -\ - ctype* restrict a11; \ - ctype* restrict a12; \ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the A11 block and A12 panel. */ \ - a11 = a1 + off_b11 * PACKMR; \ - a12 = a1 + off_b21 * PACKMR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ - { \ - a2 = a_cast; \ - b2 = b1 + ps_b_cur; \ - if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ - b2 = b_cast; \ - } \ -\ + panel B21. */ + const char* b11 = b1; + const char* b21 = b1 + k_b11 * PACKNR; + /*b21 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b11 * PACKNR, 1 );*/ + + /* Compute the panel stride for the current micro-panel. */ + inc_t ps_b_cur = k_b1121 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trsm_my_iter_rr( i, thread ) ){ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the A11 block and A12 panel. */ + const char* a11 = a1 + off_b11 * PACKMR; + const char* a12 = a1 + off_b21 * PACKMR; + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + if ( i + bli_thread_num_threads(thread) >= m_iter ) + { + a2 = a_cast; + b2 = b1 + ps_b_cur; + if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. 
*/ \ - bli_auxinfo_set_next_a( b2, &aux ); \ - bli_auxinfo_set_next_b( a2, &aux ); \ -\ - gemmtrsm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_b21, \ - alpha1_cast, \ - b21, \ - b11, \ - a12, \ - a11, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ -\ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ -\ - b1 += ps_b_cur; \ - } \ - else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) \ - { \ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trsm_my_iter_rr( i, thread ) ){ \ -\ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) \ - b2 = b_cast; \ - } \ -\ + triangular "A" matrix is actually contained within B. */ + bli_auxinfo_set_next_a( b2, &aux ); + bli_auxinfo_set_next_b( a2, &aux ); + + gemmtrsm_ukr + ( + m_cur, + n_cur, + k_b21, + ( void* )alpha1_cast, + ( void* )b21, + ( void* )b11, + ( void* )a12, + ( void* )a11, + c11, cs_c, rs_c, + &aux, + ( cntx_t* )cntx + ); + + } + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_below_diag_n( diagoffb_j, k, NR ) ) + { + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trsm_my_iter_rr( i, thread ) ){ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. 
*/ + const char* a2 = a1; + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + if ( i + bli_thread_num_threads(thread) >= m_iter ) + { + a2 = a_cast; + b2 = b1 + cstep_b; + if ( bli_is_last_iter_rr( jb, n_iter, 0, 1 ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_next_a( b2, &aux ); \ - bli_auxinfo_set_next_b( a2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - minus_one, \ - b1, \ - a1, \ - alpha2_cast, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ -\ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ -\ - b1 += cstep_b; \ - } \ -\ - c1 -= cstep_c; \ - } \ -} + triangular "A" matrix is actually contained within B. */ + bli_auxinfo_set_next_a( b2, &aux ); + bli_auxinfo_set_next_b( a2, &aux ); + + /* Invoke the gemm micro-kernel. */ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )minus_one, + ( void* )b1, + ( void* )a1, + ( void* )alpha2_cast, + c11, cs_c, rs_c, + &aux, + ( cntx_t* )cntx + ); + + } -INSERT_GENTFUNC_BASIC0( trsm_rl_ker_var2 ) + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += cstep_b; + } + + c1 -= cstep_c; + } +} diff --git a/frame/3/trsm/bli_trsm_ru_ker_var2.c b/frame/3/trsm/bli_trsm_ru_ker_var2.c index 447fbf8cd5..f203fc4445 100644 --- a/frame/3/trsm/bli_trsm_ru_ker_var2.c +++ b/frame/3/trsm/bli_trsm_ru_ker_var2.c @@ -35,50 +35,27 @@ #include "blis.h" -#define FUNCPTR_T gemm_fp - -typedef void (*FUNCPTR_T) - ( - doff_t diagoffb, - pack_t schema_a, - pack_t schema_b, - dim_t m, - dim_t n, - dim_t k, - void* alpha1, - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, - void* alpha2, - void* c, inc_t rs_c, inc_t cs_c, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - -static FUNCPTR_T GENARRAY(ftypes,trsm_ru_ker_var2); - - void bli_trsm_ru_ker_var2 ( const obj_t* a, 
const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { - const num_t dt_exec = bli_obj_exec_dt( c ); + const num_t dt = bli_obj_exec_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const doff_t diagoffb = bli_obj_diag_offset( b ); + doff_t diagoffb = bli_obj_diag_offset( b ); const pack_t schema_a = bli_obj_pack_schema( a ); const pack_t schema_b = bli_obj_pack_schema( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - const dim_t k = bli_obj_width( a ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); + dim_t k = bli_obj_width( a ); const void* buf_a = bli_obj_buffer_at_off( a ); const inc_t cs_a = bli_obj_col_stride( a ); @@ -110,110 +87,28 @@ void bli_trsm_ru_ker_var2 // packing. const void* buf_alpha2 = bli_obj_internal_scalar_buffer( c ); - // Index into the type combination array to extract the correct - // function pointer. - ftypes[dt_exec] - ( - diagoffb, - schema_a, - schema_b, - m, - n, - k, - ( void* )buf_alpha1, - ( void* )buf_a, cs_a, pd_a, ps_a, - ( void* )buf_b, rs_b, pd_b, ps_b, - ( void* )buf_alpha2, - buf_c, rs_c, cs_c, - ( cntx_t* )cntx, - rntm, - thread - ); -} - + /* Alias some constants to simpler names. */ + const dim_t MR = pd_a; + const dim_t NR = pd_b; + const dim_t PACKMR = cs_a; + const dim_t PACKNR = rs_b; -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoffb, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* Alias some constants to simpler names. 
*/ \ - const dim_t MR = pd_a; \ - const dim_t NR = pd_b; \ - const dim_t PACKMR = cs_a; \ - const dim_t PACKNR = rs_b; \ -\ - /* Cast the micro-kernel address to its function pointer type. */ \ + /* Cast the micro-kernel address to its function pointer type. */ /* NOTE: We use the lower-triangular gemmtrsm ukernel because, while the current macro-kernel targets the "ru" case (right-side/upper- triangular), it becomes lower-triangular after the kernel operation is transposed so that all kernel instances are of the "left" - variety (since those are the only trsm ukernels that exist). */ \ - PASTECH(ch,gemmtrsm_ukr_ft) \ - gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); \ - PASTECH(ch,gemm_ukr_ft) \ - gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); \ -\ - /* Temporary C buffer for edge cases. Note that the strides of this - temporary buffer are set so that they match the storage of the - original C matrix. For example, if C is column-stored, ct will be - column-stored as well. */ \ -/* - ctype ct[ BLIS_STACK_BUF_MAX_SIZE \ - / sizeof( ctype ) ] \ - __attribute__((aligned(BLIS_STACK_BUF_ALIGN_SIZE))); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_VIR_UKR, cntx ); \ - const inc_t rs_ct = ( col_pref ? 1 : NR ); \ - const inc_t cs_ct = ( col_pref ? 
MR : 1 ); \ -*/ \ -\ - ctype* restrict minus_one = PASTEMAC(ch,m1); \ - ctype* restrict a_cast = a; \ - ctype* restrict b_cast = b; \ - ctype* restrict c_cast = c; \ - ctype* restrict alpha1_cast = alpha1; \ - ctype* restrict alpha2_cast = alpha2; \ - ctype* restrict b1; \ - ctype* restrict c1; \ -\ - doff_t diagoffb_j; \ - dim_t k_full; \ - dim_t m_iter, m_left; \ - dim_t n_iter, n_left; \ - dim_t m_cur; \ - dim_t n_cur; \ - dim_t k_b0111; \ - dim_t k_b01; \ - dim_t off_b01; \ - dim_t off_b11; \ - dim_t i, j; \ - inc_t rstep_a; \ - inc_t cstep_b; \ - inc_t rstep_c, cstep_c; \ - inc_t istep_a; \ - inc_t istep_b; \ - inc_t ps_b_cur; \ - inc_t is_b_cur; \ - auxinfo_t aux; \ -\ + variety (since those are the only trsm ukernels that exist). */ + gemmtrsm_ukr_vft gemmtrsm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMMTRSM_L_UKR, cntx ); + gemm_ukr_vft gemm_ukr = bli_cntx_get_l3_vir_ukr_dt( dt, BLIS_GEMM_UKR, cntx ); + + const void* minus_one = bli_obj_buffer_for_const( dt, &BLIS_MINUS_ONE ); + const char* a_cast = buf_a; + const char* b_cast = buf_b; + char* c_cast = buf_c; + const char* alpha1_cast = buf_alpha1; + const char* alpha2_cast = buf_alpha2; + /* Assumptions/assertions: rs_a == 1 @@ -234,50 +129,41 @@ void PASTEMAC(ch,varname) \ transposing the operation, then A needs to be packed with NR and B needs to be packed with MR (remember: B is the triangular matrix in the right-hand side parameter case). - */ \ -\ + */ + /* Safety trap: Certain indexing within this macro-kernel does not - work as intended if both MR and NR are odd. */ \ - if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || \ - ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); \ -\ - /* If any dimension is zero, return immediately. */ \ - if ( bli_zero_dim3( m, n, k ) ) return; \ -\ + work as intended if both MR and NR are odd. 
*/ + if ( ( bli_is_odd( PACKMR ) && bli_is_odd( NR ) ) || + ( bli_is_odd( PACKNR ) && bli_is_odd( MR ) ) ) bli_abort(); + + /* If any dimension is zero, return immediately. */ + if ( bli_zero_dim3( m, n, k ) ) return; + /* Safeguard: If the current panel of B is entirely below its diagonal, - it is implicitly zero. So we do nothing. */ \ - if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; \ -\ - /* Compute k_full as k inflated up to a multiple of NR. This is - needed because some parameter combinations of trsm reduce k - to advance past zero regions in the triangular matrix, and - when computing the imaginary stride of B (the non-triangular - matrix), which is used by 4m1/3m1 implementations, we need - this unreduced value of k. */ \ - k_full = ( k % NR != 0 ? k + NR - ( k % NR ) : k ); \ -\ + it is implicitly zero. So we do nothing. */ + if ( bli_is_strictly_below_diag_n( diagoffb, k, n ) ) return; + /* If there is a zero region to the left of where the diagonal of B intersects the top edge of the panel, adjust the pointer to C and treat this case as if the diagonal offset were zero. This skips over the region that was not packed. (Note we assume the diagonal offset is a multiple of MR; this assumption will hold as long as the cache - blocksizes are each a multiple of MR and NR.) */ \ - if ( diagoffb > 0 ) \ - { \ - j = diagoffb; \ - n = n - j; \ - diagoffb = 0; \ - c_cast = c_cast + (j )*cs_c; \ - } \ -\ + blocksizes are each a multiple of MR and NR.) */ + if ( diagoffb > 0 ) + { + n -= diagoffb; + c_cast += diagoffb * cs_c * dt_size; + diagoffb = 0; + } + /* If there is a zero region below where the diagonal of B intersects the right side of the block, shrink it to prevent "no-op" iterations from - executing. */ \ - if ( -diagoffb + n < k ) \ - { \ - k = -diagoffb + n; \ - } \ -\ + executing. */ + if ( -diagoffb + n < k ) + { + k = -diagoffb + n; + } + /* Check the k dimension, which needs to be a multiple of NR. 
If k isn't a multiple of NR, we adjust it higher to satisfy the micro- kernel, which is expecting to perform an NR x NR triangular solve. @@ -287,207 +173,184 @@ void PASTEMAC(ch,varname) \ matrix has its diagonal extended into the zero-padded region (as identity). This allows the trsm of that bottom-right panel to proceed without producing any infs or NaNs that would infect the - "good" values of the corresponding block of A. */ \ - if ( k % NR != 0 ) k += NR - ( k % NR ); \ -\ + "good" values of the corresponding block of A. */ + if ( k % NR != 0 ) k += NR - ( k % NR ); + /* NOTE: We don't need to check that n is a multiple of PACKNR since we know that the underlying buffer was already allocated to have an n dimension that is a multiple of PACKNR, with the region between the - last column and the next multiple of NR zero-padded accordingly. */ \ -\ + last column and the next multiple of NR zero-padded accordingly. */ + /* Compute number of primary and leftover components of the m and n - dimensions. */ \ - n_iter = n / NR; \ - n_left = n % NR; \ -\ - m_iter = m / MR; \ - m_left = m % MR; \ -\ - if ( n_left ) ++n_iter; \ - if ( m_left ) ++m_iter; \ -\ - /* Determine some increments used to step through A, B, and C. */ \ - rstep_a = ps_a; \ -\ - cstep_b = ps_b; \ -\ - rstep_c = rs_c * MR; \ - cstep_c = cs_c * NR; \ -\ - istep_a = PACKMR * k_full; \ - istep_b = PACKNR * k; \ -\ - if ( bli_is_odd( istep_a ) ) istep_a += 1; \ - if ( bli_is_odd( istep_b ) ) istep_b += 1; \ -\ + dimensions. */ + dim_t n_iter = n / NR; + dim_t n_left = n % NR; + + dim_t m_iter = m / MR; + dim_t m_left = m % MR; + + if ( n_left ) ++n_iter; + if ( m_left ) ++m_iter; + + /* Determine some increments used to step through A, B, and C. */ + inc_t rstep_a = ps_a * dt_size; + + inc_t cstep_b = ps_b * dt_size; + + inc_t rstep_c = rs_c * MR * dt_size; + inc_t cstep_c = cs_c * NR * dt_size; + /* Save the pack schemas of A and B to the auxinfo_t object. 
NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_schema_a( schema_b, &aux ); \ - bli_auxinfo_set_schema_b( schema_a, &aux ); \ -\ - /* Save the imaginary stride of A to the auxinfo_t object. - NOTE: We swap the values for A and B since the triangular - "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_is_b( istep_a, &aux ); \ -\ - b1 = b_cast; \ - c1 = c_cast; \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - for ( j = 0; j < n_iter; ++j ) \ - { \ - ctype* restrict a1; \ - ctype* restrict c11; \ - ctype* restrict b01; \ - ctype* restrict b11; \ - ctype* restrict b2; \ -\ - diagoffb_j = diagoffb - ( doff_t )j*NR; \ - a1 = a_cast; \ - c11 = c1; \ -\ - n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); \ -\ - /* Initialize our next panel of B to be the current panel of B. */ \ - b2 = b1; \ -\ + "A" matrix is actually contained within B. */ + auxinfo_t aux; + bli_auxinfo_set_schema_a( schema_b, &aux ); + bli_auxinfo_set_schema_b( schema_a, &aux ); + + const char* b1 = b_cast; + char* c1 = c_cast; + + /* Loop over the n dimension (NR columns at a time). */ + for ( dim_t j = 0; j < n_iter; ++j ) + { + dim_t diagoffb_j = diagoffb - ( doff_t )j*NR; + + dim_t n_cur = ( bli_is_not_edge_f( j, n_iter, n_left ) ? NR : n_left ); + + const char* a1 = a_cast; + char* c11 = c1; + + /* Initialize our next panel of B to be the current panel of B. */ + const char* b2 = b1; + /* If the current panel of B intersects the diagonal, use a special micro-kernel that performs a fused gemm and trsm. If the current panel of B resides above the diagonal, use a a regular gemm micro-kernel. Otherwise, if it is below the diagonal, it was not packed (because it is implicitly zero) - and so we do nothing. */ \ - if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) \ - { \ + and so we do nothing. 
*/ + if ( bli_intersects_diag_n( diagoffb_j, k, NR ) ) + { /* Determine the offset to and length of the panel that was packed - so we can index into the corresponding location in A. */ \ - off_b01 = 0; \ - k_b0111 = bli_min( k, -diagoffb_j + NR ); \ - k_b01 = k_b0111 - NR; \ - off_b11 = k_b01; \ -\ + so we can index into the corresponding location in A. */ + dim_t off_b01 = 0; + dim_t k_b0111 = bli_min( k, -diagoffb_j + NR ); + dim_t k_b01 = k_b0111 - NR; + dim_t off_b11 = k_b01; + /* Compute the addresses of the panel B10 and the triangular - block B11. */ \ - b01 = b1; \ - b11 = b1 + k_b01 * PACKNR; \ - /*b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ \ -\ - /* Compute the panel stride for the current micro-panel. */ \ - is_b_cur = k_b0111 * PACKNR; \ - is_b_cur += ( bli_is_odd( is_b_cur ) ? 1 : 0 ); \ - ps_b_cur = is_b_cur; \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trsm_my_iter_rr( i, thread ) ){ \ -\ - ctype* restrict a10; \ - ctype* restrict a11; \ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the A10 panel and A11 block. */ \ - a10 = a1 + off_b01 * PACKMR; \ - a11 = a1 + off_b11 * PACKMR; \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ - { \ - a2 = a_cast; \ - b2 = b1 + ps_b_cur; \ - if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ - b2 = b_cast; \ - } \ -\ + block B11. */ + const char* b01 = b1; + const char* b11 = b1 + k_b01 * PACKNR; + /*b11 = bli_ptr_inc_by_frac( b1, sizeof( ctype ), k_b01 * PACKNR, 1 );*/ + + /* Compute the panel stride for the current micro-panel. */ + inc_t ps_b_cur = k_b0111 * PACKNR; + ps_b_cur += ( bli_is_odd( ps_b_cur ) ? 1 : 0 ); + ps_b_cur *= dt_size; + + /* Loop over the m dimension (MR rows at a time). 
*/ + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trsm_my_iter_rr( i, thread ) ){ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the A10 panel and A11 block. */ + const char* a10 = a1 + off_b01 * PACKMR; + const char* a11 = a1 + off_b11 * PACKMR; + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + if ( i + bli_thread_num_threads(thread) >= m_iter ) + { + a2 = a_cast; + b2 = b1 + ps_b_cur; + if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_next_a( b2, &aux ); \ - bli_auxinfo_set_next_b( a2, &aux ); \ -\ - gemmtrsm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k_b01, \ - alpha1_cast, \ - b01, \ - b11, \ - a10, \ - a11, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ -\ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ -\ - b1 += ps_b_cur; \ - } \ - else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) \ - { \ - /* Loop over the m dimension (MR rows at a time). */ \ - for ( i = 0; i < m_iter; ++i ) \ - { \ - if ( bli_trsm_my_iter_rr( i, thread ) ){ \ -\ - ctype* restrict a2; \ -\ - m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); \ -\ - /* Compute the addresses of the next panels of A and B. */ \ - a2 = a1; \ - /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */\ - if ( i + bli_thread_num_threads(thread) >= m_iter ) \ - { \ - a2 = a_cast; \ - b2 = b1 + cstep_b; \ - if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) \ - b2 = b_cast; \ - } \ -\ + triangular "A" matrix is actually contained within B. 
*/ + bli_auxinfo_set_next_a( b2, &aux ); + bli_auxinfo_set_next_b( a2, &aux ); + + gemmtrsm_ukr + ( + m_cur, + n_cur, + k_b01, + ( void* )alpha1_cast, + ( void* )b01, + ( void* )b11, + ( void* )a10, + ( void* )a11, + c11, cs_c, rs_c, + &aux, + ( cntx_t* )cntx + ); + + } + + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += ps_b_cur; + } + else if ( bli_is_strictly_above_diag_n( diagoffb_j, k, NR ) ) + { + /* Loop over the m dimension (MR rows at a time). */ + for ( dim_t i = 0; i < m_iter; ++i ) + { + if ( bli_trsm_my_iter_rr( i, thread ) ){ + + dim_t m_cur = ( bli_is_not_edge_f( i, m_iter, m_left ) ? MR : m_left ); + + /* Compute the addresses of the next panels of A and B. */ + const char* a2 = a1; + /*if ( bli_is_last_iter_rr( i, m_iter, 0, 1 ) ) */ + if ( i + bli_thread_num_threads(thread) >= m_iter ) + { + a2 = a_cast; + b2 = b1 + cstep_b; + if ( bli_is_last_iter_rr( j, n_iter, 0, 1 ) ) + b2 = b_cast; + } + /* Save addresses of next panels of A and B to the auxinfo_t object. NOTE: We swap the values for A and B since the - triangular "A" matrix is actually contained within B. */ \ - bli_auxinfo_set_next_a( b2, &aux ); \ - bli_auxinfo_set_next_b( a2, &aux ); \ -\ - /* Invoke the gemm micro-kernel. */ \ - gemm_ukr \ - ( \ - m_cur, \ - n_cur, \ - k, \ - minus_one, \ - b1, \ - a1, \ - alpha2_cast, \ - c11, cs_c, rs_c, \ - &aux, \ - cntx \ - ); \ -\ - } \ -\ - a1 += rstep_a; \ - c11 += rstep_c; \ - } \ -\ - b1 += cstep_b; \ - } \ -\ - c1 += cstep_c; \ - } \ -} + triangular "A" matrix is actually contained within B. */ + bli_auxinfo_set_next_a( b2, &aux ); + bli_auxinfo_set_next_b( a2, &aux ); + + /* Invoke the gemm micro-kernel. 
*/ + gemm_ukr + ( + m_cur, + n_cur, + k, + ( void* )minus_one, + ( void* )b1, + ( void* )a1, + ( void* )alpha2_cast, + c11, cs_c, rs_c, + &aux, + ( cntx_t* )cntx + ); + + } -INSERT_GENTFUNC_BASIC0( trsm_ru_ker_var2 ) + a1 += rstep_a; + c11 += rstep_c; + } + + b1 += cstep_b; + } + + c1 += cstep_c; + } +} diff --git a/frame/3/trsm/bli_trsm_var.h b/frame/3/trsm/bli_trsm_var.h index 7e747b4a88..0279cff491 100644 --- a/frame/3/trsm/bli_trsm_var.h +++ b/frame/3/trsm/bli_trsm_var.h @@ -47,8 +47,7 @@ void PASTEMAC0(opname) \ const obj_t* b, \ const obj_t* c, \ const cntx_t* cntx, \ - rntm_t* rntm, \ - cntl_t* cntl, \ + const cntl_t* cntl, \ thrinfo_t* thread \ ); @@ -63,36 +62,3 @@ GENPROT( trsm_lu_ker_var2 ) GENPROT( trsm_rl_ker_var2 ) GENPROT( trsm_ru_ker_var2 ) - -// -// Prototype BLAS-like interfaces with void pointer operands. -// - -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - doff_t diagoff, \ - pack_t schema_a, \ - pack_t schema_b, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha1, \ - void* a, inc_t cs_a, \ - dim_t pd_a, inc_t ps_a, \ - void* b, inc_t rs_b, \ - dim_t pd_b, inc_t ps_b, \ - void* alpha2, \ - void* c, inc_t rs_c, inc_t cs_c, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( trsm_ll_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_lu_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_rl_ker_var2 ) -INSERT_GENTPROT_BASIC0( trsm_ru_ker_var2 ) - diff --git a/frame/3/trsm/bli_trsm_xx_ker_var2.c b/frame/3/trsm/bli_trsm_xx_ker_var2.c index a0a59c0a85..f5d25bbbe4 100644 --- a/frame/3/trsm/bli_trsm_xx_ker_var2.c +++ b/frame/3/trsm/bli_trsm_xx_ker_var2.c @@ -47,8 +47,7 @@ void bli_trsm_xx_ker_var2 const obj_t* b, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ) { @@ -82,7 +81,6 @@ void bli_trsm_xx_ker_var2 b, c, cntx, - rntm, cntl, thread ); diff --git a/frame/base/bli_blksz.c b/frame/base/bli_blksz.c index 
3d164a7cf5..206c443e9d 100644 --- a/frame/base/bli_blksz.c +++ b/frame/base/bli_blksz.c @@ -235,38 +235,34 @@ void bli_blksz_reduce_max_to dim_t bli_determine_blocksize ( - dir_t direct, - dim_t i, - dim_t dim, - const obj_t* obj, - bszid_t bszid, - const cntx_t* cntx + dir_t direct, + dim_t i, + dim_t dim, + const obj_t* obj, + const blksz_t* bsize ) { if ( direct == BLIS_FWD ) - return bli_determine_blocksize_f( i, dim, obj, bszid, cntx ); + return bli_determine_blocksize_f( i, dim, obj, bsize ); else - return bli_determine_blocksize_b( i, dim, obj, bszid, cntx ); + return bli_determine_blocksize_b( i, dim, obj, bsize ); } dim_t bli_determine_blocksize_f ( - dim_t i, - dim_t dim, - const obj_t* obj, - bszid_t bszid, - const cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + const blksz_t* bsize ) { num_t dt; - const blksz_t* bsize; dim_t b_alg, b_max; dim_t b_use; // Extract the execution datatype and use it to query the corresponding // blocksize and blocksize maximum values from the blksz_t object. dt = bli_obj_exec_dt( obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); @@ -277,22 +273,19 @@ dim_t bli_determine_blocksize_f dim_t bli_determine_blocksize_b ( - dim_t i, - dim_t dim, - const obj_t* obj, - bszid_t bszid, - const cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + const blksz_t* bsize ) { num_t dt; - const blksz_t* bsize; dim_t b_alg, b_max; dim_t b_use; // Extract the execution datatype and use it to query the corresponding // blocksize and blocksize maximum values from the blksz_t object. 
dt = bli_obj_exec_dt( obj ); - bsize = bli_cntx_get_blksz( bszid, cntx ); b_alg = bli_blksz_get_def( dt, bsize ); b_max = bli_blksz_get_max( dt, bsize ); diff --git a/frame/base/bli_blksz.h b/frame/base/bli_blksz.h index d91c0542d8..70e9c46aa8 100644 --- a/frame/base/bli_blksz.h +++ b/frame/base/bli_blksz.h @@ -252,30 +252,27 @@ void bli_blksz_reduce_max_to dim_t bli_determine_blocksize ( - dir_t direct, - dim_t i, - dim_t dim, - const obj_t* obj, - bszid_t bszid, - const cntx_t* cntx + dir_t direct, + dim_t i, + dim_t dim, + const obj_t* obj, + const blksz_t* bsize ); dim_t bli_determine_blocksize_f ( - dim_t i, - dim_t dim, - const obj_t* obj, - bszid_t bszid, - const cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + const blksz_t* bsize ); dim_t bli_determine_blocksize_b ( - dim_t i, - dim_t dim, - const obj_t* obj, - bszid_t bszid, - const cntx_t* cntx + dim_t i, + dim_t dim, + const obj_t* obj, + const blksz_t* bsize ); dim_t bli_determine_blocksize_f_sub diff --git a/frame/base/bli_cntl.c b/frame/base/bli_cntl.c index b22ddbee0b..bdf2c418e5 100644 --- a/frame/base/bli_cntl.c +++ b/frame/base/bli_cntl.c @@ -35,286 +35,27 @@ #include "blis.h" -cntl_t* bli_cntl_create_node +void bli_cntl_initialize_node ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - void* params, - cntl_t* sub_node + cntl_t* cntl, + opid_t family, + bszid_t bszid, + void_fp var_func, + const void* params, + cntl_t* sub_prenode, + cntl_t* sub_node ) { - cntl_t* cntl; - mem_t* pack_mem; - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntl_create_node(): " ); - #endif - - // Allocate the cntl_t struct. 
- cntl = bli_sba_acquire( rntm, sizeof( cntl_t ) ); - bli_cntl_set_family( family, cntl ); - bli_cntl_set_bszid( bszid, cntl ); + bli_cntl_set_part( bszid, cntl ); bli_cntl_set_var_func( var_func, cntl ); bli_cntl_set_params( params, cntl ); - bli_cntl_set_sub_prenode( NULL, cntl ); + bli_cntl_set_sub_prenode( sub_prenode, cntl ); bli_cntl_set_sub_node( sub_node, cntl ); - - // Query the address of the node's packed mem_t entry so we can initialize - // key fields (to NULL or 0). - // NOTE: This initialization is important, since it allows threads to - // discern whether blocks have been acquired from the memory allocator. - pack_mem = bli_cntl_pack_mem( cntl ); - bli_mem_clear( pack_mem ); - - return cntl; -} - -void bli_cntl_free_node - ( - rntm_t* rntm, - cntl_t* cntl - ) -{ - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntl_free_node(): " ); - #endif - - bli_sba_release( rntm, cntl ); -} - -void bli_cntl_clear_node - ( - cntl_t* cntl - ) -{ - mem_t* pack_mem; - - // Clear various fields in the control tree. Clearing these fields - // actually is not needed, but we do it for debugging/completeness. - bli_cntl_set_var_func( NULL, cntl ); - bli_cntl_set_params( NULL, cntl ); - bli_cntl_set_sub_prenode( NULL, cntl ); - bli_cntl_set_sub_node( NULL, cntl ); - - // Clearing these fields is potentially more important if the control - // tree is cached somewhere and reused. - pack_mem = bli_cntl_pack_mem( cntl ); - bli_mem_clear( pack_mem ); } // ----------------------------------------------------------------------------- -void bli_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - if ( thread != NULL ) bli_cntl_free_w_thrinfo( rntm, cntl, thread ); - else bli_cntl_free_wo_thrinfo( rntm, cntl ); -} - -void bli_cntl_free_w_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - // Base case: simply return when asked to free NULL nodes. 
- if ( cntl == NULL ) return; - - cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl ); - cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); - void* cntl_params = bli_cntl_params( cntl ); - mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); - - // Don't immediately dereference the prenode and subnode of the thrinfo_t - // node. In some cases, the thrinfo_t tree is not built out all the way, - // perhaps because there are more ways of parallelization than micropanels - // of data in this dimension, or because the problem is small enough that - // there is no gemm subproblem in bli_trsm_blk_var1(). Thus, we start with - // NULL values for these variables and only dereference the fields of the - // thrinfo_t struct if the thrinfo_t exists (ie: is non-NULL). We will also - // have to check the thrinfo_t pointer for NULLness before using it below, - // when checking if we need to free the pack_mem field of the cntl_t node - // (see below). - thrinfo_t* thread_sub_prenode = NULL; - thrinfo_t* thread_sub_node = NULL; - - if ( thread != NULL ) - { - thread_sub_prenode = bli_thrinfo_sub_prenode( thread ); - thread_sub_node = bli_thrinfo_sub_node( thread ); - } - - // Only recurse into prenode branch if it exists. - if ( cntl_sub_prenode != NULL ) - { - // Recursively free all memory associated with the sub-prenode and its - // children. - bli_cntl_free_w_thrinfo( rntm, cntl_sub_prenode, thread_sub_prenode ); - } - - // Only recurse into the child node if it exists. - if ( cntl_sub_node != NULL ) - { - // Recursively free all memory associated with the sub-node and its - // children. - bli_cntl_free_w_thrinfo( rntm, cntl_sub_node, thread_sub_node ); - } - - // Free the current node's params field, if it is non-NULL. 
- if ( cntl_params != NULL ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntl_free_w_thrinfo(): " ); - #endif - - bli_sba_release( rntm, cntl_params ); - } - - // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the mem_t entry is - // allocated, and only if the current thread is chief for its group. - // Also note that we don't proceed with either of the above tests if - // the thrinfo_t pointer is NULL. (See above for background on when - // this can happen.) - if ( thread != NULL ) - if ( bli_thread_am_ochief( thread ) ) - if ( bli_mem_is_alloc( cntl_pack_mem ) ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_cntl_free_w_thrinfo(): releasing mem pool block.\n" ); - #endif - - bli_pba_release( rntm, cntl_pack_mem ); - } - - // Free the current node. - bli_cntl_free_node( rntm, cntl ); -} - -void bli_cntl_free_wo_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl - ) -{ - // Base case: simply return when asked to free NULL nodes. - if ( cntl == NULL ) return; - - cntl_t* cntl_sub_prenode = bli_cntl_sub_prenode( cntl ); - cntl_t* cntl_sub_node = bli_cntl_sub_node( cntl ); - void* cntl_params = bli_cntl_params( cntl ); - mem_t* cntl_pack_mem = bli_cntl_pack_mem( cntl ); - - { - // Recursively free all memory associated with the sub-prenode and its - // children. - bli_cntl_free_wo_thrinfo( rntm, cntl_sub_prenode ); - } - - { - // Recursively free all memory associated with the sub-node and its - // children. - bli_cntl_free_wo_thrinfo( rntm, cntl_sub_node ); - } - - // Free the current node's params field, if it is non-NULL. - if ( cntl_params != NULL ) - { - bli_sba_release( rntm, cntl_params ); - } - - // Release the current node's pack mem_t entry back to the memory - // broker from which it originated, but only if the mem_t entry is - // allocated. - if ( bli_mem_is_alloc( cntl_pack_mem ) ) - { - bli_pba_release( rntm, cntl_pack_mem ); - } - - // Free the current node. 
- bli_cntl_free_node( rntm, cntl ); -} - -// ----------------------------------------------------------------------------- - -cntl_t* bli_cntl_copy - ( - rntm_t* rntm, - cntl_t* cntl - ) -{ - // Make a copy of the current node. Notice that the source node - // should NOT have any allocated/cached mem_t entries, and that - // bli_cntl_create_node() creates a node with a cleared mem_t - // field. - cntl_t* cntl_copy = bli_cntl_create_node - ( - rntm, - bli_cntl_family( cntl ), - bli_cntl_bszid( cntl ), - bli_cntl_var_func( cntl ), - NULL, NULL - ); - - // Check the params field of the existing control tree; if it's non-NULL, - // copy it. - if ( bli_cntl_params( cntl ) != NULL ) - { - // Detect the size of the params struct by reading the first field - // as a uint64_t, and then allocate this many bytes for a new params - // struct. - uint64_t params_size = bli_cntl_params_size( cntl ); - void* params_orig = bli_cntl_params( cntl ); - void* params_copy = bli_sba_acquire( rntm, ( size_t )params_size ); - - // Copy the original params struct to the new memory region. - memcpy( params_copy, params_orig, params_size ); - - // Save the address of the new params struct into the new control - // tree node. - bli_cntl_set_params( params_copy, cntl_copy ); - } - - // If the sub-prenode exists, copy it recursively. - if ( bli_cntl_sub_prenode( cntl ) != NULL ) - { - cntl_t* sub_prenode_copy = bli_cntl_copy - ( - rntm, - bli_cntl_sub_prenode( cntl ) - ); - - // Save the address of the new sub-node (sub-tree) to the existing - // node. - bli_cntl_set_sub_prenode( sub_prenode_copy, cntl_copy ); - } - - // If the sub-node exists, copy it recursively. - if ( bli_cntl_sub_node( cntl ) != NULL ) - { - cntl_t* sub_node_copy = bli_cntl_copy - ( - rntm, - bli_cntl_sub_node( cntl ) - ); - - // Save the address of the new sub-node (sub-tree) to the existing - // node. - bli_cntl_set_sub_node( sub_node_copy, cntl_copy ); - } - - // Return the address of the newly created node. 
- return cntl_copy; -} - void bli_cntl_mark_family ( opid_t family, @@ -357,7 +98,7 @@ dim_t bli_cntl_calc_num_threads_in for ( ; cntl != NULL; cntl = bli_cntl_sub_node( cntl ) ) { - bszid_t bszid = bli_cntl_bszid( cntl ); + bszid_t bszid = bli_cntl_part( cntl ); dim_t cur_way; // We assume bszid is in {NC,KC,MC,NR,MR,KR} if it is not diff --git a/frame/base/bli_cntl.h b/frame/base/bli_cntl.h index 406a350eec..be58a7b228 100644 --- a/frame/base/bli_cntl.h +++ b/frame/base/bli_cntl.h @@ -39,20 +39,12 @@ struct cntl_s { - // Basic fields (usually required). opid_t family; - bszid_t bszid; + bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; - - // Optional fields (needed only by some operations such as packm). - // NOTE: first field of params must be a uint64_t containing the size - // of the struct. void* params; - - // Internal fields that track "cached" data. - mem_t pack_mem; }; typedef struct cntl_s cntl_t; */ @@ -60,55 +52,19 @@ typedef struct cntl_s cntl_t; // -- Control tree prototypes -- -BLIS_EXPORT_BLIS cntl_t* bli_cntl_create_node - ( - rntm_t* rntm, - opid_t family, - bszid_t bszid, - void_fp var_func, - void* params, - cntl_t* sub_node - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_node - ( - rntm_t* rntm, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS void bli_cntl_clear_node +BLIS_EXPORT_BLIS void bli_cntl_initialize_node ( - cntl_t* cntl + cntl_t* cntl, + opid_t family, + bszid_t bszid, + void_fp var_func, + const void* params, + cntl_t* sub_prenode, + cntl_t* sub_node ); // ----------------------------------------------------------------------------- -BLIS_EXPORT_BLIS void bli_cntl_free - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_w_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -BLIS_EXPORT_BLIS void bli_cntl_free_wo_thrinfo - ( - rntm_t* rntm, - cntl_t* cntl - ); - -BLIS_EXPORT_BLIS cntl_t* bli_cntl_copy - ( - rntm_t* rntm, - cntl_t* cntl - ); - 
BLIS_EXPORT_BLIS void bli_cntl_mark_family ( opid_t family, @@ -132,7 +88,7 @@ BLIS_INLINE opid_t bli_cntl_family( const cntl_t* cntl ) return cntl->family; } -BLIS_INLINE bszid_t bli_cntl_bszid( const cntl_t* cntl ) +BLIS_INLINE bszid_t bli_cntl_part( const cntl_t* cntl ) { return cntl->bszid; } @@ -152,22 +108,11 @@ BLIS_INLINE cntl_t* bli_cntl_sub_node( const cntl_t* cntl ) return cntl->sub_node; } -BLIS_INLINE void* bli_cntl_params( const cntl_t* cntl ) +BLIS_INLINE const void* bli_cntl_params( const cntl_t* cntl ) { return cntl->params; } -BLIS_INLINE uint64_t bli_cntl_params_size( const cntl_t* cntl ) -{ - // The first 64 bytes is always the size of the params structure. - return *( ( uint64_t* )(cntl->params) ); -} - -BLIS_INLINE mem_t* bli_cntl_pack_mem( cntl_t* cntl ) -{ - return &(cntl->pack_mem); -} - // cntl_t query (complex) BLIS_INLINE bool bli_cntl_is_null( const cntl_t* cntl ) @@ -185,7 +130,7 @@ BLIS_INLINE bool bli_cntl_is_leaf( const cntl_t* cntl ) BLIS_INLINE bool bli_cntl_does_part( const cntl_t* cntl ) { return ( bool ) - ( bli_cntl_bszid( cntl ) != BLIS_NO_PART ); + ( bli_cntl_part( cntl ) != BLIS_NO_PART ); } // cntl_t modification @@ -195,14 +140,14 @@ BLIS_INLINE void bli_cntl_set_family( opid_t family, cntl_t* cntl ) cntl->family = family; } -BLIS_INLINE void bli_cntl_set_bszid( bszid_t bszid, cntl_t* cntl ) +BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) { - cntl->bszid = bszid; + cntl->var_func = var_func; } -BLIS_INLINE void bli_cntl_set_var_func( void_fp var_func, cntl_t* cntl ) +BLIS_INLINE void bli_cntl_set_part( bszid_t bszid, cntl_t* cntl ) { - cntl->var_func = var_func; + cntl->bszid = bszid; } BLIS_INLINE void bli_cntl_set_sub_prenode( cntl_t* sub_prenode, cntl_t* cntl ) @@ -215,13 +160,8 @@ BLIS_INLINE void bli_cntl_set_sub_node( cntl_t* sub_node, cntl_t* cntl ) cntl->sub_node = sub_node; } -BLIS_INLINE void bli_cntl_set_params( void* params, cntl_t* cntl ) +BLIS_INLINE void bli_cntl_set_params( const 
void* params, cntl_t* cntl ) { cntl->params = params; } -BLIS_INLINE void bli_cntl_set_pack_mem( mem_t* pack_mem, cntl_t* cntl ) -{ - cntl->pack_mem = *pack_mem; -} - diff --git a/frame/base/bli_cntx.c b/frame/base/bli_cntx.c index 5a886a128f..5952cbc0ac 100644 --- a/frame/base/bli_cntx.c +++ b/frame/base/bli_cntx.c @@ -56,9 +56,9 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... ) void bli_cntx_set_blkszs ( cntx_t* cntx, - bszid_t bs0_id, blksz_t* blksz0, bszid_t bm0_id, - bszid_t bs1_id, blksz_t* blksz1, bszid_t bm1_id, - bszid_t bs2_id, blksz_t* blksz2, bszid_t bm2_id, + bszid_t bs0_id, blksz_t* blksz0, + bszid_t bs1_id, blksz_t* blksz1, + bszid_t bs2_id, blksz_t* blksz2, ..., BLIS_VA_END ); @@ -71,7 +71,6 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... ) // - the blocksize object array // - the blocksize multiple array blksz_t* cntx_blkszs = cntx->blkszs; - bszid_t* cntx_bmults = cntx->bmults; // Initialize variable argument environment. va_list args; @@ -92,7 +91,6 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... ) // the blksz_t object. bszid_t bs_id = ( bszid_t )bs_id0; blksz_t* blksz = ( blksz_t* )va_arg( args, blksz_t* ); - bszid_t bm_id = ( bszid_t )va_arg( args, bszid_t ); // Copy the blksz_t object contents into the appropriate // location within the context's blksz_t array. Do the same @@ -101,9 +99,6 @@ void bli_cntx_set_blkszs( cntx_t* cntx, ... ) //bli_blksz_copy( blksz, cntx_blksz ); blksz_t* cntx_blksz = &cntx_blkszs[ bs_id ]; bli_blksz_copy_if_pos( blksz, cntx_blksz ); - - // Copy the blocksize multiple id into the context. - cntx_bmults[ bs_id ] = bm_id; } // Shutdown variable argument environment and clean up stack. 
diff --git a/frame/base/bli_cntx.h b/frame/base/bli_cntx.h index 90050a5ed9..354dfc7d40 100644 --- a/frame/base/bli_cntx.h +++ b/frame/base/bli_cntx.h @@ -58,28 +58,6 @@ typedef struct cntx_s // ----------------------------------------------------------------------------- -// -// -- cntx_t query (fields only) ----------------------------------------------- -// - -BLIS_INLINE ind_t bli_cntx_method( const cntx_t* cntx ) -{ - return cntx->method; -} - -// ----------------------------------------------------------------------------- - -// -// -- cntx_t modification (fields only) ---------------------------------------- -// - -BLIS_INLINE void bli_cntx_set_method( ind_t method, cntx_t* cntx ) -{ - cntx->method = method; -} - -// ----------------------------------------------------------------------------- - // // -- cntx_t query (complex) --------------------------------------------------- // @@ -108,27 +86,6 @@ BLIS_INLINE dim_t bli_cntx_get_blksz_max_dt( num_t dt, bszid_t bs_id, const cntx return bs_dt; } -BLIS_INLINE bszid_t bli_cntx_get_bmult_id( bszid_t bs_id, const cntx_t* cntx ) -{ - return cntx->bmults[ bs_id ]; -} - -BLIS_INLINE const blksz_t* bli_cntx_get_bmult( bszid_t bs_id, const cntx_t* cntx ) -{ - bszid_t bm_id = bli_cntx_get_bmult_id( bs_id, cntx ); - const blksz_t* bmult = bli_cntx_get_blksz( bm_id, cntx ); - - return bmult; -} - -BLIS_INLINE dim_t bli_cntx_get_bmult_dt( num_t dt, bszid_t bs_id, const cntx_t* cntx ) -{ - const blksz_t* bmult = bli_cntx_get_bmult( bs_id, cntx ); - dim_t bm_dt = bli_blksz_get_def( dt, bmult ); - - return bm_dt; -} - // ----------------------------------------------------------------------------- BLIS_INLINE const func_t* bli_cntx_get_ukrs( ukr_t ukr_id, const cntx_t* cntx ) @@ -143,21 +100,6 @@ BLIS_INLINE void_fp bli_cntx_get_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* c return bli_func_get_dt( dt, func ); } -BLIS_INLINE void_fp bli_cntx_get_l3_vir_ukr_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) -{ - switch ( 
ukr_id ) - { - case BLIS_GEMM_UKR: ukr_id = BLIS_GEMM_VIR_UKR; break; - case BLIS_TRSM_L_UKR: ukr_id = BLIS_TRSM_L_VIR_UKR; break; - case BLIS_TRSM_U_UKR: ukr_id = BLIS_TRSM_U_VIR_UKR; break; - case BLIS_GEMMTRSM_L_UKR: ukr_id = BLIS_GEMMTRSM_L_VIR_UKR; break; - case BLIS_GEMMTRSM_U_UKR: ukr_id = BLIS_GEMMTRSM_U_VIR_UKR; break; - default: break; - }; - - return bli_cntx_get_ukr_dt( dt, ukr_id, cntx ); -} - // ----------------------------------------------------------------------------- BLIS_INLINE const mbool_t* bli_cntx_get_ukr_prefs( ukr_pref_t pref_id, const cntx_t* cntx ) @@ -192,77 +134,6 @@ BLIS_INLINE void_fp bli_cntx_get_l3_sup_handler( opid_t op, const cntx_t* cntx ) // ----------------------------------------------------------------------------- -BLIS_INLINE bool bli_cntx_ukr_prefers_rows_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) -{ - // This initial value will get overwritten during the switch statement below. - ukr_pref_t ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; - - // Get the correct preference from the kernel ID. 
- switch ( ukr_id ) - { - case BLIS_GEMM_VIR_UKR: // fallthrough - case BLIS_GEMM_UKR: ukr_pref_id = BLIS_GEMM_UKR_ROW_PREF; break; - case BLIS_TRSM_L_VIR_UKR: // fallthrough - case BLIS_TRSM_L_UKR: ukr_pref_id = BLIS_TRSM_L_UKR_ROW_PREF; break; - case BLIS_TRSM_U_VIR_UKR: // fallthrough - case BLIS_TRSM_U_UKR: ukr_pref_id = BLIS_TRSM_U_UKR_ROW_PREF; break; - case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough - case BLIS_GEMMTRSM_L_UKR: ukr_pref_id = BLIS_GEMMTRSM_L_UKR_ROW_PREF; break; - case BLIS_GEMMTRSM_U_VIR_UKR: // fallthrough - case BLIS_GEMMTRSM_U_UKR: ukr_pref_id = BLIS_GEMMTRSM_U_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_RRR_UKR: ukr_pref_id = BLIS_GEMMSUP_RRR_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_RRC_UKR: ukr_pref_id = BLIS_GEMMSUP_RRC_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_RCR_UKR: ukr_pref_id = BLIS_GEMMSUP_RCR_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_RCC_UKR: ukr_pref_id = BLIS_GEMMSUP_RCC_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_CRR_UKR: ukr_pref_id = BLIS_GEMMSUP_CRR_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_CRC_UKR: ukr_pref_id = BLIS_GEMMSUP_CRC_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_CCR_UKR: ukr_pref_id = BLIS_GEMMSUP_CCR_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_CCC_UKR: ukr_pref_id = BLIS_GEMMSUP_CCC_UKR_ROW_PREF; break; - case BLIS_GEMMSUP_XXX_UKR: ukr_pref_id = BLIS_GEMMSUP_XXX_UKR_ROW_PREF; break; - default: break; // TODO: should be an error condition - } - - // For virtual ukernels during non-native execution, use the real projection of - // the datatype. 
- if ( bli_cntx_method( cntx ) != BLIS_NAT ) - { - switch ( ukr_id ) - { - case BLIS_GEMM_VIR_UKR: // fallthrough - case BLIS_TRSM_L_VIR_UKR: // fallthrough - case BLIS_TRSM_U_VIR_UKR: // fallthrough - case BLIS_GEMMTRSM_L_VIR_UKR: // fallthrough - case BLIS_GEMMTRSM_U_VIR_UKR: dt = bli_dt_proj_to_real( dt ); break; - default: break; - } - } - - return bli_cntx_get_ukr_prefs_dt( dt, ukr_pref_id, cntx ); -} - -BLIS_INLINE bool bli_cntx_ukr_prefers_cols_dt( num_t dt, ukr_t ukr_id, const cntx_t* cntx ) -{ - return ! bli_cntx_ukr_prefers_rows_dt( dt, ukr_id, cntx ); -} - -BLIS_INLINE bool bli_cntx_prefers_storage_of( const obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) -{ - const bool ukr_prefers_rows - = bli_cntx_ukr_prefers_rows_dt( bli_obj_dt( obj ), ukr_id, cntx ); - - if ( bli_obj_is_row_stored( obj ) && ukr_prefers_rows ) return TRUE; - else if ( bli_obj_is_col_stored( obj ) && !ukr_prefers_rows ) return TRUE; - - return FALSE; -} - -BLIS_INLINE bool bli_cntx_dislikes_storage_of( const obj_t* obj, ukr_t ukr_id, const cntx_t* cntx ) -{ - return ! bli_cntx_prefers_storage_of( obj, ukr_id, cntx ); -} - -// ----------------------------------------------------------------------------- - // // -- cntx_t modification (complex) -------------------------------------------- // @@ -270,10 +141,9 @@ BLIS_INLINE bool bli_cntx_dislikes_storage_of( const obj_t* obj, ukr_t ukr_id, c // NOTE: The framework does not use any of the following functions. We provide // them in order to facilitate creating/modifying custom contexts. 
-BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, bszid_t mult_id, cntx_t* cntx ) +BLIS_INLINE void bli_cntx_set_blksz( bszid_t bs_id, blksz_t* blksz, cntx_t* cntx ) { cntx->blkszs[ bs_id ] = *blksz; - cntx->bmults[ bs_id ] = mult_id; } BLIS_INLINE void bli_cntx_set_blksz_def_dt( num_t dt, bszid_t bs_id, dim_t bs, cntx_t* cntx ) diff --git a/frame/base/bli_gks.c b/frame/base/bli_gks.c index ff4578bc5f..3e0af1eef7 100644 --- a/frame/base/bli_gks.c +++ b/frame/base/bli_gks.c @@ -37,20 +37,15 @@ // The array of cntx_t* pointers to cache modified contexts used by // induced methods. -static cntx_t** gks[ BLIS_NUM_ARCHS ]; - -// The array of function pointers holding the registered context initialization -// functions for induced methods. -static void_fp cntx_ind_init[ BLIS_NUM_ARCHS ]; +static cntx_t* gks[ BLIS_NUM_ARCHS ]; // The array of function pointers holding the registered context initialization // functions for reference kernels. -static void_fp cntx_ref_init[ BLIS_NUM_ARCHS ]; +static void_fp cntx_ref_init[ BLIS_NUM_ARCHS ]; // Define a function pointer type for context initialization functions. 
typedef void (*nat_cntx_init_ft)( cntx_t* cntx ); typedef void (*ref_cntx_init_ft)( cntx_t* cntx ); -typedef void (*ind_cntx_init_ft)( ind_t method, cntx_t* cntx ); // ----------------------------------------------------------------------------- @@ -67,146 +62,119 @@ void bli_gks_init( void ) // Intel architectures #ifdef BLIS_CONFIG_SKX bli_gks_register_cntx( BLIS_ARCH_SKX, bli_cntx_init_skx, - bli_cntx_init_skx_ref, - bli_cntx_init_skx_ind ); + bli_cntx_init_skx_ref ); #endif #ifdef BLIS_CONFIG_KNL bli_gks_register_cntx( BLIS_ARCH_KNL, bli_cntx_init_knl, - bli_cntx_init_knl_ref, - bli_cntx_init_knl_ind ); + bli_cntx_init_knl_ref ); #endif #ifdef BLIS_CONFIG_KNC bli_gks_register_cntx( BLIS_ARCH_KNC, bli_cntx_init_knc, - bli_cntx_init_knc_ref, - bli_cntx_init_knc_ind ); + bli_cntx_init_knc_ref ); #endif #ifdef BLIS_CONFIG_HASWELL bli_gks_register_cntx( BLIS_ARCH_HASWELL, bli_cntx_init_haswell, - bli_cntx_init_haswell_ref, - bli_cntx_init_haswell_ind ); + bli_cntx_init_haswell_ref ); #endif #ifdef BLIS_CONFIG_SANDYBRIDGE bli_gks_register_cntx( BLIS_ARCH_SANDYBRIDGE, bli_cntx_init_sandybridge, - bli_cntx_init_sandybridge_ref, - bli_cntx_init_sandybridge_ind ); + bli_cntx_init_sandybridge_ref ); #endif #ifdef BLIS_CONFIG_PENRYN bli_gks_register_cntx( BLIS_ARCH_PENRYN, bli_cntx_init_penryn, - bli_cntx_init_penryn_ref, - bli_cntx_init_penryn_ind ); + bli_cntx_init_penryn_ref ); #endif // AMD architectures #ifdef BLIS_CONFIG_ZEN3 bli_gks_register_cntx( BLIS_ARCH_ZEN3, bli_cntx_init_zen3, - bli_cntx_init_zen3_ref, - bli_cntx_init_zen3_ind ); + bli_cntx_init_zen3_ref ); #endif #ifdef BLIS_CONFIG_ZEN2 bli_gks_register_cntx( BLIS_ARCH_ZEN2, bli_cntx_init_zen2, - bli_cntx_init_zen2_ref, - bli_cntx_init_zen2_ind ); + bli_cntx_init_zen2_ref ); #endif #ifdef BLIS_CONFIG_ZEN bli_gks_register_cntx( BLIS_ARCH_ZEN, bli_cntx_init_zen, - bli_cntx_init_zen_ref, - bli_cntx_init_zen_ind ); + bli_cntx_init_zen_ref ); #endif #ifdef BLIS_CONFIG_EXCAVATOR bli_gks_register_cntx( 
BLIS_ARCH_EXCAVATOR, bli_cntx_init_excavator, - bli_cntx_init_excavator_ref, - bli_cntx_init_excavator_ind ); + bli_cntx_init_excavator_ref ); #endif #ifdef BLIS_CONFIG_STEAMROLLER bli_gks_register_cntx( BLIS_ARCH_STEAMROLLER, bli_cntx_init_steamroller, - bli_cntx_init_steamroller_ref, - bli_cntx_init_steamroller_ind ); + bli_cntx_init_steamroller_ref ); #endif #ifdef BLIS_CONFIG_PILEDRIVER bli_gks_register_cntx( BLIS_ARCH_PILEDRIVER, bli_cntx_init_piledriver, - bli_cntx_init_piledriver_ref, - bli_cntx_init_piledriver_ind ); + bli_cntx_init_piledriver_ref ); #endif #ifdef BLIS_CONFIG_BULLDOZER bli_gks_register_cntx( BLIS_ARCH_BULLDOZER, bli_cntx_init_bulldozer, - bli_cntx_init_bulldozer_ref, - bli_cntx_init_bulldozer_ind ); + bli_cntx_init_bulldozer_ref ); #endif // ARM architectures #ifdef BLIS_CONFIG_A64FX - bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx, - bli_cntx_init_a64fx_ref, - bli_cntx_init_a64fx_ind ); + bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx, + bli_cntx_init_a64fx_ref ); #endif #ifdef BLIS_CONFIG_THUNDERX2 bli_gks_register_cntx( BLIS_ARCH_THUNDERX2, bli_cntx_init_thunderx2, - bli_cntx_init_thunderx2_ref, - bli_cntx_init_thunderx2_ind ); + bli_cntx_init_thunderx2_ref ); #endif #ifdef BLIS_CONFIG_CORTEXA57 bli_gks_register_cntx( BLIS_ARCH_CORTEXA57, bli_cntx_init_cortexa57, - bli_cntx_init_cortexa57_ref, - bli_cntx_init_cortexa57_ind ); + bli_cntx_init_cortexa57_ref ); #endif #ifdef BLIS_CONFIG_CORTEXA53 bli_gks_register_cntx( BLIS_ARCH_CORTEXA53, bli_cntx_init_cortexa53, - bli_cntx_init_cortexa53_ref, - bli_cntx_init_cortexa53_ind ); + bli_cntx_init_cortexa53_ref ); #endif #ifdef BLIS_CONFIG_ARMSVE bli_gks_register_cntx( BLIS_ARCH_ARMSVE, bli_cntx_init_armsve, - bli_cntx_init_armsve_ref, - bli_cntx_init_armsve_ind ); + bli_cntx_init_armsve_ref ); #endif #ifdef BLIS_CONFIG_A64FX bli_gks_register_cntx( BLIS_ARCH_A64FX, bli_cntx_init_a64fx, - bli_cntx_init_a64fx_ref, - bli_cntx_init_a64fx_ind ); + bli_cntx_init_a64fx_ref ); 
#endif #ifdef BLIS_CONFIG_FIRESTORM bli_gks_register_cntx( BLIS_ARCH_FIRESTORM, bli_cntx_init_firestorm, - bli_cntx_init_firestorm_ref, - bli_cntx_init_firestorm_ind ); + bli_cntx_init_firestorm_ref ); #endif #ifdef BLIS_CONFIG_CORTEXA15 bli_gks_register_cntx( BLIS_ARCH_CORTEXA15, bli_cntx_init_cortexa15, - bli_cntx_init_cortexa15_ref, - bli_cntx_init_cortexa15_ind ); + bli_cntx_init_cortexa15_ref ); #endif #ifdef BLIS_CONFIG_CORTEXA9 bli_gks_register_cntx( BLIS_ARCH_CORTEXA9, bli_cntx_init_cortexa9, - bli_cntx_init_cortexa9_ref, - bli_cntx_init_cortexa9_ind ); + bli_cntx_init_cortexa9_ref ); #endif // IBM architectures #ifdef BLIS_CONFIG_POWER10 bli_gks_register_cntx( BLIS_ARCH_POWER10, bli_cntx_init_power10, - bli_cntx_init_power10_ref, - bli_cntx_init_power10_ind ); + bli_cntx_init_power10_ref ); #endif #ifdef BLIS_CONFIG_POWER9 bli_gks_register_cntx( BLIS_ARCH_POWER9, bli_cntx_init_power9, - bli_cntx_init_power9_ref, - bli_cntx_init_power9_ind ); + bli_cntx_init_power9_ref ); #endif #ifdef BLIS_CONFIG_POWER7 bli_gks_register_cntx( BLIS_ARCH_POWER7, bli_cntx_init_power7, - bli_cntx_init_power7_ref, - bli_cntx_init_power7_ind ); + bli_cntx_init_power7_ref ); #endif #ifdef BLIS_CONFIG_BGQ bli_gks_register_cntx( BLIS_ARCH_BGQ, bli_cntx_init_bgq, - bli_cntx_init_bgq_ref, - bli_cntx_init_bgq_ind ); + bli_cntx_init_bgq_ref ); #endif // Generic architectures #ifdef BLIS_CONFIG_GENERIC bli_gks_register_cntx( BLIS_ARCH_GENERIC, bli_cntx_init_generic, - bli_cntx_init_generic_ref, - bli_cntx_init_generic_ind ); + bli_cntx_init_generic_ref ); #endif } } @@ -226,34 +194,17 @@ void bli_gks_finalize( void ) // Iterate over the architectures in the gks array. for ( id = 0; id < BLIS_NUM_ARCHS; ++id ) { - cntx_t** gks_id = gks[ id ]; + cntx_t* gks_id = gks[ id ]; // Only consider context arrays for architectures that were allocated // in the first place. if ( gks_id != NULL ) { - // Iterate over the induced methods in the current sub-array - // referenced by cntx_pp. 
- for ( ind = 0; ind < BLIS_NUM_IND_METHODS; ++ind ) - { - cntx_t* gks_id_ind = gks_id[ ind ]; - - // If the current context was allocated, free it. - if ( gks_id_ind != NULL ) - { - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_gks_finalize(): cntx for ind_t %d: ", ( int )ind ); - #endif - - bli_free_intl( gks_id_ind ); - } - } - #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_gks_finalize(): gks for arch_t %d: ", ( int )id ); #endif - // Free the array of BLIS_NUM_IND_METHODS cntx* elements. + // Free the context. bli_free_intl( gks_id ); } } @@ -277,7 +228,6 @@ void bli_gks_init_index( void ) // allocated. memset( gks, 0, gks_size ); memset( cntx_ref_init, 0, fpa_size ); - memset( cntx_ind_init, 0, fpa_size ); } // ----------------------------------------------------------------------------- @@ -290,56 +240,8 @@ const cntx_t* bli_gks_lookup_nat_cntx // Return the address of the (native) context for a given architecture id. // This function assumes the architecture has already been registered. - return bli_gks_lookup_ind_cntx( id, BLIS_NAT ); -} - -// ----------------------------------------------------------------------------- - -const cntx_t* bli_gks_lookup_ind_cntx - ( - arch_t id, - ind_t ind - ) -{ - // Return the address of the context for a given architecture id and - // induced method. This function assumes the architecture has already - // been registered. Note that this function returns NULL if the induced - // method hasn't yet been called (and thus its context pointer is still - // NULL). - - // Sanity check: verify that the arch_t id is valid. - if ( bli_error_checking_is_enabled() ) - { - err_t e_val = bli_check_valid_arch_id( id ); - bli_check_error_code( e_val ); - } - - // Index into the array of context pointers for the given architecture id, - // and then index into the subarray for the given induced method. - cntx_t** gks_id = gks[ id ]; - cntx_t* gks_id_ind = gks_id[ ind ]; - - // Return the context pointer at gks_id_ind. 
- return gks_id_ind; -} - -// ----------------------------------------------------------------------------- - -const cntx_t* const * bli_gks_lookup_id - ( - arch_t id - ) -{ - // Return the address of the array of context pointers for a given - // architecture id. This function is only used for sanity check purposes - // to ensure that the underlying data structures for a particular id are - // initialized. - // Index into the array of context pointers for the given architecture id. - cntx_t** gks_id = gks[ id ]; - - // Return the context pointer at gks_id_ind. - return ( const cntx_t* const * )gks_id; + return gks[ id ]; } // ----------------------------------------------------------------------------- @@ -348,8 +250,7 @@ void bli_gks_register_cntx ( arch_t id, void_fp nat_fp, - void_fp ref_fp, - void_fp ind_fp + void_fp ref_fp ) { err_t r_val; @@ -384,9 +285,8 @@ void bli_gks_register_cntx // latter will be used later on if the user calls a level-3 function // with induced execution enabled. cntx_ref_init[ id ] = ref_fp; - cntx_ind_init[ id ] = ind_fp; - // If the the context array pointer isn't NULL, then it means the given + // If the the context pointer isn't NULL, then it means the given // architecture id has already registered (and the underlying memory // allocations and context initializations have already been performed). // This is really just a safety feature to prevent memory leaks; this @@ -398,26 +298,13 @@ void bli_gks_register_cntx printf( "bli_gks_register_cntx(): " ); #endif - // At this point, we know the pointer to the array of cntx_t* is NULL and - // needs to be allocated. Allocate the memory and initialize it to - // zeros/NULL, storing the address of the alloacted memory at the element - // for the current architecture id. - gks[ id ] = bli_calloc_intl( sizeof( cntx_t* ) * BLIS_NUM_IND_METHODS, &r_val ); - - // Alias the allocated array for readability. 
- cntx_t** gks_id = gks[ id ]; - - #ifdef BLIS_ENABLE_MEM_TRACING - printf( "bli_gks_register_cntx(): " ); - #endif - // Allocate memory for a single context and store the address at // the element in the gks[ id ] array that is reserved for native // execution. - gks_id[ BLIS_NAT ] = bli_calloc_intl( sizeof( cntx_t ), &r_val ); + gks[ id ] = bli_calloc_intl( sizeof( cntx_t ), &r_val ); // Alias the allocated context address for readability. - cntx_t* gks_id_nat = gks_id[ BLIS_NAT ]; + cntx_t* gks_id_nat = gks[ id ]; // Call the context initialization function on the element of the newly // allocated array corresponding to native execution. @@ -464,11 +351,6 @@ void bli_gks_register_cntx // ----------------------------------------------------------------------------- const cntx_t* bli_gks_query_cntx( void ) -{ - return bli_gks_query_nat_cntx(); -} - -const cntx_t* bli_gks_query_nat_cntx( void ) { bli_init_once(); @@ -507,104 +389,6 @@ const cntx_t* bli_gks_query_cntx_noinit( void ) // with a new entry corresponding to a context for an ind_t value. static bli_pthread_mutex_t gks_mutex = BLIS_PTHREAD_MUTEX_INITIALIZER; -const cntx_t* bli_gks_query_ind_cntx - ( - ind_t ind, - num_t dt - ) -{ - bli_init_once(); - - cntx_t* gks_id_ind; - err_t r_val; - - // Return the address of a context that will be suited for executing a - // level-3 operation via the requested induced method (and datatype) for - // the architecture id corresponding to the current hardware, as - // determined by bli_arch_query_id(). - - // This function is called when a level-3 operation via induced method is - // called, e.g. bli_gemm1m(). If this is the first time that induced method - // is being executed since bli_gks_init(), the necessary context structure - // is allocated and initialized. If this is not the first time, then the - // address of a previously-allocated and initialized (cached) context is - // returned. 
Note that much of this must be done with mutual exclusion to - // ensure thread safety and deterministic behavior. - - // Query the architecture id. - arch_t id = bli_arch_query_id(); - - // Sanity check: verify that the arch_t id is valid. - if ( bli_error_checking_is_enabled() ) - { - err_t e_val = bli_check_valid_arch_id( id ); - bli_check_error_code( e_val ); - } - - // NOTE: These initial statements can reside outside of the critical section - // because gks[ id ] should have already been allocated, and the native - // context in that array should have already been allocated/initialized. - - // Query the gks for the array of context pointers corresponding to the - // given architecture id. - cntx_t** gks_id = gks[ id ]; - cntx_t* gks_id_nat = gks_id[ BLIS_NAT ]; - - // If for some reason the native context was requested, we can return - // its address early. - if ( ind == BLIS_NAT ) return gks_id_nat; - - // This function assumes that the architecture idenified by id has - // already been registered with the gks (which guarantees that - // gks[ id ] is non-NULL and gks[ id ][ BLIS_NAT ] is also non-NULL - // and refers to a context initialized with valid data). - - // Acquire the mutex protecting the gks. - bli_pthread_mutex_lock( &gks_mutex ); - - // BEGIN CRITICAL SECTION - { - // Alias for readability the element of gks_id associated with the - // requested induced method. - gks_id_ind = gks_id[ ind ]; - - // If the context pointer is NULL, then we know we must allocate and - // then initialize the context before returning its address. - if ( gks_id_ind == NULL ) - { - // If gks_id_ind is NULL, then we know we must allocate and then - // initialize the context, storing its address back to - // gks_id[ ind ]. 
- gks_id_ind = bli_calloc_intl( sizeof( cntx_t ), &r_val ); - gks_id[ ind ] = gks_id_ind; - - // Before we can call the induced method context initialization - // function on the newly allocated structure, we must first copy - // over the contents of the native context. - *gks_id_ind = *gks_id_nat; - - // Use the architecture id to look up the function pointer to the - // context initialization function for induced methods. - ind_cntx_init_ft f = cntx_ind_init[ id ]; - - // Now we modify the context (so that it contains the proper values - // for its induced method) by calling the context initialization - // function for the current induced method. (That function assumes - // that the context is pre- initialized with values for native - // execution.) - f( ind, gks_id_ind ); - } - } - // END CRITICAL SECTION - - // Release the mutex protecting the gks. - bli_pthread_mutex_unlock( &gks_mutex ); - - // Return the address of the newly-allocated/initialized context. - return gks_id_ind; - -} - // ----------------------------------------------------------------------------- void bli_gks_init_ref_cntx @@ -675,7 +459,7 @@ const char* bli_gks_l3_ukr_impl_string( ukr_t ukr, ind_t method, num_t dt ) // Query the context for the current induced method and datatype, and // then query the ukernel function pointer for the given datatype from // that context. - const cntx_t* cntx = bli_gks_query_ind_cntx( method, dt ); + const cntx_t* cntx = bli_gks_query_cntx(); void_fp fp = bli_cntx_get_ukr_dt( dt, ukr, cntx ); // Check whether the ukernel function pointer is NULL for the given @@ -731,20 +515,10 @@ kimpl_t bli_gks_l3_ukr_impl_type( ukr_t ukr, ind_t method, num_t dt ) // method to the typed function pointer within the known // reference ukrs object. - // Query the architecture id. - arch_t id = bli_arch_query_id(); - - // Sanity check: verify that the arch_t id is valid. 
- if ( bli_error_checking_is_enabled() ) - { - err_t e_val = bli_check_valid_arch_id( id ); - bli_check_error_code( e_val ); - } - - // Query the native context from the gks. - const cntx_t* nat_cntx = bli_gks_lookup_nat_cntx( id ); + // Query the context from the gks. + const cntx_t* cntx = bli_gks_query_cntx(); - if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, nat_cntx ) ) + if ( bli_gks_cntx_l3_nat_ukr_is_ref( dt, ukr, cntx ) ) return BLIS_REFERENCE_UKERNEL; else return BLIS_OPTIMIZED_UKERNEL; diff --git a/frame/base/bli_gks.h b/frame/base/bli_gks.h index 4a5c519880..073dd5e56d 100644 --- a/frame/base/bli_gks.h +++ b/frame/base/bli_gks.h @@ -41,17 +41,12 @@ void bli_gks_finalize( void ); void bli_gks_init_index( void ); const cntx_t* bli_gks_lookup_nat_cntx( arch_t id ); -const cntx_t* bli_gks_lookup_ind_cntx( arch_t id, ind_t ind ); -const cntx_t* const * bli_gks_lookup_id( arch_t id ); -void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp, void_fp ind_fp ); +void bli_gks_register_cntx( arch_t id, void_fp nat_fp, void_fp ref_fp ); BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_cntx( void ); -BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_nat_cntx( void ); const cntx_t* bli_gks_query_cntx_noinit( void ); -BLIS_EXPORT_BLIS const cntx_t* bli_gks_query_ind_cntx( ind_t ind, num_t dt ); - BLIS_EXPORT_BLIS void bli_gks_init_ref_cntx( cntx_t* cntx ); bool bli_gks_cntx_l3_nat_ukr_is_ref( num_t dt, ukr_t ukr_id, const cntx_t* cntx ); diff --git a/frame/base/bli_obj.c b/frame/base/bli_obj.c index cd0b6ac985..775af6117e 100644 --- a/frame/base/bli_obj.c +++ b/frame/base/bli_obj.c @@ -33,6 +33,7 @@ */ +#include "bli_type_defs.h" #include "blis.h" void bli_obj_create @@ -580,6 +581,27 @@ dim_t bli_align_dim_to_mult return dim; } +void bli_align_blksz_to_mult + ( + blksz_t* dim, + const blksz_t* dim_mult + ) +{ + for ( int i = BLIS_DT_LO; i <= BLIS_DT_HI; i++ ) + { + num_t dt = i; + dim_t dim_dt = bli_blksz_get_def( dt, dim ); + dim_t dim_max_dt = 
bli_blksz_get_max( dt, dim ); + dim_t dim_mult_dt = bli_blksz_get_def( dt, dim_mult ); + + dim_dt = bli_align_dim_to_mult( dim_dt, dim_mult_dt ); + dim_max_dt = bli_align_dim_to_mult( dim_max_dt, dim_mult_dt ); + + bli_blksz_set_def( dim_dt, dt, dim ); + bli_blksz_set_max( dim_max_dt, dt, dim ); + } +} + dim_t bli_align_dim_to_size ( dim_t dim, diff --git a/frame/base/bli_obj.h b/frame/base/bli_obj.h index a446c09c81..9d30ead1b3 100644 --- a/frame/base/bli_obj.h +++ b/frame/base/bli_obj.h @@ -130,6 +130,12 @@ BLIS_EXPORT_BLIS dim_t bli_align_dim_to_mult dim_t dim_mult ); +BLIS_EXPORT_BLIS void bli_align_blksz_to_mult + ( + blksz_t* dim, + const blksz_t* dim_mult + ); + BLIS_EXPORT_BLIS dim_t bli_align_dim_to_size ( dim_t dim, diff --git a/frame/base/bli_obj_scalar.c b/frame/base/bli_obj_scalar.c index 5c6ef8f94a..8f77d1b4f0 100644 --- a/frame/base/bli_obj_scalar.c +++ b/frame/base/bli_obj_scalar.c @@ -103,6 +103,7 @@ void bli_obj_scalar_detach void bli_obj_scalar_attach ( + num_t dt_targ, conj_t conj, const obj_t* alpha, obj_t* a @@ -110,10 +111,6 @@ void bli_obj_scalar_attach { obj_t alpha_cast; - // Use the target datatype of A as the datatype to which we cast - // alpha locally. - const num_t dt_targ = bli_obj_target_dt( a ); - // Make a copy-cast of alpha to the target datatype of A, queried // above. This step gives us the opportunity to conjugate and/or // typecast alpha. 
diff --git a/frame/base/bli_obj_scalar.h b/frame/base/bli_obj_scalar.h index 23bf573c67..9c9cd26dd6 100644 --- a/frame/base/bli_obj_scalar.h +++ b/frame/base/bli_obj_scalar.h @@ -54,6 +54,7 @@ BLIS_EXPORT_BLIS void bli_obj_scalar_detach BLIS_EXPORT_BLIS void bli_obj_scalar_attach ( + num_t dt_targ, conj_t conj, const obj_t* alpha, obj_t* a diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index 0a8497d186..6cd9568473 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -92,7 +92,7 @@ void bli_pba_finalize void bli_pba_acquire_m ( - rntm_t* rntm, + pba_t* pba, siz_t req_size, packbuf_t buf_type, mem_t* mem @@ -115,10 +115,6 @@ void bli_pba_acquire_m #endif #endif - // Query the memory broker from the runtime. - pba_t* pba = bli_rntm_pba( rntm ); - - if ( buf_type == BLIS_BUFFER_FOR_GEN_USE ) { malloc_ft malloc_fp = bli_pba_malloc_fp( pba ); @@ -197,17 +193,14 @@ void bli_pba_acquire_m void bli_pba_release ( - rntm_t* rntm, - mem_t* mem + pba_t* pba, + mem_t* mem ) { packbuf_t buf_type; pool_t* pool; pblk_t* pblk; - // Query the memory broker from the runtime. - pba_t* pba = bli_rntm_pba( rntm ); - // Extract the buffer type so we know what kind of memory was allocated. 
buf_type = bli_mem_buf_type( mem ); diff --git a/frame/base/bli_pba.h b/frame/base/bli_pba.h index 5cd95c2d44..c6cb8ad2ad 100644 --- a/frame/base/bli_pba.h +++ b/frame/base/bli_pba.h @@ -132,7 +132,7 @@ void bli_pba_finalize void bli_pba_acquire_m ( - rntm_t* rntm, + pba_t* pba, siz_t req_size, packbuf_t buf_type, mem_t* mem @@ -140,20 +140,10 @@ void bli_pba_acquire_m void bli_pba_release ( - rntm_t* rntm, - mem_t* mem + pba_t* pba, + mem_t* mem ); -BLIS_INLINE void bli_pba_rntm_set_pba - ( - rntm_t* rntm - ) -{ - pba_t* pba = bli_pba_query(); - - bli_rntm_set_pba( pba, rntm ); -} - siz_t bli_pba_pool_size ( const pba_t* pba, diff --git a/frame/1m/unpackm/bli_unpackm_cntl.h b/frame/base/bli_plugin.h similarity index 68% rename from frame/1m/unpackm/bli_unpackm_cntl.h rename to frame/base/bli_plugin.h index 5c41d94657..6e1b1dc843 100644 --- a/frame/1m/unpackm/bli_unpackm_cntl.h +++ b/frame/base/bli_plugin.h @@ -4,8 +4,7 @@ An object-based framework for developing high-performance BLAS-like libraries. - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. + Copyright (C) 2022, Southern Methodist University Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -33,24 +32,41 @@ */ -struct unpackm_params_s +#ifndef BLIS_PLUGIN_H +#define BLIS_PLUGIN_H + +typedef struct plugin_s { - uint64_t size; // size field must be present and come first. 
- unpackm_var_oft var_func; -}; -typedef struct unpackm_params_s unpackm_params_t; + blksz_t blkszs[ BLIS_NUM_BLKSZS ]; + bszid_t bmults[ BLIS_NUM_BLKSZS ]; + + func_t* ukrs[ BLIS_NUM_UKRS ]; + mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; + + void_fp l3_sup_handlers[ BLIS_NUM_ARCHS ]; + + ind_t method; + +} arch_func_t; + +// ----------------------------------------------------------------------------- + +// +// -- cntx_t query (complex) --------------------------------------------------- +// -#define bli_cntl_unpackm_params_var_func( cntl ) \ -\ - ( ( (unpackm_params_t*)(cntl)->params )->var_func ) +BLIS_INLINE const blksz_t* bli_cntx_get_blksz( bszid_t bs_id, const cntx_t* cntx ) +{ + // Return the address of the blksz_t identified by bs_id. + return &cntx->blkszs[ bs_id ]; +} // ----------------------------------------------------------------------------- -cntl_t* bli_unpackm_cntl_create_node - ( - rntm_t* rntm, - void_fp var_func, - void_fp unpackm_var_func, - cntl_t* sub_node - ); +// Function prototypes + +BLIS_EXPORT_BLIS void bli_cntx_clear( cntx_t* cntx ); + + +#endif diff --git a/frame/base/bli_rntm.h b/frame/base/bli_rntm.h index 2a39f8894c..b2f9fcb459 100644 --- a/frame/base/bli_rntm.h +++ b/frame/base/bli_rntm.h @@ -50,10 +50,6 @@ typedef struct rntm_s bool pack_a; bool pack_b; bool l3_sup; - - pool_t* sba_pool; - pba_t* pba; - } rntm_t; */ @@ -73,7 +69,7 @@ BLIS_INLINE dim_t bli_rntm_num_threads( const rntm_t* rntm ) BLIS_INLINE dim_t bli_rntm_ways_for( bszid_t bszid, const rntm_t* rntm ) { - return rntm->thrloop[ bszid ]; + return bszid == BLIS_NO_PART ? 
1 : rntm->thrloop[ bszid ]; } BLIS_INLINE dim_t bli_rntm_jc_ways( const rntm_t* rntm ) @@ -109,42 +105,15 @@ BLIS_INLINE bool bli_rntm_pack_b( const rntm_t* rntm ) { return ( bool )( rntm->pack_b ); } - BLIS_INLINE bool bli_rntm_l3_sup( const rntm_t* rntm ) { return rntm->l3_sup; } - -// -// -- rntm_t query (internal use only) ----------------------------------------- -// - -BLIS_INLINE pool_t* bli_rntm_sba_pool( const rntm_t* rntm ) +BLIS_INLINE bool bli_rntm_ind( ind_t im, const rntm_t* rntm ) { - return rntm->sba_pool; + return rntm->enable_ind[ im ]; } -BLIS_INLINE pba_t* bli_rntm_pba( const rntm_t* rntm ) -{ - return rntm->pba; -} - -#if 0 -BLIS_INLINE dim_t bli_rntm_equals( rntm_t* rntm1, rntm_t* rntm2 ) -{ - const bool nt = bli_rntm_num_threads( rntm1 ) == bli_rntm_num_threads( rntm2 ); - const bool jc = bli_rntm_jc_ways( rntm1 ) == bli_rntm_jc_ways( rntm2 ); - const bool pc = bli_rntm_pc_ways( rntm1 ) == bli_rntm_pc_ways( rntm2 ); - const bool ic = bli_rntm_ic_ways( rntm1 ) == bli_rntm_ic_ways( rntm2 ); - const bool jr = bli_rntm_jr_ways( rntm1 ) == bli_rntm_jr_ways( rntm2 ); - const bool ir = bli_rntm_ir_ways( rntm1 ) == bli_rntm_ir_ways( rntm2 ); - const bool pr = bli_rntm_pr_ways( rntm1 ) == bli_rntm_pr_ways( rntm2 ); - - if ( nt && jc && pc && ic && jr && ir && pr ) return TRUE; - else return FALSE; -} -#endif - // // -- rntm_t modification (internal use only) ---------------------------------- // @@ -200,16 +169,6 @@ BLIS_INLINE void bli_rntm_set_ways_only( dim_t jc, dim_t pc, dim_t ic, dim_t jr, bli_rntm_set_pr_ways_only( 1, rntm ); } -BLIS_INLINE void bli_rntm_set_sba_pool( pool_t* sba_pool, rntm_t* rntm ) -{ - rntm->sba_pool = sba_pool; -} - -BLIS_INLINE void bli_rntm_set_pba( pba_t* pba, rntm_t* rntm ) -{ - rntm->pba = pba; -} - BLIS_INLINE void bli_rntm_clear_num_threads_only( rntm_t* rntm ) { bli_rntm_set_num_threads_only( -1, rntm ); @@ -218,14 +177,6 @@ BLIS_INLINE void bli_rntm_clear_ways_only( rntm_t* rntm ) { bli_rntm_set_ways_only( -1, -1, 
-1, -1, -1, rntm ); } -BLIS_INLINE void bli_rntm_clear_sba_pool( rntm_t* rntm ) -{ - bli_rntm_set_sba_pool( NULL, rntm ); -} -BLIS_INLINE void bli_rntm_clear_pba( rntm_t* rntm ) -{ - bli_rntm_set_pba( NULL, rntm ); -} // // -- rntm_t modification (public API) ----------------------------------------- @@ -264,12 +215,17 @@ BLIS_INLINE void bli_rntm_set_pack_b( bool pack_b, rntm_t* rntm ) // Set the bool indicating whether matrix B should be packed. rntm->pack_b = pack_b; } - BLIS_INLINE void bli_rntm_set_l3_sup( bool l3_sup, rntm_t* rntm ) { // Set the bool indicating whether level-3 sup handling is enabled. rntm->l3_sup = l3_sup; } +BLIS_INLINE void bli_rntm_set_ind( bool enable_ind, ind_t im, rntm_t* rntm ) +{ + // Set the bools indicating whether induced methods are enabled + rntm->enable_ind[ im ] = enable_ind; +} + BLIS_INLINE void bli_rntm_enable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); @@ -278,6 +234,10 @@ BLIS_INLINE void bli_rntm_disable_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( FALSE, rntm ); } +BLIS_INLINE void bli_rntm_disable_ind( ind_t im, rntm_t* rntm ) +{ + bli_rntm_set_ind( FALSE, im, rntm ); +} // // -- rntm_t modification (internal use only) ---------------------------------- @@ -295,6 +255,11 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) { bli_rntm_set_l3_sup( TRUE, rntm ); } +BLIS_INLINE void bli_rntm_clear_ind( rntm_t* rntm ) +{ + bli_rntm_set_ind( TRUE, BLIS_NAT, rntm ); + bli_rntm_set_ind( TRUE, BLIS_1M, rntm ); +} // // -- rntm_t initialization ---------------------------------------------------- @@ -312,8 +277,7 @@ BLIS_INLINE void bli_rntm_clear_l3_sup( rntm_t* rntm ) .pack_a = FALSE, \ .pack_b = FALSE, \ .l3_sup = TRUE, \ - .sba_pool = NULL, \ - .pba = NULL, \ + .enable_ind = { TRUE, TRUE }, \ } \ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) @@ -325,9 +289,6 @@ BLIS_INLINE void bli_rntm_init( rntm_t* rntm ) bli_rntm_clear_pack_a( rntm ); bli_rntm_clear_pack_b( rntm ); bli_rntm_clear_l3_sup( rntm ); - - 
bli_rntm_clear_sba_pool( rntm ); - bli_rntm_clear_pba( rntm ); } // -- rntm_t total thread calculation ------------------------------------------ diff --git a/frame/base/bli_sba.c b/frame/base/bli_sba.c index 776622bb4a..0eaedbed5a 100644 --- a/frame/base/bli_sba.c +++ b/frame/base/bli_sba.c @@ -57,7 +57,7 @@ void bli_sba_finalize( void ) void* bli_sba_acquire ( - rntm_t* rntm, + pool_t* pool, siz_t req_size ) { @@ -65,50 +65,40 @@ void* bli_sba_acquire err_t r_val; #ifdef BLIS_ENABLE_SBA_POOLS - if ( rntm == NULL ) + pblk_t pblk; + + // We don't expect NULL sba_pool pointers in the normal course of BLIS + // operation. However, there are rare instances where it is convenient + // to support use of bli_sba_acquire() without having to pass in a valid + // sba pool data structure. The case that inspired this branch was the + // gemm_ukr and related test modules in the BLIS testsuite. (There, it + // is convenient to not have to checkout an array_t from the sba, and it + // does no harm since the malloc() happens outside of the region that + // would be timed.) + if ( pool == NULL ) { - block = bli_malloc_intl( req_size, &r_val ); + block = bli_malloc_intl( req_size, &r_val ); } else { - pblk_t pblk; - - // Query the small block pool from the rntm. - pool_t* pool = bli_rntm_sba_pool( rntm ); - - // We don't expect NULL sba_pool pointers in the normal course of BLIS - // operation. However, there are rare instances where it is convenient - // to support use of bli_sba_acquire() without having to pass in a valid - // sba pool data structure. The case that inspired this branch was the - // gemm_ukr and related test modules in the BLIS testsuite. (There, it - // is convenient to not have to checkout an array_t from the sba, and it - // does no harm since the malloc() happens outside of the region that - // would be timed.) 
- if ( pool == NULL ) - { - block = bli_malloc_intl( req_size, &r_val ); - } - else + // Query the block_size of the pool_t so that we can request the exact + // size present. + const siz_t block_size = bli_pool_block_size( pool ); + + // Sanity check: Make sure the requested size is no larger than the + // block_size field of the pool. + if ( block_size < req_size ) { - // Query the block_size of the pool_t so that we can request the exact - // size present. - const siz_t block_size = bli_pool_block_size( pool ); - - // Sanity check: Make sure the requested size is no larger than the - // block_size field of the pool. - if ( block_size < req_size ) - { - printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", - ( int )block_size, ( int )req_size ); - bli_abort(); - } - - // Check out a block using the block_size queried above. - bli_pool_checkout_block( block_size, &pblk, pool ); - - // The block address is stored within the pblk_t. - block = bli_pblk_buf( &pblk ); + printf( "bli_sba_acquire(): ** pool block_size is %d but req_size is %d.\n", + ( int )block_size, ( int )req_size ); + bli_abort(); } + + // Check out a block using the block_size queried above. + bli_pool_checkout_block( block_size, &pblk, pool ); + + // The block address is stored within the pblk_t. + block = bli_pblk_buf( &pblk ); } #else @@ -122,44 +112,34 @@ void* bli_sba_acquire void bli_sba_release ( - rntm_t* rntm, + pool_t* pool, void* block ) { #ifdef BLIS_ENABLE_SBA_POOLS - if ( rntm == NULL ) + pblk_t pblk; + + if ( pool == NULL ) { - bli_free_intl( block ); + bli_free_intl( block ); } else { - pblk_t pblk; - - // Query the small block pool from the rntm. - pool_t* pool = bli_rntm_sba_pool( rntm ); - - if ( pool == NULL ) - { - bli_free_intl( block ); - } - else - { - // Query the block_size field from the pool. 
This is not super-important - // for this particular application of the pool_t (that is, the "leaf" - // component of the sba), but it seems like good housekeeping to maintain - // the block_size field of the pblk_t in case its ever needed/read. - const siz_t block_size = bli_pool_block_size( pool ); - - // Embed the block's memory address into a pblk_t, along with the - // block_size queried from the pool. - bli_pblk_set_buf( block, &pblk ); - bli_pblk_set_block_size( block_size, &pblk ); - - // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is - // a local variable since its contents are copied into the pool's internal - // data structure--an array of pblk_t.) - bli_pool_checkin_block( &pblk, pool ); - } + // Query the block_size field from the pool. This is not super-important + // for this particular application of the pool_t (that is, the "leaf" + // component of the sba), but it seems like good housekeeping to maintain + // the block_size field of the pblk_t in case its ever needed/read. + const siz_t block_size = bli_pool_block_size( pool ); + + // Embed the block's memory address into a pblk_t, along with the + // block_size queried from the pool. + bli_pblk_set_buf( block, &pblk ); + bli_pblk_set_block_size( block_size, &pblk ); + + // Check the pblk_t back into the pool_t. (It's okay that the pblk_t is + // a local variable since its contents are copied into the pool's internal + // data structure--an array of pblk_t.) + bli_pool_checkin_block( &pblk, pool ); } #else @@ -192,23 +172,4 @@ void bli_sba_checkin_array bli_apool_checkin_array( array, &sba ); } -void bli_sba_rntm_set_pool - ( - siz_t index, - array_t* array, - rntm_t* rntm - ) -{ - #ifndef BLIS_ENABLE_SBA_POOLS - bli_rntm_set_sba_pool( NULL, rntm ); - return; - #endif - - // Query the pool_t* in the array_t corresponding to index. - pool_t* pool = bli_apool_array_elem( index, array ); - - // Embed the pool_t* into the rntm_t. 
- bli_rntm_set_sba_pool( pool, rntm ); -} - diff --git a/frame/base/bli_sba.h b/frame/base/bli_sba.h index 4fc3aaaeea..edda199856 100644 --- a/frame/base/bli_sba.h +++ b/frame/base/bli_sba.h @@ -52,21 +52,15 @@ void bli_sba_checkin_array array_t* array ); -void bli_sba_rntm_set_pool - ( - siz_t index, - array_t* array, - rntm_t* rntm - ); - void* bli_sba_acquire ( - rntm_t* rntm, + pool_t* pool, siz_t req_size ); + void bli_sba_release ( - rntm_t* rntm, + pool_t* pool, void* block ); diff --git a/frame/include/bli_extern_defs.h b/frame/include/bli_extern_defs.h index 42ad9c72ba..71a6096e10 100644 --- a/frame/include/bli_extern_defs.h +++ b/frame/include/bli_extern_defs.h @@ -44,7 +44,5 @@ BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_ONE; BLIS_EXPORT_BLIS extern const obj_t BLIS_MINUS_TWO; BLIS_EXPORT_BLIS extern thrcomm_t BLIS_SINGLE_COMM; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_PACKM_SINGLE_THREADED; -BLIS_EXPORT_BLIS extern thrinfo_t BLIS_GEMM_SINGLE_THREADED; #endif diff --git a/frame/include/bli_oapi_ex.h b/frame/include/bli_oapi_ex.h index 7252fd7fff..b150b89fca 100644 --- a/frame/include/bli_oapi_ex.h +++ b/frame/include/bli_oapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_OAPI_EX_PARAMS -#define BLIS_OAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm +#define BLIS_OAPI_EX_PARAMS , const cntx_t* cntx, const rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. 
diff --git a/frame/include/bli_obj_macro_defs.h b/frame/include/bli_obj_macro_defs.h index 0db7fb5c46..c55e7e36a1 100644 --- a/frame/include/bli_obj_macro_defs.h +++ b/frame/include/bli_obj_macro_defs.h @@ -146,60 +146,6 @@ BLIS_INLINE num_t bli_obj_dt_proj_to_complex( const obj_t* obj ) ( bli_obj_dt( obj ) | BLIS_BITVAL_COMPLEX ); } -BLIS_INLINE num_t bli_obj_target_dt( const obj_t* obj ) -{ - return ( num_t ) - ( ( obj->info & BLIS_TARGET_DT_BITS ) >> BLIS_TARGET_DT_SHIFT ); -} - -BLIS_INLINE dom_t bli_obj_target_domain( const obj_t* obj ) -{ - return ( dom_t ) - ( ( obj->info & BLIS_TARGET_DOMAIN_BIT ) >> BLIS_TARGET_DT_SHIFT ); -} - -BLIS_INLINE prec_t bli_obj_target_prec( const obj_t* obj ) -{ - return ( prec_t ) - ( ( obj->info & BLIS_TARGET_PREC_BIT ) >> BLIS_TARGET_DT_SHIFT ); -} - -BLIS_INLINE num_t bli_obj_exec_dt( const obj_t* obj ) -{ - return ( num_t ) - ( ( obj->info & BLIS_EXEC_DT_BITS ) >> BLIS_EXEC_DT_SHIFT ); -} - -BLIS_INLINE dom_t bli_obj_exec_domain( const obj_t* obj ) -{ - return ( dom_t ) - ( ( obj->info & BLIS_EXEC_DOMAIN_BIT ) >> BLIS_EXEC_DT_SHIFT ); -} - -BLIS_INLINE prec_t bli_obj_exec_prec( const obj_t* obj ) -{ - return ( prec_t ) - ( ( obj->info & BLIS_EXEC_PREC_BIT ) >> BLIS_EXEC_DT_SHIFT ); -} - -BLIS_INLINE num_t bli_obj_comp_dt( const obj_t* obj ) -{ - return ( num_t ) - ( ( obj->info & BLIS_COMP_DT_BITS ) >> BLIS_COMP_DT_SHIFT ); -} - -BLIS_INLINE dom_t bli_obj_comp_domain( const obj_t* obj ) -{ - return ( dom_t ) - ( ( obj->info & BLIS_COMP_DOMAIN_BIT ) >> BLIS_COMP_DT_SHIFT ); -} - -BLIS_INLINE prec_t bli_obj_comp_prec( const obj_t* obj ) -{ - return ( prec_t ) - ( ( obj->info & BLIS_COMP_PREC_BIT ) >> BLIS_COMP_DT_SHIFT ); -} - // NOTE: This function queries info2. 
BLIS_INLINE num_t bli_obj_scalar_dt( const obj_t* obj ) { @@ -318,62 +264,6 @@ BLIS_INLINE bool bli_obj_has_unit_diag( const obj_t* obj ) ( bli_obj_diag( obj ) == BLIS_BITVAL_UNIT_DIAG ); } -BLIS_INLINE bool bli_obj_has_inverted_diag( const obj_t* obj ) -{ - return ( bool ) - ( ( obj->info & BLIS_INVERT_DIAG_BIT ) == BLIS_BITVAL_INVERT_DIAG ); -} - -BLIS_INLINE bool bli_obj_is_pack_rev_if_upper( const obj_t* obj ) -{ - return ( bool ) - ( ( obj->info & BLIS_PACK_REV_IF_UPPER_BIT ) == BLIS_BITVAL_PACK_REV_IF_UPPER ); -} - -BLIS_INLINE bool bli_obj_is_pack_rev_if_lower( const obj_t* obj ) -{ - return ( bool ) - ( ( obj->info & BLIS_PACK_REV_IF_LOWER_BIT ) == BLIS_BITVAL_PACK_REV_IF_LOWER ); -} - -BLIS_INLINE pack_t bli_obj_pack_schema( const obj_t* obj ) -{ - return ( pack_t ) - ( obj->info & BLIS_PACK_SCHEMA_BITS ); -} - -BLIS_INLINE bool bli_obj_is_packed( const obj_t* obj ) -{ - return ( bool ) - ( obj->info & BLIS_PACK_BIT ); -} - -BLIS_INLINE bool bli_obj_is_row_packed( const obj_t* obj ) -{ - return ( bool ) - ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ - BLIS_BITVAL_PACKED_ROWS ) ); -} - -BLIS_INLINE bool bli_obj_is_col_packed( const obj_t* obj ) -{ - return ( bool ) - ( ( obj->info & BLIS_PACK_RC_BIT ) == ( BLIS_BITVAL_PACKED_UNSPEC ^ - BLIS_BITVAL_PACKED_COLUMNS ) ); -} - -BLIS_INLINE bool bli_obj_is_panel_packed( const obj_t* obj ) -{ - return ( bool ) - ( obj->info & BLIS_PACK_PANEL_BIT ); -} - -BLIS_INLINE packbuf_t bli_obj_pack_buffer_type( const obj_t* obj ) -{ - return ( packbuf_t ) - ( obj->info & BLIS_PACK_BUFFER_BITS ); -} - BLIS_INLINE struc_t bli_obj_struc( const obj_t* obj ) { return ( struc_t ) @@ -448,81 +338,12 @@ BLIS_INLINE void bli_obj_set_diag( diag_t diag, obj_t* obj ) ( ( obj->info & ~BLIS_UNIT_DIAG_BIT ) | diag ); } -BLIS_INLINE void bli_obj_set_invert_diag( invdiag_t invdiag, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_INVERT_DIAG_BIT ) | invdiag ); -} - BLIS_INLINE void bli_obj_set_dt( 
num_t dt, obj_t* obj ) { obj->info = ( objbits_t ) ( ( obj->info & ~BLIS_DATATYPE_BITS ) | dt ); } -BLIS_INLINE void bli_obj_set_target_dt( num_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_TARGET_DT_BITS ) | - ( dt << BLIS_TARGET_DT_SHIFT ) ); -} - -BLIS_INLINE void bli_obj_set_target_domain( dom_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_TARGET_DOMAIN_BIT ) | - ( dt << BLIS_TARGET_DT_SHIFT ) ); -} - -BLIS_INLINE void bli_obj_set_target_prec( prec_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_TARGET_PREC_BIT ) | - ( dt << BLIS_TARGET_DT_SHIFT ) ); -} - -BLIS_INLINE void bli_obj_set_exec_dt( num_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_EXEC_DT_BITS ) | - ( dt << BLIS_EXEC_DT_SHIFT ) ); -} - -BLIS_INLINE void bli_obj_set_exec_domain( dom_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_EXEC_DOMAIN_BIT ) | - ( dt << BLIS_EXEC_DT_SHIFT ) ); -} - -BLIS_INLINE void bli_obj_set_exec_prec( prec_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_EXEC_PREC_BIT ) | - ( dt << BLIS_EXEC_DT_SHIFT ) ); -} - -BLIS_INLINE void bli_obj_set_comp_dt( num_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_COMP_DT_BITS ) | - ( dt << BLIS_COMP_DT_SHIFT ) ); -} - -BLIS_INLINE void bli_obj_set_comp_domain( dom_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_COMP_DOMAIN_BIT ) | - ( dt << BLIS_COMP_DT_SHIFT ) ); -} - -BLIS_INLINE void bli_obj_set_comp_prec( prec_t dt, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_COMP_PREC_BIT ) | - ( dt << BLIS_COMP_DT_SHIFT ) ); -} - // NOTE: This function queries and modifies info2. 
BLIS_INLINE void bli_obj_set_scalar_dt( num_t dt, obj_t* obj ) { @@ -547,34 +368,6 @@ BLIS_INLINE void bli_obj_set_scalar_prec( prec_t dt, obj_t* obj ) ( dt << BLIS_SCALAR_DT_SHIFT ) ); } -BLIS_INLINE void bli_obj_set_pack_schema( pack_t schema, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_PACK_SCHEMA_BITS ) | schema ); -} - -BLIS_INLINE void bli_obj_set_pack_order_if_upper( packord_t ordif, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_PACK_REV_IF_UPPER_BIT ) | ordif ); -} - -BLIS_INLINE void bli_obj_set_pack_order_if_lower( packord_t ordif, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_PACK_REV_IF_LOWER_BIT ) | ordif ); -} - -// NOTE: The packbuf_t bitfield in the obj_t is currently unused. Instead, -// packbuf_t is stored/used from the context in order to support various -// induced methods. (Though ideally the packbuf_t field would only be -// present in the control tree). -BLIS_INLINE void bli_obj_set_pack_buffer_type( packbuf_t buf_type, obj_t* obj ) -{ - obj->info = ( objbits_t ) - ( ( obj->info & ~BLIS_PACK_BUFFER_BITS ) | buf_type ); -} - BLIS_INLINE void bli_obj_set_struc( struc_t struc, obj_t* obj ) { obj->info = ( objbits_t ) @@ -1246,9 +1039,6 @@ BLIS_INLINE void bli_obj_init_finish( num_t dt, dim_t m, dim_t n, void* p, inc_t bli_obj_set_as_root( obj ); bli_obj_set_dt( dt, obj ); - bli_obj_set_target_dt( dt, obj ); - bli_obj_set_exec_dt( dt, obj ); - bli_obj_set_comp_dt( dt, obj ); bli_obj_set_dims( m, n, obj ); bli_obj_set_strides( rs, cs, obj ); @@ -1407,14 +1197,8 @@ BLIS_INLINE void bli_obj_real_part( const obj_t* c, obj_t* r ) if ( bli_obj_is_complex( c ) ) { // Change the datatypes. 
- const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); - const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); - const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); - const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); - bli_obj_set_dt( dt_stor_r, r ); - bli_obj_set_target_dt( dt_targ_r, r ); - bli_obj_set_exec_dt( dt_exec_r, r ); - bli_obj_set_comp_dt( dt_comp_r, r ); + const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); + bli_obj_set_dt( dt_stor_r, r ); // Don't touch the attached scalar datatype. @@ -1440,14 +1224,8 @@ BLIS_INLINE void bli_obj_imag_part( const obj_t* c, obj_t* i ) bli_obj_alias_to( c, i ); // Change the datatype. - const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); - const num_t dt_targ_r = bli_dt_proj_to_real( bli_obj_target_dt( c ) ); - const num_t dt_exec_r = bli_dt_proj_to_real( bli_obj_exec_dt( c ) ); - const num_t dt_comp_r = bli_dt_proj_to_real( bli_obj_comp_dt( c ) ); - bli_obj_set_dt( dt_stor_r, i ); - bli_obj_set_target_dt( dt_targ_r, i ); - bli_obj_set_exec_dt( dt_exec_r, i ); - bli_obj_set_comp_dt( dt_comp_r, i ); + const num_t dt_stor_r = bli_dt_proj_to_real( bli_obj_dt( c ) ); + bli_obj_set_dt( dt_stor_r, i ); // Don't touch the attached scalar datatype. @@ -1499,17 +1277,6 @@ BLIS_INLINE void bli_obj_swap( obj_t* a, obj_t* b ) if ( b_root_is_self ) bli_obj_set_as_root( a ); } -// Swap object pack schemas. - -BLIS_INLINE void bli_obj_swap_pack_schemas( obj_t* a, obj_t* b ) -{ - const pack_t schema_a = bli_obj_pack_schema( a ); - const pack_t schema_b = bli_obj_pack_schema( b ); - - bli_obj_set_pack_schema( schema_b, a ); - bli_obj_set_pack_schema( schema_a, b ); -} - // Induce a transposition on an object: swap dimensions, increments, and // offsets, then clear the trans bit. 
diff --git a/frame/include/bli_tapi_ex.h b/frame/include/bli_tapi_ex.h index f12be24b89..e7665e779a 100644 --- a/frame/include/bli_tapi_ex.h +++ b/frame/include/bli_tapi_ex.h @@ -48,7 +48,7 @@ // Define the macro to add expert arguments to function signatures // and prototypes. #undef BLIS_TAPI_EX_PARAMS -#define BLIS_TAPI_EX_PARAMS , const cntx_t* cntx, rntm_t* rntm +#define BLIS_TAPI_EX_PARAMS , const cntx_t* cntx, const rntm_t* rntm // Define the macro to omit the expert variable declaration block, since // it is not needed when expert parameters are passed in through the API. diff --git a/frame/include/bli_type_defs.h b/frame/include/bli_type_defs.h index e957fc6b23..7e66aa291c 100644 --- a/frame/include/bli_type_defs.h +++ b/frame/include/bli_type_defs.h @@ -596,12 +596,13 @@ typedef enum BLIS_MACH_RMIN, BLIS_MACH_EMAX, BLIS_MACH_RMAX, - BLIS_MACH_EPS2 -} machval_t; + BLIS_MACH_EPS2, + + BLIS_NUM_MACH_PARAMS, -#define BLIS_NUM_MACH_PARAMS 11 -#define BLIS_MACH_PARAM_FIRST BLIS_MACH_EPS -#define BLIS_MACH_PARAM_LAST BLIS_MACH_EPS2 + BLIS_MACH_PARAM_FIRST = BLIS_MACH_EPS, + BLIS_MACH_PARAM_LAST = BLIS_MACH_EPS2, +} machval_t; // -- Induced method types -- @@ -610,12 +611,13 @@ typedef enum { BLIS_1M = 0, BLIS_NAT, + + BLIS_NUM_IND_METHODS, + BLIS_IND_FIRST = 0, BLIS_IND_LAST = BLIS_NAT } ind_t; -#define BLIS_NUM_IND_METHODS (BLIS_NAT+1) - // These are used in bli_l3_*_oapi.c to construct the ind_t values from // the induced method substrings that go into function names. 
#define bli_1m BLIS_1M @@ -670,12 +672,12 @@ typedef enum BLIS_TRSM_L_UKR, BLIS_TRSM_U_UKR, - // l3 virtual kernels - BLIS_GEMM_VIR_UKR, - BLIS_GEMMTRSM_L_VIR_UKR, - BLIS_GEMMTRSM_U_VIR_UKR, - BLIS_TRSM_L_VIR_UKR, - BLIS_TRSM_U_VIR_UKR, + // l3 induced method kernels + BLIS_GEMM1M_UKR, + BLIS_GEMMTRSM1M_L_UKR, + BLIS_GEMMTRSM1M_U_UKR, + BLIS_TRSM1M_L_UKR, + BLIS_TRSM1M_U_UKR, // gemmsup kernels BLIS_GEMMSUP_RRR_UKR, @@ -723,10 +725,10 @@ typedef enum BLIS_REFERENCE_UKERNEL = 0, BLIS_VIRTUAL_UKERNEL, BLIS_OPTIMIZED_UKERNEL, - BLIS_NOTAPPLIC_UKERNEL -} kimpl_t; + BLIS_NOTAPPLIC_UKERNEL, -#define BLIS_NUM_UKR_IMPL_TYPES 4 + BLIS_NUM_UKR_IMPL_TYPES +} kimpl_t; #if 0 @@ -797,10 +799,9 @@ typedef enum BLIS_GGC, BLIS_GGG, #endif -} stor3_t; -#define BLIS_NUM_3OP_RC_COMBOS 9 -//#define BLIS_NUM_3OP_RCG_COMBOS 27 + BLIS_NUM_3OP_RC_COMBOS +} stor3_t; #if 0 @@ -1047,20 +1048,12 @@ typedef struct mem_s struct cntl_s { - // Basic fields (usually required). opid_t family; - bszid_t bszid; + bszid_t bszid; void_fp var_func; struct cntl_s* sub_prenode; struct cntl_s* sub_node; - - // Optional fields (needed only by some operations such as packm). - // NOTE: first field of params must be a uint64_t containing the size - // of the struct. - void* params; - - // Internal fields that track "cached" data. - mem_t pack_mem; + const void* params; }; typedef struct cntl_s cntl_t; @@ -1087,6 +1080,13 @@ typedef struct func_s } func_t; +typedef struct func2_s +{ + // Kernel function address. 
+ void_fp ptr[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; + +} func2_t; + // -- Multi-boolean object type -- @@ -1096,6 +1096,12 @@ typedef struct mbool_s } mbool_t; +typedef struct mbool2_s +{ + bool v[BLIS_NUM_FP_TYPES][BLIS_NUM_FP_TYPES]; + +} mbool2_t; + // -- Auxiliary kernel info type -- @@ -1165,9 +1171,8 @@ typedef void (*obj_pack_fn_t) const struct obj_s* a, struct obj_s* ap, const struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, - const struct thrinfo_s* thread + const struct cntl_s* cntl, + struct thrinfo_s* thread ); typedef void (*obj_ker_fn_t) @@ -1176,9 +1181,8 @@ typedef void (*obj_ker_fn_t) const struct obj_s* b, const struct obj_s* c, const struct cntx_s* cntx, - struct rntm_s* rntm, - struct cntl_s* cntl, - const struct thrinfo_s* thread + const struct cntl_s* cntl, + struct thrinfo_s* thread ); typedef struct obj_s @@ -1409,15 +1413,12 @@ BLIS_INLINE void bli_obj_init_subpart_from( const obj_t* a, obj_t* b ) typedef struct cntx_s { blksz_t blkszs[ BLIS_NUM_BLKSZS ]; - bszid_t bmults[ BLIS_NUM_BLKSZS ]; func_t ukrs[ BLIS_NUM_UKRS ]; mbool_t ukr_prefs[ BLIS_NUM_UKR_PREFS ]; void_fp l3_sup_handlers[ BLIS_NUM_LEVEL3_OPS ]; - ind_t method; - } cntx_t; @@ -1430,21 +1431,13 @@ typedef struct rntm_s { // "External" fields: these may be queried by the end-user. bool auto_factor; - dim_t num_threads; dim_t thrloop[ BLIS_NUM_LOOPS ]; + bool pack_a; // enable/disable packing of left-hand matrix A. bool pack_b; // enable/disable packing of right-hand matrix B. bool l3_sup; // enable/disable small matrix handling in level-3 ops. - - // "Internal" fields: these should not be exposed to the end-user. - - // The small block pool, which is attached in the l3 thread decorator. - pool_t* sba_pool; - - // The packing block allocator, which is attached in the l3 thread decorator. 
- pba_t* pba; - + bool enable_ind[ BLIS_NUM_IND_METHODS ]; } rntm_t; diff --git a/frame/thread/bli_l3_decor.h b/frame/thread/bli_l3_decor.h index e2208aae63..0f92ceb0cb 100644 --- a/frame/thread/bli_l3_decor.h +++ b/frame/thread/bli_l3_decor.h @@ -47,8 +47,7 @@ typedef void (*l3int_t) const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl, + const cntl_t* cntl, thrinfo_t* thread ); @@ -63,8 +62,8 @@ void bli_l3_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_decor_openmp.c b/frame/thread/bli_l3_decor_openmp.c index 2c71c75321..8850b748e8 100644 --- a/frame/thread/bli_l3_decor_openmp.c +++ b/frame/thread/bli_l3_decor_openmp.c @@ -54,8 +54,8 @@ void bli_l3_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { // Query the total number of threads from the rntm_t object. @@ -74,19 +74,8 @@ void bli_l3_thread_decorator // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - // Allocate a global communicator for the root thrinfo_t structures. 
- thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) { @@ -102,21 +91,12 @@ void bli_l3_thread_decorator // Check for a somewhat obscure OpenMP thread-mistmatch issue. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - obj_t a_t, b_t, c_t; - cntl_t* cntl_use; - thrinfo_t* thread; - // Alias thread-local copies of A, B, and C. These will be the objects // we pass down the algorithmic function stack. Making thread-local // aliases is highly recommended in case a thread needs to change any // of the properties of an object without affecting other threads' // objects. + obj_t a_t, b_t, c_t; bli_obj_alias_to( a, &a_t ); bli_obj_alias_to( b, &b_t ); bli_obj_alias_to( c, &c_t ); @@ -132,12 +112,8 @@ void bli_l3_thread_decorator bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); + thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm_p, cntl ); #if 1 func @@ -148,8 +124,7 @@ void bli_l3_thread_decorator beta, &c_t, cntx, - rntm_p, - cntl_use, + cntl, thread ); #else @@ -161,27 +136,24 @@ void bli_l3_thread_decorator ); #endif - // Free the thread's local control tree. 
- bli_l3_cntl_free( rntm_p, cntl_use, thread ); - #ifdef PRINT_THRINFO threads[tid] = thread; #else // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); #endif } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - #ifdef PRINT_THRINFO if ( family != BLIS_TRSM ) bli_l3_thrinfo_print_gemm_paths( threads ); else bli_l3_thrinfo_print_trsm_paths( threads ); exit(1); #endif + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( NULL, gl_comm ); + // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. diff --git a/frame/thread/bli_l3_decor_pthreads.c b/frame/thread/bli_l3_decor_pthreads.c index 80247dfb1c..667c527a06 100644 --- a/frame/thread/bli_l3_decor_pthreads.c +++ b/frame/thread/bli_l3_decor_pthreads.c @@ -33,6 +33,7 @@ */ +#include "bli_apool.h" #include "blis.h" #ifdef BLIS_ENABLE_PTHREADS @@ -48,8 +49,8 @@ typedef struct thread_data const obj_t* beta; const obj_t* c; const cntx_t* cntx; - rntm_t* rntm; - cntl_t* cntl; + const rntm_t* rntm; + const cntl_t* cntl; dim_t tid; thrcomm_t* gl_comm; array_t* array; @@ -68,33 +69,18 @@ void* bli_l3_thread_entry( void* data_void ) const obj_t* beta = data->beta; const obj_t* c = data->c; const cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; - cntl_t* cntl = data->cntl; + const rntm_t* rntm = data->rntm; + const cntl_t* cntl = data->cntl; const dim_t tid = data->tid; array_t* array = data->array; thrcomm_t* gl_comm = data->gl_comm; - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. 
- rntm_t rntm_l = *rntm; - rntm_t* rntm_p = &rntm_l; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - obj_t a_t, b_t, c_t; - cntl_t* cntl_use; - thrinfo_t* thread; - // Alias thread-local copies of A, B, and C. These will be the objects // we pass down the algorithmic function stack. Making thread-local // aliases is highly recommended in case a thread needs to change any // of the properties of an object without affecting other threads' // objects. + obj_t a_t, b_t, c_t; bli_obj_alias_to( a, &a_t ); bli_obj_alias_to( b, &b_t ); bli_obj_alias_to( c, &c_t ); @@ -110,12 +96,10 @@ void* bli_l3_thread_entry( void* data_void ) bli_obj_set_pack_schema( BLIS_NOT_PACKED, &a_t ); bli_obj_set_pack_schema( BLIS_NOT_PACKED, &b_t ); - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, &c_t, rntm_p, cntl, &cntl_use ); - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); + // The root node is the *parent* of the node corresponding to the first + // control tree node. + thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm, cntl ); func ( @@ -125,16 +109,12 @@ void* bli_l3_thread_entry( void* data_void ) beta, &c_t, cntx, - rntm_p, - cntl_use, - thread + cntl, + bli_thrinfo_sub_node( thread ) ); - // Free the thread's local control tree. - bli_l3_cntl_free( rntm_p, cntl_use, thread ); - // Free the current thread's thrinfo_t structure. 
- bli_l3_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); return NULL; } @@ -149,8 +129,8 @@ void bli_l3_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { err_t r_val; @@ -164,20 +144,10 @@ void bli_l3_thread_decorator // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); + array_t* array = bli_sba_checkout_array( n_threads ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. @@ -218,16 +188,16 @@ void bli_l3_thread_decorator bli_l3_thread_entry( ( void* )(&datas[0]) ); } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - // Thread 0 waits for additional threads to finish. for ( dim_t tid = 1; tid < n_threads; tid++ ) { bli_pthread_join( pthreads[tid], NULL ); } + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. 
+ bli_thrcomm_free( NULL, gl_comm ); + // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. diff --git a/frame/thread/bli_l3_decor_single.c b/frame/thread/bli_l3_decor_single.c index c2c43b3703..0f908af5ab 100644 --- a/frame/thread/bli_l3_decor_single.c +++ b/frame/thread/bli_l3_decor_single.c @@ -47,8 +47,8 @@ void bli_l3_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm, - cntl_t* cntl + const rntm_t* rntm, + const cntl_t* cntl ) { obj_t a_t, b_t; @@ -77,25 +77,10 @@ void bli_l3_thread_decorator // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we can create the global comm below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. - bli_pba_rntm_set_pba( rntm ); - - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - + // Use the single-threaded communicator + thrcomm_t* gl_comm = &BLIS_SINGLE_COMM; { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* rntm_p = rntm; - - cntl_t* cntl_use; - thrinfo_t* thread; - const dim_t tid = 0; // Use the thread id to access the appropriate pool_t* within the @@ -113,12 +98,8 @@ void bli_l3_thread_decorator // consistently providing local aliases, we can then eliminate aliasing // elsewhere. - // Create a default control tree for the operation, if needed. - bli_l3_cntl_create_if( family, schema_a, schema_b, - &a_t, &b_t, c, rntm_p, cntl, &cntl_use ); - // Create the root node of the thread's thrinfo_t structure. 
- bli_l3_thrinfo_create_root( tid, gl_comm, rntm_p, cntl_use, &thread ); + thrinfo_t* thread = bli_l3_thrinfo_create( tid, gl_comm, array, rntm, cntl ); func ( @@ -128,22 +109,14 @@ void bli_l3_thread_decorator beta, c, cntx, - rntm_p, - cntl_use, + cntl, thread ); - // Free the thread's local control tree. - bli_l3_cntl_free( rntm_p, cntl_use, thread ); - // Free the current thread's thrinfo_t structure. - bli_l3_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. diff --git a/frame/thread/bli_l3_sup_decor.h b/frame/thread/bli_l3_sup_decor.h index 6e04011513..8083dd1009 100644 --- a/frame/thread/bli_l3_sup_decor.h +++ b/frame/thread/bli_l3_sup_decor.h @@ -62,7 +62,7 @@ err_t bli_l3_sup_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); // Include definitions specific to the method of multithreading for the diff --git a/frame/thread/bli_l3_sup_decor_openmp.c b/frame/thread/bli_l3_sup_decor_openmp.c index ff6bc667d3..a7eb9eee91 100644 --- a/frame/thread/bli_l3_sup_decor_openmp.c +++ b/frame/thread/bli_l3_sup_decor_openmp.c @@ -54,7 +54,7 @@ err_t bli_l3_sup_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // Query the total number of threads from the rntm_t object. @@ -68,19 +68,8 @@ err_t bli_l3_sup_thread_decorator // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. 
We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) { @@ -98,16 +87,8 @@ err_t bli_l3_sup_thread_decorator // code path. bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, array, rntm_p ); func ( @@ -122,12 +103,12 @@ err_t bli_l3_sup_thread_decorator ); // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( NULL, gl_comm ); // Check the array_t back into the small block allocator. 
Similar to the // check-out, this is done using a lock embedded within the sba to ensure diff --git a/frame/thread/bli_l3_sup_decor_pthreads.c b/frame/thread/bli_l3_sup_decor_pthreads.c index 375a85730e..c287459b3f 100644 --- a/frame/thread/bli_l3_sup_decor_pthreads.c +++ b/frame/thread/bli_l3_sup_decor_pthreads.c @@ -48,7 +48,7 @@ typedef struct thread_data const obj_t* beta; const obj_t* c; const cntx_t* cntx; - rntm_t* rntm; + const rntm_t* rntm; dim_t tid; thrcomm_t* gl_comm; array_t* array; @@ -67,29 +67,18 @@ void* bli_l3_sup_thread_entry( void* data_void ) const obj_t* beta = data->beta; const obj_t* c = data->c; const cntx_t* cntx = data->cntx; - rntm_t* rntm = data->rntm; + const rntm_t* rntm = data->rntm; dim_t tid = data->tid; array_t* array = data->array; thrcomm_t* gl_comm = data->gl_comm; ( void )family; - // Create a thread-local copy of the master thread's rntm_t. This is - // necessary since we want each thread to be able to track its own - // small block pool_t as it executes down the function stack. rntm_t rntm_l = *rntm; rntm_t* rntm_p = &rntm_l; - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, array, rntm_p ); func ( @@ -104,7 +93,7 @@ void* bli_l3_sup_thread_entry( void* data_void ) ); // Free the current thread's thrinfo_t structure. 
- bli_l3_sup_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); return NULL; } @@ -119,7 +108,7 @@ err_t bli_l3_sup_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { err_t r_val; @@ -135,18 +124,8 @@ err_t bli_l3_sup_thread_decorator // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. @@ -186,16 +165,16 @@ err_t bli_l3_sup_thread_decorator bli_l3_sup_thread_entry( ( void* )(&datas[0]) ); } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called from the thread entry function). - // Thread 0 waits for additional threads to finish. for ( dim_t tid = 1; tid < n_threads; tid++ ) { bli_pthread_join( pthreads[tid], NULL ); } + // Free the global communicator, because the root thrinfo_t node + // never frees its communicator. + bli_thrcomm_free( NULL, gl_comm ); + // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. 
diff --git a/frame/thread/bli_l3_sup_decor_single.c b/frame/thread/bli_l3_sup_decor_single.c index 42dbd14563..a25b1597ff 100644 --- a/frame/thread/bli_l3_sup_decor_single.c +++ b/frame/thread/bli_l3_sup_decor_single.c @@ -37,7 +37,23 @@ #ifndef BLIS_ENABLE_MULTITHREADING -#define SKIP_THRINFO_TREE +void bli_l3_sup_thrinfo_init_single + ( + pool_t* sba_pool, + pba_t* pba, + thrinfo_t* thread + ) +{ + bli_thrinfo_set_comm( &BLIS_SINGLE_COMM, thread ); + bli_thrinfo_set_thread_id( 0, thread ); + bli_thrinfo_set_n_way( 1, thread ); + bli_thrinfo_set_work_id( 0, thread ); + bli_thrinfo_set_free_comm( FALSE, thread ); + bli_thrinfo_set_sba_pool( sba_pool, thread ); + bli_thrinfo_set_pba( pba, thread ); + bli_thrinfo_set_sub_prenode( thread, thread ); + bli_thrinfo_set_sub_node( thread, thread ); +} err_t bli_l3_sup_thread_decorator ( @@ -49,7 +65,7 @@ err_t bli_l3_sup_thread_decorator const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { // For sequential execution, we use only one thread. @@ -63,50 +79,13 @@ err_t bli_l3_sup_thread_decorator // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. - bli_pba_rntm_set_pba( rntm ); - -#ifndef SKIP_THRINFO_TREE - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); -#endif - - { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* rntm_p = rntm; - - // There is only one thread id (for the thief thread). - const dim_t tid = 0; - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. 
- // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. - //bli_sba_rntm_set_pool( tid, array, rntm_p ); - -#ifndef SKIP_THRINFO_TREE - thrinfo_t* thread = NULL; - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); -#else - // This optimization allows us to use one of the global thrinfo_t - // objects for single-threaded execution rather than grow one from - // scratch. The key is that bli_thrinfo_sup_grow(), which is called - // from within the variants, will immediately return if it detects - // that the thrinfo_t* passed into it is either - // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. - thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; - - ( void )tid; -#endif + // Create a special thrinfo_t structure which indicates + // single-threaded execution for all nodes. + thrinfo_t thread; + pool_t* sba_pool = bli_apool_array_elem( 0, array ); + pba_t* pba = bli_pba_query(); + bli_l3_sup_thrinfo_init_single( sba_pool, pba, &thread ); func ( @@ -116,27 +95,17 @@ err_t bli_l3_sup_thread_decorator beta, c, cntx, - rntm_p, - thread + rntm, + &thread ); - -#ifndef SKIP_THRINFO_TREE - // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); -#endif } - // We shouldn't free the global communicator since it was already freed - // by the global communicator's chief thread in bli_l3_thrinfo_free() - // (called above). - // Check the array_t back into the small block allocator. Similar to the // check-out, this is done using a lock embedded within the sba to ensure // mutual exclusion. 
bli_sba_checkin_array( array ); return BLIS_SUCCESS; - } #endif diff --git a/frame/thread/bli_l3_sup_decor_single.h b/frame/thread/bli_l3_sup_decor_single.h index 418c3814c3..29532bc29d 100644 --- a/frame/thread/bli_l3_sup_decor_single.h +++ b/frame/thread/bli_l3_sup_decor_single.h @@ -40,5 +40,12 @@ #endif +void bli_l3_sup_thrinfo_init_single + ( + pool_t* sba_pool, + pba_t* pba, + thrinfo_t* thread + ); + #endif diff --git a/frame/thread/bli_thrcomm.h b/frame/thread/bli_thrcomm.h index d0ffb13461..33d9e5d0ab 100644 --- a/frame/thread/bli_thrcomm.h +++ b/frame/thread/bli_thrcomm.h @@ -52,8 +52,8 @@ BLIS_INLINE dim_t bli_thrcomm_num_threads( thrcomm_t* comm ) // Thread communicator prototypes. -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ); -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ); +thrcomm_t* bli_thrcomm_create( pool_t* sba_pool, dim_t n_threads ); +void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ); void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ); void bli_thrcomm_cleanup( thrcomm_t* comm ); diff --git a/frame/thread/bli_thrcomm_openmp.c b/frame/thread/bli_thrcomm_openmp.c index 9bb35ea31a..39ce8f59c5 100644 --- a/frame/thread/bli_thrcomm_openmp.c +++ b/frame/thread/bli_thrcomm_openmp.c @@ -37,20 +37,20 @@ #ifdef BLIS_ENABLE_OPENMP -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) +thrcomm_t* bli_thrcomm_create( pool_t* sba_pool, dim_t n_threads ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif - thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); + thrcomm_t* comm = bli_sba_acquire( sba_pool, sizeof(thrcomm_t) ); bli_thrcomm_init( n_threads, comm ); return comm; } -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) +void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ) { if ( comm == NULL ) return; @@ -60,7 +60,7 @@ void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) printf( "bli_thrcomm_free(): " ); #endif - bli_sba_release( rntm, comm ); + 
bli_sba_release( sba_pool, comm ); } #ifndef BLIS_TREE_BARRIER @@ -156,10 +156,10 @@ barrier_t* bli_thrcomm_tree_barrier_create( int num_threads, int arity, barrier_ kid->dad = me; leaf_index += threads_this_kid; - } + } me->count = arity; me->arity = arity; - } + } return me; } diff --git a/frame/thread/bli_thrcomm_pthreads.c b/frame/thread/bli_thrcomm_pthreads.c index d0896f94df..f1d9c210e8 100644 --- a/frame/thread/bli_thrcomm_pthreads.c +++ b/frame/thread/bli_thrcomm_pthreads.c @@ -37,20 +37,20 @@ #ifdef BLIS_ENABLE_PTHREADS -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) +thrcomm_t* bli_thrcomm_create( pool_t* sba_pool, dim_t n_threads ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif - thrcomm_t* comm = bli_sba_acquire( rntm, sizeof(thrcomm_t) ); + thrcomm_t* comm = bli_sba_acquire( sba_pool, sizeof(thrcomm_t) ); bli_thrcomm_init( n_threads, comm ); return comm; } -void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) +void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ) { if ( comm == NULL ) return; @@ -60,7 +60,7 @@ void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) printf( "bli_thrcomm_free(): " ); #endif - bli_sba_release( rntm, comm ); + bli_sba_release( sba_pool, comm ); } #ifdef BLIS_USE_PTHREAD_BARRIER diff --git a/frame/thread/bli_thrcomm_single.c b/frame/thread/bli_thrcomm_single.c index cedb3c5b6e..26803b375a 100644 --- a/frame/thread/bli_thrcomm_single.c +++ b/frame/thread/bli_thrcomm_single.c @@ -38,20 +38,20 @@ #ifndef BLIS_ENABLE_MULTITHREADING //Constructors and destructors for constructors -thrcomm_t* bli_thrcomm_create( rntm_t* rntm, dim_t n_threads ) +thrcomm_t* bli_thrcomm_create( pool_t* sba_pool, dim_t n_threads ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrcomm_create(): " ); #endif - thrcomm_t* comm = bli_sba_acquire( rntm, sizeof( thrcomm_t ) ); + thrcomm_t* comm = bli_sba_acquire( sba_pool, sizeof( thrcomm_t ) ); bli_thrcomm_init( n_threads, comm ); return comm; } -void 
bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) +void bli_thrcomm_free( pool_t* sba_pool, thrcomm_t* comm ) { if ( comm == NULL ) return; @@ -61,7 +61,7 @@ void bli_thrcomm_free( rntm_t* rntm, thrcomm_t* comm ) printf( "bli_thrcomm_free(): " ); #endif - bli_sba_release( rntm, comm ); + bli_sba_release( sba_pool, comm ); } void bli_thrcomm_init( dim_t n_threads, thrcomm_t* comm ) diff --git a/frame/thread/bli_thread.c b/frame/thread/bli_thread.c index b7fccace76..92acc88a3d 100644 --- a/frame/thread/bli_thread.c +++ b/frame/thread/bli_thread.c @@ -35,9 +35,7 @@ #include "blis.h" -thrinfo_t BLIS_PACKM_SINGLE_THREADED = {}; -thrinfo_t BLIS_GEMM_SINGLE_THREADED = {}; -thrcomm_t BLIS_SINGLE_COMM = {}; +thrcomm_t BLIS_SINGLE_COMM = {}; // The global rntm_t structure. (The definition resides in bli_rntm.c.) extern rntm_t global_rntm; @@ -51,8 +49,6 @@ extern bli_pthread_mutex_t global_rntm_mutex; void bli_thread_init( void ) { bli_thrcomm_init( 1, &BLIS_SINGLE_COMM ); - bli_packm_thrinfo_init_single( &BLIS_PACKM_SINGLE_THREADED ); - bli_l3_thrinfo_init_single( &BLIS_GEMM_SINGLE_THREADED ); // Read the environment variables and use them to initialize the // global runtime object. @@ -647,25 +643,12 @@ siz_t bli_thread_range_mdim const obj_t* b, const obj_t* c, const cntl_t* cntl, - const cntx_t* cntx, dim_t* start, dim_t* end ) { - bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntl_family( cntl ); - - // This is part of trsm's current implementation, whereby right side - // cases are implemented in left-side micro-kernels, which requires - // we swap the usage of the register blocksizes for the purposes of - // packing A and B. 
- if ( family == BLIS_TRSM ) - { - if ( bli_obj_root_is_triangular( a ) ) bszid = BLIS_MR; - else bszid = BLIS_NR; - } - - const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + opid_t family = bli_cntl_family( cntl ); + const blksz_t* bmult = bli_cntl_part_params_bmult( cntl ); const obj_t* x; bool use_weighted; @@ -706,27 +689,14 @@ siz_t bli_thread_range_ndim const obj_t* b, const obj_t* c, const cntl_t* cntl, - const cntx_t* cntx, dim_t* start, dim_t* end ) { - bszid_t bszid = bli_cntl_bszid( cntl ); - opid_t family = bli_cntl_family( cntl ); - - // This is part of trsm's current implementation, whereby right side - // cases are implemented in left-side micro-kernels, which requires - // we swap the usage of the register blocksizes for the purposes of - // packing A and B. - if ( family == BLIS_TRSM ) - { - if ( bli_obj_root_is_triangular( b ) ) bszid = BLIS_MR; - else bszid = BLIS_NR; - } - - const blksz_t* bmult = bli_cntx_get_bmult( bszid, cntx ); + opid_t family = bli_cntl_family( cntl ); + const blksz_t* bmult = bli_cntl_part_params_bmult( cntl ); const obj_t* x; - bool use_weighted; + bool use_weighted; // Use the operation family to choose the one of the two matrices // being partitioned that potentially has structure, and also to diff --git a/frame/thread/bli_thread.h b/frame/thread/bli_thread.h index 5e9c650b5b..5f85bda229 100644 --- a/frame/thread/bli_thread.h +++ b/frame/thread/bli_thread.h @@ -42,11 +42,8 @@ // Include thread info (thrinfo_t) object definitions and prototypes. #include "bli_thrinfo.h" -#include "bli_thrinfo_sup.h" // Include some operation-specific thrinfo_t prototypes. -// Note that the bli_packm_thrinfo.h must be included before the others! 
-#include "bli_packm_thrinfo.h" #include "bli_l3_thrinfo.h" // Include the level-3 thread decorator and related definitions and prototypes @@ -85,7 +82,6 @@ siz_t PASTEMAC0( opname ) \ const obj_t* b, \ const obj_t* c, \ const cntl_t* cntl, \ - const cntx_t* cntx, \ dim_t* start, \ dim_t* end \ ); diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index ca7b6b5949..858d8f2b98 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -33,82 +33,67 @@ */ +#include "bli_mem.h" +#include "bli_sba.h" #include "blis.h" +#include + +#define BLIS_NUM_STATIC_COMMS 80 + +thrinfo_t* bli_thrinfo_create_root + ( + thrcomm_t* comm, + dim_t thread_id, + pool_t* sba_pool, + pba_t* pba + ) +{ + return bli_thrinfo_create + ( + comm, + thread_id, + 1, + 0, + FALSE, + sba_pool, + pba + ); +} thrinfo_t* bli_thrinfo_create ( - rntm_t* rntm, - thrcomm_t* ocomm, - dim_t ocomm_id, + thrcomm_t* comm, + dim_t thread_id, dim_t n_way, dim_t work_id, bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node + pool_t* sba_pool, + pba_t* pba ) { #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_thrinfo_create(): " ); #endif - thrinfo_t* thread = bli_sba_acquire( rntm, sizeof( thrinfo_t ) ); - - bli_thrinfo_init - ( - thread, - ocomm, ocomm_id, - n_way, work_id, - free_comm, - bszid, - sub_node - ); - - return thread; -} + thrinfo_t* thread = bli_sba_acquire( sba_pool, sizeof( thrinfo_t ) ); -void bli_thrinfo_init - ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node - ) -{ - bli_thrinfo_set_ocomm( ocomm, thread ); - bli_thrinfo_set_ocomm_id( ocomm_id, thread ); + bli_thrinfo_set_comm( comm, thread ); + bli_thrinfo_set_thread_id( thread_id, thread ); bli_thrinfo_set_n_way( n_way, thread ); bli_thrinfo_set_work_id( work_id, thread ); bli_thrinfo_set_free_comm( free_comm, thread ); - bli_thrinfo_set_bszid( bszid, thread ); + bli_thrinfo_set_sba_pool( sba_pool, thread ); + 
bli_thrinfo_set_pba( pba, thread ); + bli_mem_clear( bli_thread_mem( thread ) ); - bli_thrinfo_set_sub_node( sub_node, thread ); + bli_thrinfo_set_sub_node( NULL, thread ); bli_thrinfo_set_sub_prenode( NULL, thread ); -} -void bli_thrinfo_init_single - ( - thrinfo_t* thread - ) -{ - bli_thrinfo_init - ( - thread, - &BLIS_SINGLE_COMM, 0, - 1, - 0, - FALSE, - BLIS_NO_PART, - thread - ); + return thread; } void bli_thrinfo_free ( - rntm_t* rntm, thrinfo_t* thread ) { @@ -119,17 +104,18 @@ void bli_thrinfo_free thrinfo_t* thrinfo_sub_prenode = bli_thrinfo_sub_prenode( thread ); thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); + pool_t* sba_pool = bli_thread_sba_pool( thread ); // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_prenode != NULL ) { - bli_thrinfo_free( rntm, thrinfo_sub_prenode ); + bli_thrinfo_free( thrinfo_sub_prenode ); } // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_node != NULL ) { - bli_thrinfo_free( rntm, thrinfo_sub_node ); + bli_thrinfo_free( thrinfo_sub_node ); } // Free the communicators, but only if the current thrinfo_t struct @@ -139,8 +125,8 @@ void bli_thrinfo_free if ( bli_thrinfo_needs_free_comm( thread ) ) { // The ochief always frees his communicator. - if ( bli_thread_am_ochief( thread ) ) - bli_thrcomm_free( rntm, bli_thrinfo_ocomm( thread ) ); + if ( bli_thread_am_chief( thread ) ) + bli_thrcomm_free( sba_pool, bli_thrinfo_comm( thread ) ); } #ifdef BLIS_ENABLE_MEM_TRACING @@ -148,212 +134,77 @@ void bli_thrinfo_free #endif // Free the thrinfo_t struct. 
- bli_sba_release( rntm, thread ); + bli_sba_release( sba_pool, thread ); } // ----------------------------------------------------------------------------- -void bli_thrinfo_grow +thrinfo_t* bli_thrinfo_split ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - // First, consider the prenode branch of the thrinfo_t tree, which should be - // expanded only if there exists a prenode branch in the cntl_t tree. - - if ( bli_cntl_sub_prenode( cntl ) != NULL ) - { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_prenode( thread ) == NULL ) - { - // Assertion / sanity check. - if ( bli_cntl_bszid( cntl ) != BLIS_MC ) - { - printf( "Assertion failed: Expanding prenode for non-IC loop?\n" ); - bli_abort(); - } - - // Now we must create the packa, jr, and ir nodes that make up - // the prenode branch of current cntl_t node. - - // Create a new node (or, if needed, multiple nodes) along the - // prenode branch of the tree and return the pointer to the - // (highest) child. - thrinfo_t* thread_prenode = bli_thrinfo_rgrow_prenode - ( - rntm, - cntl, - bli_cntl_sub_prenode( cntl ), - thread - ); - - // Attach the child thrinfo_t node for the secondary branch to its - // parent structure. - bli_thrinfo_set_sub_prenode( thread_prenode, thread ); - } - } - - // Now, grow the primary branch of the thrinfo_t tree. - - // NOTE: If bli_thrinfo_rgrow() is being called, the sub_node field will - // always be non-NULL, and so there's no need to check it. - //if ( bli_cntl_sub_node( cntl ) != NULL ) - { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_node( thread ) == NULL ) - { - // Create a new node (or, if needed, multiple nodes) along the - // main sub-node branch of the tree and return the pointer to the - // (highest) child. 
- thrinfo_t* thread_child = bli_thrinfo_rgrow - ( - rntm, - cntl, - bli_cntl_sub_node( cntl ), - thread - ); - - // Attach the child thrinfo_t node for the primary branch to its - // parent structure. - bli_thrinfo_set_sub_node( thread_child, thread ); - } - } -} - -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_rgrow - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. - if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) - { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_create_for_cntl - ( - rntm, - cntl_par, - cntl_cur, - thread_par - ); - } - else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) - { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. - thrinfo_t* thread_seg = bli_thrinfo_rgrow - ( - rntm, - cntl_par, - bli_cntl_sub_node( cntl_cur ), - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. 
- thread_cur = bli_thrinfo_create - ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_cntl_calc_num_threads_in( rntm, cntl_cur ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node - ); - } - - return thread_cur; -} - -#define BLIS_NUM_STATIC_COMMS 80 - -thrinfo_t* bli_thrinfo_create_for_cntl - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, + dim_t n_way, thrinfo_t* thread_par ) { - // If we are running with a single thread, all of the code can be reduced - // and simplified to this. - if ( bli_rntm_calc_num_threads( rntm ) == 1 ) - { - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - &BLIS_SINGLE_COMM, // ocomm - 0, // ocomm_id - 1, // n_way - 0, // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - NULL // sub_node - ); - return thread_chl; - } - - thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; - thrcomm_t** new_comms = NULL; - - const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); - - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - const dim_t parent_work_id = bli_thread_work_id( thread_par ); + const dim_t parent_num_threads = bli_thread_num_threads( thread_par ); + const dim_t parent_thread_id = bli_thread_thread_id( thread_par ); + pool_t* sba_pool = bli_thread_sba_pool( thread_par ); + pba_t* pba = bli_thread_pba( thread_par ); + + if ( n_way == 1 ) + { + return bli_thrinfo_create + ( + bli_thrinfo_comm( thread_par ), + parent_thread_id, + 1, + 0, + false, + sba_pool, + pba + ); + } + else if ( n_way == parent_num_threads ) + { + return bli_thrinfo_create + ( + &BLIS_SINGLE_COMM, + 0, + n_way, + parent_thread_id, + false, + sba_pool, + pba + ); + } // Sanity check: make sure the number of threads in the parent's // communicator is 
divisible by the number of new sub-groups. - if ( parent_nt_in % parent_n_way != 0 ) + if ( parent_num_threads % n_way != 0 ) { - printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); + printf( "Assertion failed: parent_num_threads %% n_way != 0\n" ); bli_abort(); } + thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; + thrcomm_t** new_comms = NULL; + // Compute: // - the number of threads inside the new child comm, // - the current thread's id within the new communicator, // - the current thread's work id, given the ways of parallelism // to be obtained within the next loop. - const dim_t child_nt_in = bli_cntl_calc_num_threads_in( rntm, cntl_chl ); - const dim_t child_n_way = bli_rntm_ways_for( bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - -//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); + const dim_t child_num_threads = parent_num_threads / n_way; + const dim_t child_thread_id = parent_thread_id % child_num_threads; + const dim_t child_work_id = parent_thread_id / child_num_threads; // The parent's chief thread creates a temporary array of thrcomm_t // pointers. - if ( bli_thread_am_ochief( thread_par ) ) + if ( bli_thread_am_chief( thread_par ) ) { err_t r_val; - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); + if ( n_way > BLIS_NUM_STATIC_COMMS ) + new_comms = bli_malloc_intl( n_way * sizeof( thrcomm_t* ), &r_val ); else new_comms = static_comms; } @@ -365,8 +216,8 @@ thrinfo_t* bli_thrinfo_create_for_cntl // Chiefs in the child communicator allocate the communicator // object and store it in the array element corresponding to the // parent's work id. 
- if ( child_comm_id == 0 ) - new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); + if ( child_thread_id == 0 ) + new_comms[ child_work_id ] = bli_thrcomm_create( sba_pool, child_num_threads ); bli_thread_barrier( thread_par ); @@ -374,277 +225,25 @@ thrinfo_t* bli_thrinfo_create_for_cntl // that was created by their chief, as identified by parent_work_id. thrinfo_t* thread_chl = bli_thrinfo_create ( - rntm, // rntm - new_comms[ parent_work_id ], // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - bszid_chl, // bszid - NULL // sub_node + new_comms[ child_work_id ], + child_thread_id, + n_way, + child_work_id, + true, + sba_pool, + pba ); bli_thread_barrier( thread_par ); // The parent's chief thread frees the temporary array of thrcomm_t // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - bli_free_intl( new_comms ); - } - - return thread_chl; -} - -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_rgrow_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. - if ( bli_cntl_bszid( cntl_cur ) != BLIS_NO_PART ) - { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_create_for_cntl_prenode - ( - rntm, - cntl_par, - cntl_cur, - thread_par - ); - } - else // if ( bli_cntl_bszid( cntl_cur ) == BLIS_NO_PART ) - { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. 
- thrinfo_t* thread_seg = bli_thrinfo_rgrow_prenode - ( - rntm, - cntl_par, - bli_cntl_sub_node( cntl_cur ), - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. - thread_cur = bli_thrinfo_create - ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_cntl_calc_num_threads_in( rntm, cntl_par ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node - ); - } - - return thread_cur; -} - -thrinfo_t* bli_thrinfo_create_for_cntl_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ) -{ - // NOTE: This function only has to work for the ic -> (pa -> jr) - // thrinfo_t tree branch extension. After that, the function - // bli_thrinfo_create_for_cntl() will be called for the last jr->ir - // branch extension. - - const bszid_t bszid_chl = bli_cntl_bszid( cntl_chl ); - - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - //const dim_t parent_work_id = bli_thread_work_id( thread_par ); - - // Sanity check: make sure the number of threads in the parent's - // communicator is divisible by the number of new sub-groups. 
- if ( parent_nt_in % parent_n_way != 0 ) - { - printf( "Assertion failed: parent_nt_in (%d) parent_n_way (%d) != 0\n", - ( int )parent_nt_in, ( int )parent_n_way ); - bli_abort(); + if ( bli_thread_am_chief( thread_par ) && + new_comms != static_comms ) + { + bli_free_intl( new_comms ); } - //dim_t child_nt_in = bli_cntl_calc_num_threads_in( rntm, cntl_chl ); - //dim_t child_n_way = bli_rntm_ways_for( bszid_chl, rntm ); - const dim_t child_nt_in = parent_nt_in; - const dim_t child_n_way = parent_nt_in; - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - - bli_thread_barrier( thread_par ); - - // NOTE: Recall that parent_comm_id == child_comm_id, so checking for the - // parent's chief-ness is equivalent to checking for chief-ness in the new - // about-to-be-created communicator group. - thrcomm_t* new_comm = NULL; - if ( bli_thread_am_ochief( thread_par ) ) - new_comm = bli_thrcomm_create( rntm, child_nt_in ); - - // Broadcast the new thrcomm_t address to the other threads in the - // parent's group. - new_comm = bli_thread_broadcast( thread_par, new_comm ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. 
- thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - new_comm, // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - bszid_chl, // bszid - NULL // sub_node - ); - - bli_thread_barrier( thread_par ); - return thread_chl; } -// ----------------------------------------------------------------------------- - -#if 0 -void bli_thrinfo_grow_tree - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - cntl_t* cntl_jc = cntl; - thrinfo_t* thrinfo_jc = thread; - - bli_thrinfo_grow( rntm, cntl_jc, thrinfo_jc ); - - // inside jc loop: - cntl_t* cntl_pc = bli_cntl_sub_node( cntl_jc ); - thrinfo_t* thrinfo_pc = bli_thrinfo_sub_node( thrinfo_jc ); - - bli_thrinfo_grow( rntm, cntl_pc, thrinfo_pc ); - - // inside pc loop: - cntl_t* cntl_pb = bli_cntl_sub_node( cntl_pc ); - thrinfo_t* thrinfo_pb = bli_thrinfo_sub_node( thrinfo_pc ); - - bli_thrinfo_grow( rntm, cntl_pb, thrinfo_pb ); - - // after pb packing: - cntl_t* cntl_ic = bli_cntl_sub_node( cntl_pb ); - thrinfo_t* thrinfo_ic = bli_thrinfo_sub_node( thrinfo_pb ); - - bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic ); - - // -- main branch -- - - // inside ic loop: - cntl_t* cntl_pa = bli_cntl_sub_node( cntl_ic ); - thrinfo_t* thrinfo_pa = bli_thrinfo_sub_node( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa ); - - // after pa packing: - cntl_t* cntl_jr = bli_cntl_sub_node( cntl_pa ); - thrinfo_t* thrinfo_jr = bli_thrinfo_sub_node( thrinfo_pa ); - - bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr ); - - // inside jr loop: - //cntl_t* cntl_ir = bli_cntl_sub_node( cntl_jr ); - //thrinfo_t* thrinfo_ir = bli_thrinfo_sub_node( thrinfo_jr ); - - // -- trsm branch -- - - // inside ic loop: - cntl_t* cntl_pa0 = bli_cntl_sub_prenode( cntl_ic ); - thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 ); - - // after pa packing: - cntl_t* cntl_jr0 = bli_cntl_sub_node( cntl_pa0 ); - thrinfo_t* 
thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 ); - - bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 ); - - // inside jr loop: - //cntl_t* cntl_ir0 = bli_cntl_sub_node( cntl_jr0 ); - //thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 ); -} - -void bli_thrinfo_grow_tree_ic - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ) -{ - cntl_t* cntl_ic = cntl; - thrinfo_t* thrinfo_ic = thread; - - bli_thrinfo_grow( rntm, cntl_ic, thrinfo_ic ); - - // -- main branch -- - - // inside ic loop: - cntl_t* cntl_pa = bli_cntl_sub_node( cntl_ic ); - thrinfo_t* thrinfo_pa = bli_thrinfo_sub_node( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa, thrinfo_pa ); - - // after pa packing: - cntl_t* cntl_jr = bli_cntl_sub_node( cntl_pa ); - thrinfo_t* thrinfo_jr = bli_thrinfo_sub_node( thrinfo_pa ); - - bli_thrinfo_grow( rntm, cntl_jr, thrinfo_jr ); - - // inside jr loop: - //cntl_t* cntl_ir = bli_cntl_sub_node( cntl_jr ); - //thrinfo_t* thrinfo_ir = bli_thrinfo_sub_node( thrinfo_jr ); - - // -- trsm branch -- - - // inside ic loop: - cntl_t* cntl_pa0 = bli_cntl_sub_prenode( cntl_ic ); - thrinfo_t* thrinfo_pa0 = bli_thrinfo_sub_prenode( thrinfo_ic ); - - bli_thrinfo_grow( rntm, cntl_pa0, thrinfo_pa0 ); - - // after pa packing: - cntl_t* cntl_jr0 = bli_cntl_sub_node( cntl_pa0 ); - thrinfo_t* thrinfo_jr0 = bli_thrinfo_sub_node( thrinfo_pa0 ); - - bli_thrinfo_grow( rntm, cntl_jr0, thrinfo_jr0 ); - - // inside jr loop: - //cntl_t* cntl_ir0 = bli_cntl_sub_node( cntl_jr0 ); - //thrinfo_t* thrinfo_ir0= bli_thrinfo_sub_node( thrinfo_jr0 ); -} -#endif diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 6b98096849..e33a6ae71c 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -41,15 +41,16 @@ struct thrinfo_s { // The thread communicator for the other threads sharing the same work // at this level. - thrcomm_t* ocomm; + thrcomm_t* comm; - // Our thread id within the ocomm thread communicator. 
- dim_t ocomm_id; + // Our thread id within the thread communicator. + dim_t thread_id; - // The number of distinct threads used to parallelize the loop. + // The number of communicators which are "siblings" of our communicator dim_t n_way; - // What we're working on. + // What we're working on. This is the same for all threads in the same + // communicator, and 0 <= work_id < n_way. dim_t work_id; // When freeing, should the communicators in this node be freed? Usually, @@ -58,9 +59,14 @@ struct thrinfo_s // to false. bool free_comm; - // The bszid_t to help identify the node. This is mostly only useful when - // debugging or tracing the allocation and release of thrinfo_t nodes. - bszid_t bszid; + // The small block pool. + pool_t* sba_pool; + + // The packing block allocator. + pba_t* pba; + + // Storage for allocated memory obtained from the PBA. + mem_t mem; struct thrinfo_s* sub_prenode; struct thrinfo_s* sub_node; @@ -77,12 +83,12 @@ typedef struct thrinfo_s thrinfo_t; BLIS_INLINE dim_t bli_thread_num_threads( const thrinfo_t* t ) { - return (t->ocomm)->n_threads; + return (t->comm)->n_threads; } -BLIS_INLINE dim_t bli_thread_ocomm_id( const thrinfo_t* t ) +BLIS_INLINE dim_t bli_thread_thread_id( const thrinfo_t* t ) { - return t->ocomm_id; + return t->thread_id; } BLIS_INLINE dim_t bli_thread_n_way( const thrinfo_t* t ) @@ -95,9 +101,9 @@ BLIS_INLINE dim_t bli_thread_work_id( const thrinfo_t* t ) return t->work_id; } -BLIS_INLINE thrcomm_t* bli_thrinfo_ocomm( const thrinfo_t* t ) +BLIS_INLINE thrcomm_t* bli_thrinfo_comm( const thrinfo_t* t ) { - return t->ocomm; + return t->comm; } BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t ) @@ -105,9 +111,19 @@ BLIS_INLINE bool bli_thrinfo_needs_free_comm( const thrinfo_t* t ) return t->free_comm; } -BLIS_INLINE dim_t bli_thread_bszid( const thrinfo_t* t ) +BLIS_INLINE pool_t* bli_thread_sba_pool( const thrinfo_t* t ) +{ + return t->sba_pool; +} + +BLIS_INLINE pba_t* bli_thread_pba( const thrinfo_t* 
t ) +{ + return t->pba; +} + +BLIS_INLINE mem_t* bli_thread_mem( thrinfo_t* t ) { - return t->bszid; + return &t->mem; } BLIS_INLINE thrinfo_t* bli_thrinfo_sub_node( const thrinfo_t* t ) @@ -122,21 +138,21 @@ BLIS_INLINE thrinfo_t* bli_thrinfo_sub_prenode( const thrinfo_t* t ) // thrinfo_t query (complex) -BLIS_INLINE bool bli_thread_am_ochief( const thrinfo_t* t ) +BLIS_INLINE bool bli_thread_am_chief( const thrinfo_t* t ) { - return t->ocomm_id == 0; + return t->thread_id == 0; } // thrinfo_t modification -BLIS_INLINE void bli_thrinfo_set_ocomm( thrcomm_t* ocomm, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_comm( thrcomm_t* comm, thrinfo_t* t ) { - t->ocomm = ocomm; + t->comm = comm; } -BLIS_INLINE void bli_thrinfo_set_ocomm_id( dim_t ocomm_id, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_thread_id( dim_t thread_id, thrinfo_t* t ) { - t->ocomm_id = ocomm_id; + t->thread_id = thread_id; } BLIS_INLINE void bli_thrinfo_set_n_way( dim_t n_way, thrinfo_t* t ) @@ -154,9 +170,14 @@ BLIS_INLINE void bli_thrinfo_set_free_comm( bool free_comm, thrinfo_t* t ) t->free_comm = free_comm; } -BLIS_INLINE void bli_thrinfo_set_bszid( bszid_t bszid, thrinfo_t* t ) +BLIS_INLINE void bli_thrinfo_set_sba_pool( pool_t* sba_pool, thrinfo_t* t ) +{ + t->sba_pool = sba_pool; +} + +BLIS_INLINE void bli_thrinfo_set_pba( pba_t* pba, thrinfo_t* t ) { - t->bszid = bszid; + t->pba = pba; } BLIS_INLINE void bli_thrinfo_set_sub_node( thrinfo_t* sub_node, thrinfo_t* t ) @@ -173,12 +194,12 @@ BLIS_INLINE void bli_thrinfo_set_sub_prenode( thrinfo_t* sub_prenode, thrinfo_t* BLIS_INLINE void* bli_thread_broadcast( const thrinfo_t* t, void* p ) { - return bli_thrcomm_bcast( t->ocomm_id, p, t->ocomm ); + return bli_thrcomm_bcast( t->thread_id, p, t->comm ); } BLIS_INLINE void bli_thread_barrier( const thrinfo_t* t ) { - bli_thrcomm_barrier( t->ocomm_id, t->ocomm ); + bli_thrcomm_barrier( t->thread_id, t->comm ); } @@ -186,98 +207,36 @@ BLIS_INLINE void bli_thread_barrier( const thrinfo_t* t ) // 
Prototypes for level-3 thrinfo functions not specific to any operation. // -thrinfo_t* bli_thrinfo_create +thrinfo_t* bli_thrinfo_create_root ( - rntm_t* rntm, - thrcomm_t* ocomm, - dim_t ocomm_id, - dim_t n_way, - dim_t work_id, - bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node + thrcomm_t* comm, + dim_t thread_id, + pool_t* sba_pool, + pba_t* pba ); -void bli_thrinfo_init +thrinfo_t* bli_thrinfo_create ( - thrinfo_t* thread, - thrcomm_t* ocomm, - dim_t ocomm_id, + thrcomm_t* comm, + dim_t thread_id, dim_t n_way, dim_t work_id, bool free_comm, - bszid_t bszid, - thrinfo_t* sub_node - ); - -void bli_thrinfo_init_single - ( - thrinfo_t* thread + pool_t* sba_pool, + pba_t* pba ); void bli_thrinfo_free ( - rntm_t* rntm, thrinfo_t* thread ); // ----------------------------------------------------------------------------- -void bli_thrinfo_grow - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -thrinfo_t* bli_thrinfo_rgrow +thrinfo_t* bli_thrinfo_split ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_create_for_cntl - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_rgrow_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_create_for_cntl_prenode - ( - rntm_t* rntm, - cntl_t* cntl_par, - cntl_t* cntl_chl, + dim_t n_way, thrinfo_t* thread_par ); -// ----------------------------------------------------------------------------- - -#if 0 -void bli_thrinfo_grow_tree - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); - -void bli_thrinfo_grow_tree_ic - ( - rntm_t* rntm, - cntl_t* cntl, - thrinfo_t* thread - ); -#endif - #endif diff --git a/frame/thread/bli_thrinfo_sup.c b/frame/thread/bli_thrinfo_sup.c deleted file mode 100644 index 966247fd04..0000000000 --- a/frame/thread/bli_thrinfo_sup.c +++ /dev/null @@ -1,290 +0,0 @@ -/* - - BLIS - An 
object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#include "blis.h" - -void bli_thrinfo_sup_grow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - thrinfo_t* thread - ) -{ - if ( thread == &BLIS_GEMM_SINGLE_THREADED || - thread == &BLIS_PACKM_SINGLE_THREADED ) return; - - // NOTE: If bli_thrinfo_sup_rgrow() is being called, the sub_node field will - // always be non-NULL, and so there's no need to check it. - //if ( bli_cntl_sub_node( cntl ) != NULL ) - { - // We only need to take action if the thrinfo_t sub-node is NULL; if it - // is non-NULL, then it has already been created and we'll use it as-is. - if ( bli_thrinfo_sub_node( thread ) == NULL ) - { - // Create a new node (or, if needed, multiple nodes) along the - // main sub-node branch of the tree and return the pointer to the - // (highest) child. - thrinfo_t* thread_child = bli_thrinfo_sup_rgrow - ( - rntm, - bszid_par, - &bszid_par[1], - thread - ); - - // Attach the child thrinfo_t node for the primary branch to its - // parent structure. - bli_thrinfo_set_sub_node( thread_child, thread ); - } - } -} - -// ----------------------------------------------------------------------------- - -thrinfo_t* bli_thrinfo_sup_rgrow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_cur, - thrinfo_t* thread_par - ) -{ - thrinfo_t* thread_cur; - - // We must handle two cases: those where the next node in the - // control tree is a partitioning node, and those where it is - // a non-partitioning (ie: packing) node. - if ( *bszid_cur != BLIS_NO_PART ) - { - // Create the child thrinfo_t node corresponding to cntl_cur, - // with cntl_par being the parent. - thread_cur = bli_thrinfo_sup_create_for_cntl - ( - rntm, - bszid_par, - bszid_cur, - thread_par - ); - } - else // if ( *bszid_cur == BLIS_NO_PART ) - { - // Recursively grow the thread structure and return the top-most - // thrinfo_t node of that segment. 
- thrinfo_t* thread_seg = bli_thrinfo_sup_rgrow - ( - rntm, - bszid_par, - &bszid_cur[1], - thread_par - ); - - // Create a thrinfo_t node corresponding to cntl_cur. Since the - // corresponding cntl node, cntl_cur, is a non-partitioning node - // (bszid = BLIS_NO_PART), this means it's a packing node. Packing - // thrinfo_t nodes are formed differently than those corresponding to - // partitioning nodes; specifically, their work_id's are set equal to - // the their comm_id's. Also, notice that the free_comm field is set - // to FALSE since cntl_cur is a non-partitioning node. The reason: - // the communicator used here will be freed when thread_seg, or one - // of its descendents, is freed. - thread_cur = bli_thrinfo_create - ( - rntm, // rntm - bli_thrinfo_ocomm( thread_seg ), // ocomm - bli_thread_ocomm_id( thread_seg ), // ocomm_id - bli_rntm_calc_num_threads_in( bszid_cur, rntm ), // n_way - bli_thread_ocomm_id( thread_seg ), // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - thread_seg // sub_node - ); - } - - return thread_cur; -} - -#define BLIS_NUM_STATIC_COMMS 80 - -thrinfo_t* bli_thrinfo_sup_create_for_cntl - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_chl, - thrinfo_t* thread_par - ) -{ - // If we are running with a single thread, all of the code can be reduced - // and simplified to this. - if ( bli_rntm_calc_num_threads( rntm ) == 1 ) - { - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - &BLIS_SINGLE_COMM, // ocomm - 0, // ocomm_id - 1, // n_way - 0, // work_id - FALSE, // free_comm - BLIS_NO_PART, // bszid - NULL // sub_node - ); - - return thread_chl; - } - - // The remainder of this function handles the cases involving the use of - // multiple BLIS threads. 
- - if ( bli_rntm_pack_a( rntm ) == FALSE && - bli_rntm_pack_b( rntm ) == FALSE ) - { - // If we are packing neither A nor B, there are no broadcasts or barriers - // needed to synchronize threads (since all threads can work completely - // independently). In this special case situation, the thrinfo_t can be - // created with much simpler logic. - - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - - // Compute: - // - the number of threads inside the new child comm, - // - the current thread's id within the new communicator, - // - the current thread's work id, given the ways of parallelism - // to be obtained within the next loop. - const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); - const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. - thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - NULL, // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - *bszid_chl, // bszid - NULL // sub_node - ); - - return thread_chl; - } - else - { - // If we are packing at least one of A or B, then we use the general - // approach that employs broadcasts and barriers. - - thrcomm_t* static_comms[ BLIS_NUM_STATIC_COMMS ]; - thrcomm_t** new_comms = NULL; - - const dim_t parent_nt_in = bli_thread_num_threads( thread_par ); - const dim_t parent_n_way = bli_thread_n_way( thread_par ); - const dim_t parent_comm_id = bli_thread_ocomm_id( thread_par ); - const dim_t parent_work_id = bli_thread_work_id( thread_par ); - - // Sanity check: make sure the number of threads in the parent's - // communicator is divisible by the number of new sub-groups. 
- if ( parent_nt_in % parent_n_way != 0 ) - { - printf( "Assertion failed: parent_nt_in parent_n_way != 0\n" ); - bli_abort(); - } - - // Compute: - // - the number of threads inside the new child comm, - // - the current thread's id within the new communicator, - // - the current thread's work id, given the ways of parallelism - // to be obtained within the next loop. - const dim_t child_nt_in = bli_rntm_calc_num_threads_in( bszid_chl, rntm ); - const dim_t child_n_way = bli_rntm_ways_for( *bszid_chl, rntm ); - const dim_t child_comm_id = parent_comm_id % child_nt_in; - const dim_t child_work_id = child_comm_id / ( child_nt_in / child_n_way ); - -//printf( "thread %d: child_n_way = %d child_nt_in = %d parent_n_way = %d (bszid = %d->%d)\n", (int)child_comm_id, (int)child_nt_in, (int)child_n_way, (int)parent_n_way, (int)bli_cntl_bszid( cntl_par ), (int)bszid_chl ); - - // The parent's chief thread creates a temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - err_t r_val; - - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - new_comms = bli_malloc_intl( parent_n_way * sizeof( thrcomm_t* ), &r_val ); - else - new_comms = static_comms; - } - - // Broadcast the temporary array to all threads in the parent's - // communicator. - new_comms = bli_thread_broadcast( thread_par, new_comms ); - - // Chiefs in the child communicator allocate the communicator - // object and store it in the array element corresponding to the - // parent's work id. - if ( child_comm_id == 0 ) - new_comms[ parent_work_id ] = bli_thrcomm_create( rntm, child_nt_in ); - - bli_thread_barrier( thread_par ); - - // All threads create a new thrinfo_t node using the communicator - // that was created by their chief, as identified by parent_work_id. 
- thrinfo_t* thread_chl = bli_thrinfo_create - ( - rntm, // rntm - new_comms[ parent_work_id ], // ocomm - child_comm_id, // ocomm_id - child_n_way, // n_way - child_work_id, // work_id - TRUE, // free_comm - *bszid_chl, // bszid - NULL // sub_node - ); - - bli_thread_barrier( thread_par ); - - // The parent's chief thread frees the temporary array of thrcomm_t - // pointers. - if ( bli_thread_am_ochief( thread_par ) ) - { - if ( parent_n_way > BLIS_NUM_STATIC_COMMS ) - bli_free_intl( new_comms ); - } - - return thread_chl; - } -} - diff --git a/frame/thread/bli_thrinfo_sup.h b/frame/thread/bli_thrinfo_sup.h deleted file mode 100644 index 1afcd3337e..0000000000 --- a/frame/thread/bli_thrinfo_sup.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifndef BLIS_THRINFO_SUP_H -#define BLIS_THRINFO_SUP_H - -// -// Prototypes for level-3 thrinfo sup functions. -// - -void bli_thrinfo_sup_grow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - thrinfo_t* thread - ); - -thrinfo_t* bli_thrinfo_sup_rgrow - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_cur, - thrinfo_t* thread_par - ); - -thrinfo_t* bli_thrinfo_sup_create_for_cntl - ( - rntm_t* rntm, - const bszid_t* bszid_par, - const bszid_t* bszid_chl, - thrinfo_t* thread_par - ); - -#endif diff --git a/kernels/zen/1/bli_scalv_zen_int10.c b/kernels/zen/1/bli_scalv_zen_int10.c index b06de5a3d8..0929d6dc47 100644 --- a/kernels/zen/1/bli_scalv_zen_int10.c +++ b/kernels/zen/1/bli_scalv_zen_int10.c @@ -82,8 +82,6 @@ void bli_sscalv_zen_int10 { float* zero = bli_s0; - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - ssetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_SETV_KER, cntx ); f @@ -276,8 +274,6 @@ void bli_dscalv_zen_int10 { double* zero = bli_d0; - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - dsetv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_SETV_KER, cntx ); f diff --git a/kernels/zen/1f/bli_axpyf_zen_int_4.c b/kernels/zen/1f/bli_axpyf_zen_int_4.c index 0ec5f44f53..bd7a554fa4 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_4.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_4.c @@ -79,8 +79,6 @@ void bli_caxpyf_zen_int_4 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - caxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_SCOMPLEX, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) diff --git a/kernels/zen/1f/bli_axpyf_zen_int_5.c b/kernels/zen/1f/bli_axpyf_zen_int_5.c index 1566f98091..145005582d 100644 --- a/kernels/zen/1f/bli_axpyf_zen_int_5.c +++ b/kernels/zen/1f/bli_axpyf_zen_int_5.c @@ -108,8 +108,6 @@ void bli_saxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - saxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_FLOAT, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -360,8 +358,6 @@ void bli_daxpyf_zen_int_5 // operation as a loop over axpyv. if ( b_n != fuse_fac ) { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) @@ -603,7 +599,7 @@ void bli_daxpyf_zen_int_16x2 v2df_t a40v, a41v; - v2df_t y4v; + v2df_t y4v; // If either dimension is zero, or if alpha is zero, return early. if ( bli_zero_dim2( m, b_n ) || bli_deq0( *alpha ) ) return; @@ -899,8 +895,6 @@ void bli_daxpyf_zen_int_16x4 // operation as a loop over axpyv. 
if ( b_n != fuse_fac ) { - if ( cntx == NULL ) cntx = bli_gks_query_cntx(); - daxpyv_ker_ft f = bli_cntx_get_ukr_dt( BLIS_DOUBLE, BLIS_AXPYV_KER, cntx ); for ( i = 0; i < b_n; ++i ) diff --git a/ref_kernels/bli_cntx_ref.c b/ref_kernels/bli_cntx_ref.c index e094db54ba..168c61f629 100644 --- a/ref_kernels/bli_cntx_ref.c +++ b/ref_kernels/bli_cntx_ref.c @@ -284,22 +284,22 @@ void GENBARNAME(cntx_init) bli_cntx_set_blkszs ( cntx, - BLIS_NC, &blkszs[ BLIS_NC ], BLIS_NR, - BLIS_KC, &blkszs[ BLIS_KC ], BLIS_KR, - BLIS_MC, &blkszs[ BLIS_MC ], BLIS_MR, - BLIS_NR, &blkszs[ BLIS_NR ], BLIS_NR, - BLIS_MR, &blkszs[ BLIS_MR ], BLIS_MR, - BLIS_KR, &blkszs[ BLIS_KR ], BLIS_KR, - BLIS_M2, &blkszs[ BLIS_M2 ], BLIS_M2, - BLIS_N2, &blkszs[ BLIS_N2 ], BLIS_N2, - BLIS_AF, &blkszs[ BLIS_AF ], BLIS_AF, - BLIS_DF, &blkszs[ BLIS_DF ], BLIS_DF, - BLIS_XF, &blkszs[ BLIS_XF ], BLIS_XF, - BLIS_MT, &blkszs[ BLIS_MT ], BLIS_MT, - BLIS_NT, &blkszs[ BLIS_NT ], BLIS_NT, - BLIS_KT, &blkszs[ BLIS_KT ], BLIS_KT, - BLIS_BBM, &blkszs[ BLIS_BBM ], BLIS_BBM, - BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_BBN, + BLIS_NC, &blkszs[ BLIS_NC ], + BLIS_KC, &blkszs[ BLIS_KC ], + BLIS_MC, &blkszs[ BLIS_MC ], + BLIS_NR, &blkszs[ BLIS_NR ], + BLIS_MR, &blkszs[ BLIS_MR ], + BLIS_KR, &blkszs[ BLIS_KR ], + BLIS_M2, &blkszs[ BLIS_M2 ], + BLIS_N2, &blkszs[ BLIS_N2 ], + BLIS_AF, &blkszs[ BLIS_AF ], + BLIS_DF, &blkszs[ BLIS_DF ], + BLIS_XF, &blkszs[ BLIS_XF ], + BLIS_MT, &blkszs[ BLIS_MT ], + BLIS_NT, &blkszs[ BLIS_NT ], + BLIS_KT, &blkszs[ BLIS_KT ], + BLIS_BBM, &blkszs[ BLIS_BBM ], + BLIS_BBN, &blkszs[ BLIS_BBN ], BLIS_VA_END ); @@ -308,16 +308,11 @@ void GENBARNAME(cntx_init) funcs = cntx->ukrs; - // NOTE: We set the virtual micro-kernel slots to contain the addresses - // of the native micro-kernels. 
In general, the ukernels in the virtual - // ukernel slots are always called, and if the function called happens to - // be a virtual micro-kernel, it will then know to find its native ukernel - // (i.e., in the native ukernel slots). - gen_func_init( &funcs[ BLIS_GEMM_VIR_UKR ], gemm_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm_l_ukr_name ); - gen_func_init( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm_u_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMM1M_UKR ], gemm1m_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM1M_L_UKR ], gemmtrsm1m_l_ukr_name ); + gen_func_init( &funcs[ BLIS_GEMMTRSM1M_U_UKR ], gemmtrsm1m_u_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM1M_L_UKR ], trsm1m_l_ukr_name ); + gen_func_init( &funcs[ BLIS_TRSM1M_U_UKR ], trsm1m_u_ukr_name ); // -- Set level-3 native micro-kernels and preferences --------------------- @@ -427,157 +422,5 @@ void GENBARNAME(cntx_init) // Set the gemm slot to the default gemm sup handler. vfuncs[ BLIS_GEMM ] = bli_gemmsup_ref; vfuncs[ BLIS_GEMMT ] = bli_gemmtsup_ref; - - - // -- Set miscellaneous fields --------------------------------------------- - - bli_cntx_set_method( BLIS_NAT, cntx ); -} - -// ----------------------------------------------------------------------------- - -void GENBAINAME(cntx_init) - ( - ind_t method, - cntx_t* cntx - ) -{ - func_t* funcs; - - // This function is designed to modify a copy of an existing native - // context to enable computation via an induced method for complex - // domain level-3 operations. It is called by bli_gks_query_ind_cntx() - // on a context after its contexts are set by copying from the - // architecture's native context. 
- - // -- Set induced method level-3 virtual micro-kernels --------------------- - - funcs = cntx->ukrs; - - if ( method == BLIS_1M ) - { - gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ], gemm1m_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm1m_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm1m_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm1m_u_ukr_name ); - } - else // if ( method == BLIS_NAT ) - { - gen_func_init_co( &funcs[ BLIS_GEMM_VIR_UKR ], gemm_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_L_VIR_UKR ], gemmtrsm_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_GEMMTRSM_U_VIR_UKR ], gemmtrsm_u_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_L_VIR_UKR ], trsm_l_ukr_name ); - gen_func_init_co( &funcs[ BLIS_TRSM_U_VIR_UKR ], trsm_u_ukr_name ); - } - - // For 1m, we employ an optimization which requires that we copy the native - // real domain gemm ukernel function pointers to the corresponding real - // domain slots in the virtual gemm ukernel func_t. This optimization allows - // us to, under certain conditions, adjust various parameters within the gemm - // macrokernel so that the real-domain macrokernel (which will query and use - // the real-domain virtual gemm ukernel) can be called instead of calling the - // complex-domain macrokernel and the corresponding complex-domain virtual - // microkernel. The non-optimized code path would require an extra level of - // function call overhead, which can be avoided in most cases (i.e., when - // beta has a zero imaginary component and C is either row- or column-stored). 
- if ( method == BLIS_1M ) - { - func_t* gemm_nat_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_UKR, cntx ); - func_t* gemm_vir_ukrs = ( func_t* )bli_cntx_get_ukrs( BLIS_GEMM_VIR_UKR, cntx ); - - bli_func_copy_dt( BLIS_FLOAT, gemm_nat_ukrs, BLIS_FLOAT, gemm_vir_ukrs ); - bli_func_copy_dt( BLIS_DOUBLE, gemm_nat_ukrs, BLIS_DOUBLE, gemm_vir_ukrs ); - } - - - // -- Set induced method packm kernels ------------------------------------- - - if ( method == BLIS_1M ) - { - gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_1er_ker_name ); - } - else // if ( method == BLIS_NAT ) - { - gen_func_init( &funcs[ BLIS_PACKM_MRXK_KER ], packm_mrxk_ker_name ); - gen_func_init( &funcs[ BLIS_PACKM_NRXK_KER ], packm_nrxk_ker_name ); - } - - gen_func_init_co( &funcs[ BLIS_PACKM_MRXK_1ER_KER ], packm_mrxk_1er_ker_name ); - gen_func_init_co( &funcs[ BLIS_PACKM_NRXK_1ER_KER ], packm_nrxk_1er_ker_name ); - - gen_func_init( &funcs[ BLIS_UNPACKM_MRXK_KER ], unpackm_mrxk_ker_name ); - gen_func_init( &funcs[ BLIS_UNPACKM_NRXK_KER ], unpackm_nrxk_ker_name ); - - - // -- Set induced method cache and register blocksizes --------------------- - - // Modify the context with cache and register blocksizes (and multiples) - // appropriate for the current induced method. - if ( method == BLIS_1M ) - { - //const bool is_pb = FALSE; - - // Call a helper function to initialize blocksizes for each complex - // datatype. - GENBAINAME(cntx_init_blkszs)( method, BLIS_SCOMPLEX, cntx ); - GENBAINAME(cntx_init_blkszs)( method, BLIS_DCOMPLEX, cntx ); - } - else // if ( method == BLIS_NAT ) - { - // No change in blocksizes needed for native execution. - } -} - -// ----------------------------------------------------------------------------- - -void GENBAINAME(cntx_init_blkszs) - ( - ind_t method, - num_t dt, - cntx_t* cntx - ) -{ - // Set the induced method in the context. 
- bli_cntx_set_method( method, cntx ); - - num_t dt_r = bli_dt_proj_to_real( dt ); - - // Initialize the blocksizes according to the micro-kernel preference as - // well as the algorithm. - //if ( bli_cntx_ukr_prefers_cols_dt( dt, BLIS_GEMM_UKR, cntx ) ) - if ( ! bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) - { - // This branch is used for algorithm 1m_c_bp. - - bli_cntx_set_ind_blkszs - ( - method, dt, cntx, - BLIS_NC, 1.0, 1.0, - BLIS_KC, 2.0, 2.0, // halve kc... - BLIS_MC, 2.0, 2.0, // halve mc... - BLIS_NR, 1.0, 1.0, - BLIS_MR, 2.0, 1.0, // ...and mr (but NOT packmr) - BLIS_KR, 1.0, 1.0, - BLIS_VA_END - ); - } - else // if ( bli_cntx_get_ukr_prefs_dt( dt, BLIS_GEMM_UKR_ROW_PREF, cntx ) ) - { - // This branch is used for algorithm 1m_r_bp. - - bli_cntx_set_ind_blkszs - ( - method, dt, cntx, - BLIS_NC, 2.0, 2.0, // halve nc... - BLIS_KC, 2.0, 2.0, // halve kc... - BLIS_MC, 1.0, 1.0, - BLIS_NR, 2.0, 1.0, // ...and nr (but NOT packnr) - BLIS_MR, 1.0, 1.0, - BLIS_KR, 1.0, 1.0, - BLIS_VA_END - ); - } } diff --git a/ref_kernels/ind/bli_gemm1m_ref.c b/ref_kernels/ind/bli_gemm1m_ref.c index 317cf26048..ddffef3316 100644 --- a/ref_kernels/ind/bli_gemm1m_ref.c +++ b/ref_kernels/ind/bli_gemm1m_ref.c @@ -56,7 +56,7 @@ void PASTEMAC3(ch,opname,arch,suf) \ \ PASTECH(chr,gemm_ukr_ft) \ rgemm_ukr = bli_cntx_get_ukr_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ - const bool col_pref = bli_cntx_ukr_prefers_cols_dt( dt_r, BLIS_GEMM_UKR, cntx ); \ + const bool col_pref = bli_cntx_get_ukr_prefs_dt( dt_r, BLIS_GEMM_UKR_ROW_PREF, cntx ); \ const bool row_pref = !col_pref; \ \ const dim_t mr = bli_cntx_get_blksz_def_dt( dt, BLIS_MR, cntx ); \ @@ -76,14 +76,14 @@ void PASTEMAC3(ch,opname,arch,suf) \ ctype_r* restrict a_r = ( ctype_r* )a; \ \ ctype_r* restrict b_r = ( ctype_r* )b; \ -\ - ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ ctype_r* restrict alpha_r = &PASTEMAC(ch,real)( *alpha ); \ ctype_r* restrict alpha_i = &PASTEMAC(ch,imag)( *alpha ); \ \ ctype_r* restrict 
beta_r = &PASTEMAC(ch,real)( *beta ); \ ctype_r* restrict beta_i = &PASTEMAC(ch,imag)( *beta ); \ +\ + ctype_r* restrict zero_r = PASTEMAC(chr,0); \ \ ctype_r* c_use; \ inc_t rs_c_use; \ @@ -123,11 +123,6 @@ void PASTEMAC3(ch,opname,arch,suf) \ else if ( bli_is_gen_stored( rs_c, cs_c ) ) using_ct = TRUE; \ else using_ct = FALSE; \ \ -\ - /* If we are not computing a full micro-tile, then we must write to - ct and then accumulate to c afterwards. */ \ - if ( mr != m || nr != n ) using_ct = TRUE; \ -\ \ if ( using_ct ) \ { \ @@ -171,37 +166,13 @@ void PASTEMAC3(ch,opname,arch,suf) \ cntx \ ); \ \ - dim_t i, j; \ -\ - /* Accumulate the final result in ct back to c. */ \ - if ( PASTEMAC(ch,eq1)( *beta ) ) \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - PASTEMAC(ch,adds)( *(ct + i*rs_ct + j*cs_ct), \ - *(c + i*rs_c + j*cs_c ) ); \ - } \ - } \ - else if ( PASTEMAC(ch,eq0)( *beta ) ) \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - PASTEMAC(ch,copys)( *(ct + i*rs_ct + j*cs_ct), \ - *(c + i*rs_c + j*cs_c ) ); \ - } \ - } \ - else \ - { \ - for ( j = 0; j < n; ++j ) \ - for ( i = 0; i < m; ++i ) \ - { \ - PASTEMAC(ch,xpbys)( *(ct + i*rs_ct + j*cs_ct), \ - *beta, \ - *(c + i*rs_c + j*cs_c ) ); \ - } \ - } \ + PASTEMAC3(ch,ch,ch,xpbys_mxn) \ + ( \ + m, n, \ + ct, rs_ct, cs_ct, \ + beta, \ + c, rs_c, cs_c \ + ); \ } \ else \ { \ From 23fd6a27c8d8914b419f8822c7e6b7d625b12da8 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 13 Jul 2022 15:28:05 -0500 Subject: [PATCH 23/32] Add `#line` directives to flattened `blis.h`. This enables better debugging since errors will show up based on the un-flattened filename and line number. 
--- build/flatten-headers.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/build/flatten-headers.py b/build/flatten-headers.py index 563725a7e9..3863d196e3 100755 --- a/build/flatten-headers.py +++ b/build/flatten-headers.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# BLIS +# BLIS # An object-based framework for developing high-performance BLAS-like # libraries. # @@ -216,7 +216,9 @@ def flatten_header( inputfile, header_dirpaths, cursp ): ifile = open( inputfile, "r" ) # Iterate over the lines in the file. + lineno = 0 while True: + lineno += 1 # Read a line in the file. line = ifile.readline() @@ -268,12 +270,14 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Mark the beginning of the header being inserted. ostring += "%s%s%c" % ( beginstr, header, '\n' ) + ostring += "#line %d \"%s\"%c\n" % ( 1, header_path, '\n' ) # Recurse on the header, accumulating the string. ostring += flatten_header( header_path, header_dirpaths, cursp + " " ) # Mark the end of the header being inserted. ostring += "%s%s%c" % ( endstr, header, '\n' ) + ostring += "#line %d \"%s\"%c\n" % ( lineno+1, inputfile, '\n' ) echov2( "%sheader file '%s' fully processed." \ % ( cursp, header_path ) ) @@ -300,7 +304,7 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # endif # endwhile - + # Close the input file. ifile.close() @@ -330,7 +334,7 @@ def find_header_dirs( dirpath ): #endfor return header_dirpaths - + # ------------------------------------------------------------------------------ From e44c123b38e53f204a67804faad6dc6cbfbf893a Mon Sep 17 00:00:00 2001 From: "Field G. Van Zee" Date: Mon, 25 Jul 2022 17:44:05 -0500 Subject: [PATCH 24/32] Comment updates. 
--- build/flatten-headers.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/build/flatten-headers.py b/build/flatten-headers.py index 3863d196e3..40fc2a4507 100755 --- a/build/flatten-headers.py +++ b/build/flatten-headers.py @@ -215,9 +215,17 @@ def flatten_header( inputfile, header_dirpaths, cursp ): # Open the input file to process. ifile = open( inputfile, "r" ) - # Iterate over the lines in the file. + # A counter to track the line number being parsed within the current file. + # This counter, when selectively encoded into the flattened header via #line + # directives, facilitates easier debugging. (When the compiler finds an + # issue, it will be able to refer to the line number within the constituent + # header file rather than the flattened one.) lineno = 0 + + # Iterate over the lines in the file. while True: + + # Increment the line number. lineno += 1 # Read a line in the file. @@ -335,7 +343,6 @@ def find_header_dirs( dirpath ): return header_dirpaths - # ------------------------------------------------------------------------------ # Global variables. From 16336ce2aa09faef71a1a8f508523c40acd0c969 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 7 Sep 2022 16:40:11 -0500 Subject: [PATCH 25/32] De-template the sup_var1n2m code and combine A/B packing functions. 
--- frame/3/bli_l3.h | 3 +- frame/3/bli_l3_oapi_ex.c | 32 + frame/3/bli_l3_sup_packm.c | 435 ++++++++ frame/3/bli_l3_sup_packm.h | 95 ++ frame/3/bli_l3_sup_packm_a.c | 430 -------- frame/3/bli_l3_sup_packm_a.h | 118 --- frame/3/bli_l3_sup_packm_b.c | 430 -------- frame/3/bli_l3_sup_packm_b.h | 118 --- frame/3/bli_l3_sup_packm_var.c | 38 +- frame/3/bli_l3_sup_packm_var.h | 38 +- frame/3/bli_l3_sup_var1n2m.c | 1754 ++++++++++++------------------- frame/3/bli_l3_sup_vars.h | 26 - frame/3/gemm/bli_gemm_front.c | 16 - frame/3/gemmt/bli_gemmt_front.c | 16 - 14 files changed, 1293 insertions(+), 2256 deletions(-) create mode 100644 frame/3/bli_l3_sup_packm.c create mode 100644 frame/3/bli_l3_sup_packm.h delete mode 100644 frame/3/bli_l3_sup_packm_a.c delete mode 100644 frame/3/bli_l3_sup_packm_a.h delete mode 100644 frame/3/bli_l3_sup_packm_b.c delete mode 100644 frame/3/bli_l3_sup_packm_b.h diff --git a/frame/3/bli_l3.h b/frame/3/bli_l3.h index 4dc1a9d545..9d39fc47db 100644 --- a/frame/3/bli_l3.h +++ b/frame/3/bli_l3.h @@ -71,8 +71,7 @@ #include "bli_l3_sup_ref.h" #include "bli_l3_sup_int.h" #include "bli_l3_sup_vars.h" -#include "bli_l3_sup_packm_a.h" -#include "bli_l3_sup_packm_b.h" +#include "bli_l3_sup_packm.h" #include "bli_l3_sup_packm_var.h" // Prototype microkernel wrapper APIs. diff --git a/frame/3/bli_l3_oapi_ex.c b/frame/3/bli_l3_oapi_ex.c index 20b0294eb0..b58244b8c7 100644 --- a/frame/3/bli_l3_oapi_ex.c +++ b/frame/3/bli_l3_oapi_ex.c @@ -55,6 +55,22 @@ void PASTEMAC(gemm,BLIS_OAPI_EX_SUF) { bli_init_once(); + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + // If the rntm is non-NULL, it may indicate that we should forgo sup // handling altogether. 
bool enable_sup = TRUE; @@ -128,6 +144,22 @@ void PASTEMAC(gemmt,BLIS_OAPI_EX_SUF) { bli_init_once(); + // If C has a zero dimension, return early. + if ( bli_obj_has_zero_dim( c ) ) + { + return; + } + + // If alpha is zero, or if A or B has a zero dimension, scale C by beta + // and return early. + if ( bli_obj_equals( alpha, &BLIS_ZERO ) || + bli_obj_has_zero_dim( a ) || + bli_obj_has_zero_dim( b ) ) + { + bli_scalm( beta, c ); + return; + } + // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. rntm_t rntm_l; diff --git a/frame/3/bli_l3_sup_packm.c b/frame/3/bli_l3_sup_packm.c new file mode 100644 index 0000000000..0cc56f9c4b --- /dev/null +++ b/frame/3/bli_l3_sup_packm.c @@ -0,0 +1,435 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +void bli_packm_sup_init_mem + ( + bool will_pack, + packbuf_t pack_buf_type, + num_t dt, + dim_t m, + dim_t k, + dim_t mr, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ) +{ + /* Inspect whether we are going to be packing matrix A. */ + if ( will_pack == FALSE ) + { + } + else /* if ( will_pack == TRUE ) */ + { + /* NOTE: This "rounding up" of the last upanel is actually optional + for the rrc/crc cases, but absolutely necessary for the other cases + since we NEED that last micropanel to have the same ldim (cs_p) as + the other micropanels. Why? So that millikernels can use the same + upanel ldim for all iterations of the ir loop. */ + const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; + const dim_t k_pack = k; + + /* Barrier to make sure all threads are caught up and ready to begin + the packm stage. */ + bli_thread_barrier( thread ); + + /* Compute the size of the memory block eneded. */ + siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack; + + /* Check the mem_t entry provided by the caller. If it is unallocated, + then we need to acquire a block from the memory broker. */ + if ( bli_mem_is_unalloc( mem ) ) + { + if ( bli_thread_am_ochief( thread ) ) + { + /* Acquire directly to the chief thread's mem_t that was + passed in. 
It needs to be that mem_t struct, and not a + local (temporary) mem_t, since there is no barrier until + after packing is finished, which could allow a race + condition whereby the chief thread exits the current + function before the other threads have a chance to copy + from it. (A barrier would fix that race condition, but + then again, I prefer to keep barriers to a minimum.) */ + bli_pba_acquire_m + ( + rntm, + size_needed, + pack_buf_type, + mem + ); + } + + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. */ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); + + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ + if ( !bli_thread_am_ochief( thread ) ) + { + *mem = *mem_p; + } + } + else /* if ( bli_mem_is_alloc( mem ) ) */ + { + /* If the mem_t entry provided by the caller does NOT contain a NULL + buffer, then a block has already been acquired from the memory + broker and cached by the caller. */ + + /* As a sanity check, we should make sure that the mem_t object isn't + associated with a block that is too small compared to the size of + the packed matrix buffer that is needed, according to the value + computed above. */ + siz_t mem_size = bli_mem_size( mem ); + + if ( mem_size < size_needed ) + { + if ( bli_thread_am_ochief( thread ) ) + { + /* The chief thread releases the existing block associated + with the mem_t, and then re-acquires a new block, saving + the associated mem_t to its passed-in mem_t. (See coment + above for why the acquisition needs to be directly to + the chief thread's passed-in mem_t and not a local + (temporary) mem_t. */ + bli_pba_release + ( + rntm, + mem + ); + bli_pba_acquire_m + ( + rntm, + size_needed, + pack_buf_type, + mem + ); + } + + /* Broadcast the address of the chief thread's passed-in mem_t + to all threads. 
*/ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); + + /* Non-chief threads: Copy the contents of the chief thread's + passed-in mem_t to the passed-in mem_t for this thread. (The + chief thread already has the mem_t, so it does not need to + perform any copy.) */ + if ( !bli_thread_am_ochief( thread ) ) + { + *mem = *mem_p; + } + } + else + { + /* If the mem_t entry is already allocated and sufficiently large, + then we use it as-is. No action is needed. */ + } + } + } +} + +void bli_packm_sup_finalize_mem + ( + bool did_pack, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ) +{ + /* Inspect whether we previously packed matrix A. */ + if ( did_pack == FALSE ) + { + /* If we didn't pack matrix A, there's nothing to be done. */ + } + else /* if ( did_pack == TRUE ) */ + { + if ( thread != NULL ) + if ( bli_thread_am_ochief( thread ) ) + { + /* Check the mem_t entry provided by the caller. Only proceed if it + is allocated, which it should be. */ + if ( bli_mem_is_alloc( mem ) ) + { + bli_pba_release + ( + rntm, + mem + ); + } + } + } +} + +void bli_packm_sup_init + ( + bool will_pack, + stor3_t stor_id, + pack_t* schema, + dim_t m, + dim_t k, + dim_t mr, + dim_t* m_max, + dim_t* k_max, + const void* x, inc_t rs_x, inc_t cs_x, + void** p, inc_t* rs_p, inc_t* cs_p, + dim_t* pd_p, inc_t* ps_p, + mem_t* mem + ) +{ + /* Inspect whether we are going to be packing matrix A. */ + if ( will_pack == FALSE ) + { + *m_max = m; + *k_max = k; + + /* Set the parameters for use with no packing of A (ie: using the + source matrix A directly). */ + { + /* Use the strides of the source matrix as the final values. */ + *rs_p = rs_x; + *cs_p = cs_x; + + *pd_p = mr; + *ps_p = mr * rs_x; + + /* Set the schema to "not packed" to indicate that packing will be + skipped. */ + *schema = BLIS_NOT_PACKED; + } + + /* Since we won't be packing, simply update the buffer address provided + by the caller to point to source matrix. 
*/ + *p = ( void* )x; + } + else /* if ( will_pack == TRUE ) */ + { + /* NOTE: This is "rounding up" of the last upanel is actually optional + for the rrc/crc cases, but absolutely necessary for the other cases + since we NEED that last micropanel to have the same ldim (cs_p) as + the other micropanels. Why? So that millikernels can use the same + upanel ldim for all iterations of the ir loop. */ + *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; + *k_max = k; + + /* Determine the dimensions and strides for the packed matrix A. */ + if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) + { + /* stor3_t id values _RRC and _CRC: pack A to plain row storage. */ + *rs_p = k; + *cs_p = 1; + + *pd_p = mr; + *ps_p = mr * k; + + /* Set the schema to "row packed" to indicate packing to plain + row storage. */ + *schema = BLIS_PACKED_ROWS; + } + else + { + /* All other stor3_t ids: pack A to column-stored row-panels. */ + *rs_p = 1; + *cs_p = mr; + + *pd_p = mr; + *ps_p = mr * k; + + /* Set the schema to "packed row panels" to indicate packing to + conventional column-stored row panels. */ + *schema = BLIS_PACKED_ROW_PANELS; + } + + /* Set the buffer address provided by the caller to point to the + memory associated with the mem_t entry acquired from the memory + broker. 
*/ + *p = bli_mem_buffer( mem ); + } +} + +typedef void (*packm_sup_var1_fp) + ( + trans_t transc, + pack_t schema, + dim_t m, + dim_t n, + dim_t m_max, + dim_t n_max, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + dim_t pd_p, inc_t ps_p, + cntx_t* cntx, + thrinfo_t* thread + ); + +typedef void (*packm_sup_var2_fp) + ( + trans_t transc, + pack_t schema, + dim_t m, + dim_t n, + void* kappa, + void* c, inc_t rs_c, inc_t cs_c, + void* p, inc_t rs_p, inc_t cs_p, + cntx_t* cntx, + thrinfo_t* thread + ); + +static packm_sup_var1_fp GENARRAY(packm_sup_var1,packm_sup_var1); +static packm_sup_var2_fp GENARRAY(packm_sup_var2,packm_sup_var2); + +// +// Define BLAS-like interfaces to the variant chooser. +// + +void bli_packm_sup + ( + bool will_pack, + packbuf_t pack_buf_type, + stor3_t stor_id, + trans_t transc, + num_t dt, + dim_t m_alloc, + dim_t k_alloc, + dim_t m, + dim_t k, + dim_t mr, + const void* kappa, + const void* a, inc_t rs_a, inc_t cs_a, + void** p, inc_t* rs_p, inc_t* cs_p, + inc_t* ps_p, + const cntx_t* cntx, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ) +{ + pack_t schema; + dim_t m_max; + dim_t k_max; + dim_t pd_p; + + /* Prepare the packing destination buffer. If packing is not requested, + this function will reduce to a no-op. */ + bli_packm_sup_init_mem + ( + will_pack, + pack_buf_type, + dt, m_alloc, k_alloc, mr, + rntm, + mem, + thread + ); + + /* Determine the packing buffer and related parameters for matrix A. If A + will not be packed, then a_use will be set to point to a and the _a_use + strides will be set accordingly. */ + bli_packm_sup_init + ( + will_pack, + stor_id, + &schema, + m, k, mr, + &m_max, &k_max, + a, rs_a, cs_a, + p, rs_p, cs_p, + &pd_p, ps_p, + mem + ); + + /* Inspect whether we are going to be packing matrix A. */ + if ( will_pack == FALSE ) + { + /* If we aren't going to pack matrix A, then there's nothing to do. 
*/ + + /* + printf( "blis_ packm_sup_a: not packing A.\n" ); + */ + } + else /* if ( will_pack == TRUE ) */ + { + if ( schema == BLIS_PACKED_ROWS ) + { + /* + printf( "blis_ packm_sup_a: packing A to rows.\n" ); + */ + + /* For plain packing by rows, use var2. */ + packm_sup_var2[ dt ] + ( + transc, + schema, + m, + k, + ( void* )kappa, + ( void* )a, rs_a, cs_a, + *p, *rs_p, *cs_p, + ( cntx_t* )cntx, + thread + ); + } + else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ + { + /* + printf( "blis_ packm_sup_a: packing A to row panels.\n" ); + */ + + /* For packing to column-stored row panels, use var1. */ + packm_sup_var1[ dt ] + ( + transc, + schema, + m, + k, + m_max, + k_max, + ( void* )kappa, + ( void* )a, rs_a, cs_a, + *p, *rs_p, *cs_p, + pd_p, *ps_p, + ( cntx_t* )cntx, + thread + ); + } + + /* Barrier so that packing is done before computation. */ + bli_thread_barrier( thread ); + } +} + diff --git a/frame/3/bli_l3_sup_packm.h b/frame/3/bli_l3_sup_packm.h new file mode 100644 index 0000000000..a84d4e45c3 --- /dev/null +++ b/frame/3/bli_l3_sup_packm.h @@ -0,0 +1,95 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2018, Advanced Micro Devices, Inc. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + + +void bli_packm_sup_init_mem + ( + bool will_pack, + packbuf_t pack_buf_type, + num_t dt, + dim_t m, + dim_t k, + dim_t mr, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ); + +void bli_packm_sup_finalize_mem + ( + bool did_pack, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ); + +void bli_packm_sup_init + ( + bool will_pack, + stor3_t stor_id, + pack_t* schema, + dim_t m, + dim_t k, + dim_t mr, + dim_t* m_max, + dim_t* k_max, + const void* x, inc_t rs_x, inc_t cs_x, + void** p, inc_t* rs_p, inc_t* cs_p, + dim_t* pd_p, inc_t* ps_p, + mem_t* mem + ); + +void bli_packm_sup + ( + bool will_pack, + packbuf_t pack_buf_type, + stor3_t stor_id, + trans_t transc, + num_t dt, + dim_t m_alloc, + dim_t k_alloc, + dim_t m, + dim_t k, + dim_t mr, + const void* kappa, + const void* a, inc_t rs_a, inc_t cs_a, + void** p, inc_t* rs_p, inc_t* cs_p, + inc_t* ps_p, + const cntx_t* cntx, + rntm_t* rntm, + mem_t* mem, + thrinfo_t* thread + ); + diff --git a/frame/3/bli_l3_sup_packm_a.c b/frame/3/bli_l3_sup_packm_a.c deleted file mode 100644 index 6b73050fd0..0000000000 --- a/frame/3/bli_l3_sup_packm_a.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - - BLIS - An object-based framework for developing 
high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix A. 
*/ \ - if ( will_pack == FALSE ) \ - { \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ - const dim_t k_pack = k; \ -\ - /* Barrier to make sure all threads are caught up and ready to begin - the packm stage. */ \ - bli_thread_barrier( thread ); \ -\ - /* Compute the size of the memory block eneded. */ \ - siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ -\ - /* Check the mem_t entry provided by the caller. If it is unallocated, - then we need to acquire a block from the memory broker. */ \ - if ( bli_mem_is_unalloc( mem ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Acquire directly to the chief thread's mem_t that was - passed in. It needs to be that mem_t struct, and not a - local (temporary) mem_t, since there is no barrier until - after packing is finished, which could allow a race - condition whereby the chief thread exits the current - function before the other threads have a chance to copy - from it. (A barrier would fix that race condition, but - then again, I prefer to keep barriers to a minimum.) */ \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) 
*/ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else /* if ( bli_mem_is_alloc( mem ) ) */ \ - { \ - /* If the mem_t entry provided by the caller does NOT contain a NULL - buffer, then a block has already been acquired from the memory - broker and cached by the caller. */ \ -\ - /* As a sanity check, we should make sure that the mem_t object isn't - associated with a block that is too small compared to the size of - the packed matrix buffer that is needed, according to the value - computed above. */ \ - siz_t mem_size = bli_mem_size( mem ); \ -\ - if ( mem_size < size_needed ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* The chief thread releases the existing block associated - with the mem_t, and then re-acquires a new block, saving - the associated mem_t to its passed-in mem_t. (See coment - above for why the acquisition needs to be directly to - the chief thread's passed-in mem_t and not a local - (temporary) mem_t. */ \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else \ - { \ - /* If the mem_t entry is already allocated and sufficiently large, - then we use it as-is. No action is needed. 
*/ \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_a ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we previously packed matrix A. */ \ - if ( did_pack == FALSE ) \ - { \ - /* If we didn't pack matrix A, there's nothing to be done. */ \ - } \ - else /* if ( did_pack == TRUE ) */ \ - { \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. */ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_a ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* m_max, \ - dim_t* k_max, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix A. */ \ - if ( will_pack == FALSE ) \ - { \ - *m_max = m; \ - *k_max = k; \ -\ - /* Set the parameters for use with no packing of A (ie: using the - source matrix A directly). */ \ - { \ - /* Use the strides of the source matrix as the final values. */ \ - *rs_p = rs_a; \ - *cs_p = cs_a; \ -\ - *pd_p = mr; \ - *ps_p = mr * rs_a; \ -\ - /* Set the schema to "not packed" to indicate that packing will be - skipped. */ \ - *schema = BLIS_NOT_PACKED; \ - } \ -\ - /* Since we won't be packing, simply update the buffer address provided - by the caller to point to source matrix. 
*/ \ - *p = a; \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This is "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - *m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr; \ - *k_max = k; \ -\ - /* Determine the dimensions and strides for the packed matrix A. */ \ - if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) \ - { \ - /* stor3_t id values _RRC and _CRC: pack A to plain row storage. */ \ - *rs_p = k; \ - *cs_p = 1; \ -\ - *pd_p = mr; \ - *ps_p = mr * k; \ -\ - /* Set the schema to "row packed" to indicate packing to plain - row storage. */ \ - *schema = BLIS_PACKED_ROWS; \ - } \ - else \ - { \ - /* All other stor3_t ids: pack A to column-stored row-panels. */ \ - *rs_p = 1; \ - *cs_p = mr; \ -\ - *pd_p = mr; \ - *ps_p = mr * k; \ -\ - /* Set the schema to "packed row panels" to indicate packing to - conventional column-stored row panels. */ \ - *schema = BLIS_PACKED_ROW_PANELS; \ - } \ -\ - /* Set the buffer address provided by the caller to point to the - memory associated with the mem_t entry acquired from the memory - broker. */ \ - *p = bli_mem_buffer( mem ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_a ) - - -// -// Define BLAS-like interfaces to the variant chooser. 
-// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* kappa, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - pack_t schema; \ - dim_t m_max; \ - dim_t k_max; \ - dim_t pd_p; \ -\ - /* Prepare the packing destination buffer. If packing is not requested, - this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_a) \ - ( \ - will_pack, \ - pack_buf_type, \ - m_alloc, k_alloc, mr, \ - cntx, \ - rntm, \ - mem, \ - thread \ - ); \ -\ - /* Determine the packing buffer and related parameters for matrix A. If A - will not be packed, then a_use will be set to point to a and the _a_use - strides will be set accordingly. */ \ - PASTEMAC(ch,packm_sup_init_a) \ - ( \ - will_pack, \ - stor_id, \ - &schema, \ - m, k, mr, \ - &m_max, &k_max, \ - a, rs_a, cs_a, \ - p, rs_p, cs_p, \ - &pd_p, ps_p, \ - cntx, \ - mem, \ - thread \ - ); \ -\ - /* Inspect whether we are going to be packing matrix A. */ \ - if ( will_pack == FALSE ) \ - { \ - /* If we aren't going to pack matrix A, then there's nothing to do. */ \ -\ - /* - printf( "blis_ packm_sup_a: not packing A.\n" ); \ - */ \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - if ( schema == BLIS_PACKED_ROWS ) \ - { \ - /* - printf( "blis_ packm_sup_a: packing A to rows.\n" ); \ - */ \ -\ - /* For plain packing by rows, use var2. 
*/ \ - PASTEMAC(ch,packm_sup_var2) \ - ( \ - transc, \ - schema, \ - m, \ - k, \ - kappa, \ - a, rs_a, cs_a, \ - *p, *rs_p, *cs_p, \ - cntx, \ - thread \ - ); \ - } \ - else /* if ( schema == BLIS_PACKED_ROW_PANELS ) */ \ - { \ - /* - printf( "blis_ packm_sup_a: packing A to row panels.\n" ); \ - */ \ -\ - /* For packing to column-stored row panels, use var1. */ \ - PASTEMAC(ch,packm_sup_var1) \ - ( \ - transc, \ - schema, \ - m, \ - k, \ - m_max, \ - k_max, \ - kappa, \ - a, rs_a, cs_a, \ - *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ - cntx, \ - thread \ - ); \ - } \ -\ - /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_a ) - diff --git a/frame/3/bli_l3_sup_packm_a.h b/frame/3/bli_l3_sup_packm_a.h deleted file mode 100644 index 0aaa302c8c..0000000000 --- a/frame/3/bli_l3_sup_packm_a.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_mem_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - dim_t* m_max, \ - dim_t* k_max, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_a ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ 
- dim_t m_alloc, \ - dim_t k_alloc, \ - dim_t m, \ - dim_t k, \ - dim_t mr, \ - ctype* kappa, \ - ctype* a, inc_t rs_a, inc_t cs_a, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_a ) - diff --git a/frame/3/bli_l3_sup_packm_b.c b/frame/3/bli_l3_sup_packm_b.c deleted file mode 100644 index 7a2030ccf2..0000000000 --- a/frame/3/bli_l3_sup_packm_b.c +++ /dev/null @@ -1,430 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#include "blis.h" - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - const dim_t k_pack = k; \ - const dim_t n_pack = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ -\ - /* Barrier to make sure all threads are caught up and ready to begin - the packm stage. */ \ - bli_thread_barrier( thread ); \ -\ - /* Compute the size of the memory block eneded. */ \ - siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ -\ - /* Check the mem_t entry provided by the caller. If it is unallocated, - then we need to acquire a block from the memory broker. */ \ - if ( bli_mem_is_unalloc( mem ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Acquire directly to the chief thread's mem_t that was - passed in. 
It needs to be that mem_t struct, and not a - local (temporary) mem_t, since there is no barrier until - after packing is finished, which could allow a race - condition whereby the chief thread exits the current - function before the other threads have a chance to copy - from it. (A barrier would fix that race condition, but - then again, I prefer to keep barriers to a minimum.) */ \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else /* if ( bli_mem_is_alloc( mem ) ) */ \ - { \ - /* If the mem_t entry provided by the caller does NOT contain a NULL - buffer, then a block has already been acquired from the memory - broker and cached by the caller. */ \ -\ - /* As a sanity check, we should make sure that the mem_t object isn't - associated with a block that is too small compared to the size of - the packed matrix buffer that is needed, according to the value - computed above. */ \ - siz_t mem_size = bli_mem_size( mem ); \ -\ - if ( mem_size < size_needed ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* The chief thread releases the existing block associated - with the mem_t, and then re-acquires a new block, saving - the associated mem_t to its passed-in mem_t. (See coment - above for why the acquisition needs to be directly to - the chief thread's passed-in mem_t and not a local - (temporary) mem_t. 
*/ \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - bli_pba_acquire_m \ - ( \ - rntm, \ - size_needed, \ - pack_buf_type, \ - mem \ - ); \ - } \ -\ - /* Broadcast the address of the chief thread's passed-in mem_t - to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ -\ - /* Non-chief threads: Copy the contents of the chief thread's - passed-in mem_t to the passed-in mem_t for this thread. (The - chief thread already has the mem_t, so it does not need to - perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ - { \ - *mem = *mem_p; \ - } \ - } \ - else \ - { \ - /* If the mem_t entry is already allocated and sufficiently large, - then we use it as-is. No action is needed. */ \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_mem_b ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we previously packed matrix A. */ \ - if ( did_pack == FALSE ) \ - { \ - /* If we didn't pack matrix A, there's nothing to be done. */ \ - } \ - else /* if ( did_pack == TRUE ) */ \ - { \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. 
*/ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_finalize_mem_b ) - - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* k_max, \ - dim_t* n_max, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - *k_max = k; \ - *n_max = n; \ -\ - /* Set the parameters for use with no packing of B (ie: using the - source matrix B directly). */ \ - { \ - /* Use the strides of the source matrix as the final values. */ \ - *rs_p = rs_b; \ - *cs_p = cs_b; \ -\ - *pd_p = nr; \ - *ps_p = nr * cs_b; \ -\ - /* Set the schema to "not packed" to indicate that packing will be - skipped. */ \ - *schema = BLIS_NOT_PACKED; \ - } \ -\ - /* Since we won't be packing, simply update the buffer address provided - by the caller to point to source matrix. */ \ - *p = b; \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - /* NOTE: This is "rounding up" of the last upanel is actually optional - for the rrc/crc cases, but absolutely necessary for the other cases - since we NEED that last micropanel to have the same ldim (cs_p) as - the other micropanels. Why? So that millikernels can use the same - upanel ldim for all iterations of the ir loop. */ \ - *k_max = k; \ - *n_max = ( n / nr + ( n % nr ? 1 : 0 ) ) * nr; \ -\ - /* Determine the dimensions and strides for the packed matrix B. */ \ - if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) \ - { \ - /* stor3_t id values _RRC and _CRC: pack B to plain row storage. 
*/ \ - *rs_p = 1; \ - *cs_p = k; \ -\ - *pd_p = nr; \ - *ps_p = k * nr; \ -\ - /* Set the schema to "column packed" to indicate packing to plain - column storage. */ \ - *schema = BLIS_PACKED_COLUMNS; \ - } \ - else \ - { \ - /* All other stor3_t ids: pack B to row-stored column-panels. */ \ - *rs_p = nr; \ - *cs_p = 1; \ -\ - *pd_p = nr; \ - *ps_p = k * nr; \ -\ - /* Set the schema to "packed column panels" to indicate packing to - conventional row-stored column panels. */ \ - *schema = BLIS_PACKED_COL_PANELS; \ - } \ -\ - /* Set the buffer address provided by the caller to point to the - memory associated with the mem_t entry acquired from the memory - broker. */ \ - *p = bli_mem_buffer( mem ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_init_b ) - - -// -// Define BLAS-like interfaces to the variant chooser. -// - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* kappa, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ) \ -{ \ - pack_t schema; \ - dim_t k_max; \ - dim_t n_max; \ - dim_t pd_p; \ -\ - /* Prepare the packing destination buffer. If packing is not requested, - this function will reduce to a no-op. */ \ - PASTEMAC(ch,packm_sup_init_mem_b) \ - ( \ - will_pack, \ - pack_buf_type, \ - k_alloc, n_alloc, nr, \ - cntx, \ - rntm, \ - mem, \ - thread \ - ); \ -\ - /* Determine the packing buffer and related parameters for matrix B. If B - will not be packed, then b_use will be set to point to b and the _b_use - strides will be set accordingly. 
*/ \ - PASTEMAC(ch,packm_sup_init_b) \ - ( \ - will_pack, \ - stor_id, \ - &schema, \ - k, n, nr, \ - &k_max, &n_max, \ - b, rs_b, cs_b, \ - p, rs_p, cs_p, \ - &pd_p, ps_p, \ - cntx, \ - mem, \ - thread \ - ); \ -\ - /* Inspect whether we are going to be packing matrix B. */ \ - if ( will_pack == FALSE ) \ - { \ - /* If we aren't going to pack matrix B, then there's nothing to do. */ \ -\ - /* - printf( "blis_ packm_sup_b: not packing B.\n" ); \ - */ \ - } \ - else /* if ( will_pack == TRUE ) */ \ - { \ - if ( schema == BLIS_PACKED_COLUMNS ) \ - { \ - /* - printf( "blis_ packm_sup_b: packing B to columns.\n" ); \ - */ \ -\ - /* For plain packing by columns, use var2. */ \ - PASTEMAC(ch,packm_sup_var2) \ - ( \ - transc, \ - schema, \ - k, \ - n, \ - kappa, \ - b, rs_b, cs_b, \ - *p, *rs_p, *cs_p, \ - cntx, \ - thread \ - ); \ - } \ - else /* if ( schema == BLIS_PACKED_COL_PANELS ) */ \ - { \ - /* - printf( "blis_ packm_sup_b: packing B to col panels.\n" ); \ - */ \ -\ - /* For packing to row-stored column panels, use var1. */ \ - PASTEMAC(ch,packm_sup_var1) \ - ( \ - transc, \ - schema, \ - k, \ - n, \ - k_max, \ - n_max, \ - kappa, \ - b, rs_b, cs_b, \ - *p, *rs_p, *cs_p, \ - pd_p, *ps_p, \ - cntx, \ - thread \ - ); \ - } \ -\ - /* Barrier so that packing is done before computation. */ \ - bli_thread_barrier( thread ); \ - } \ -} - -INSERT_GENTFUNC_BASIC0( packm_sup_b ) - diff --git a/frame/3/bli_l3_sup_packm_b.h b/frame/3/bli_l3_sup_packm_b.h deleted file mode 100644 index bd18e5887e..0000000000 --- a/frame/3/bli_l3_sup_packm_b.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - - BLIS - An object-based framework for developing high-performance BLAS-like - libraries. - - Copyright (C) 2014, The University of Texas at Austin - Copyright (C) 2018, Advanced Micro Devices, Inc. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are - met: - - Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - Neither the name(s) of the copyright holder(s) nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - const cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_mem_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool did_pack, \ - rntm_t* rntm, \ - mem_t* mem, \ - const thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_finalize_mem_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - stor3_t stor_id, \ - pack_t* schema, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - dim_t* k_max, \ - dim_t* n_max, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - dim_t* pd_p, inc_t* ps_p, \ - cntx_t* cntx, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_init_b ) - - -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTEMAC(ch,opname) \ - ( \ - bool will_pack, \ - packbuf_t pack_buf_type, \ - stor3_t stor_id, \ - trans_t transc, \ - dim_t k_alloc, \ - dim_t n_alloc, \ - dim_t k, \ - dim_t n, \ - dim_t nr, \ - ctype* kappa, \ - ctype* b, inc_t rs_b, inc_t cs_b, \ - ctype** p, inc_t* rs_p, inc_t* cs_p, \ - inc_t* ps_p, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - mem_t* mem, \ - thrinfo_t* thread \ - ); \ - -INSERT_GENTPROT_BASIC0( packm_sup_b ) - diff --git a/frame/3/bli_l3_sup_packm_var.c b/frame/3/bli_l3_sup_packm_var.c index 54ecab8ff5..3572510022 100644 --- a/frame/3/bli_l3_sup_packm_var.c +++ b/frame/3/bli_l3_sup_packm_var.c @@ -44,17 +44,17 @@ \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - 
cntx_t* cntx, \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* cntx, \ thrinfo_t* thread \ ) \ { \ @@ -317,14 +317,14 @@ bli_thread_barrier( thread ); \ \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx, \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx, \ thrinfo_t* thread \ ) \ { \ diff --git a/frame/3/bli_l3_sup_packm_var.h b/frame/3/bli_l3_sup_packm_var.h index 9c62c9c68d..17cf9a4825 100644 --- a/frame/3/bli_l3_sup_packm_var.h +++ b/frame/3/bli_l3_sup_packm_var.h @@ -42,17 +42,17 @@ \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - dim_t m_max, \ - dim_t n_max, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ - dim_t pd_p, inc_t ps_p, \ - cntx_t* cntx, \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + dim_t m_max, \ + dim_t n_max, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + dim_t pd_p, inc_t ps_p, \ + cntx_t* cntx, \ thrinfo_t* thread \ ); @@ -63,14 +63,14 @@ INSERT_GENTPROT_BASIC0( packm_sup_var1 ) \ void PASTEMAC(ch,varname) \ ( \ - trans_t transc, \ - pack_t schema, \ - dim_t m, \ - dim_t n, \ - ctype* kappa, \ - ctype* c, inc_t rs_c, inc_t cs_c, \ - ctype* p, inc_t rs_p, inc_t cs_p, \ - cntx_t* cntx, \ + trans_t transc, \ + pack_t schema, \ + dim_t m, \ + dim_t n, \ + void* kappa, \ + void* c, inc_t rs_c, inc_t cs_c, \ + void* p, inc_t rs_p, inc_t cs_p, \ + cntx_t* cntx, \ thrinfo_t* thread \ ); diff --git a/frame/3/bli_l3_sup_var1n2m.c 
b/frame/3/bli_l3_sup_var1n2m.c index a5d66783fc..a01e4b7371 100644 --- a/frame/3/bli_l3_sup_var1n2m.c +++ b/frame/3/bli_l3_sup_var1n2m.c @@ -34,34 +34,10 @@ #include "blis.h" -#define FUNCPTR_T gemmsup_fp - -typedef void (*FUNCPTR_T) - ( - bool packa, - bool packb, - conj_t conja, - conj_t conjb, - dim_t m, - dim_t n, - dim_t k, - void* alpha, - void* a, inc_t rs_a, inc_t cs_a, - void* b, inc_t rs_b, inc_t cs_b, - void* beta, - void* c, inc_t rs_c, inc_t cs_c, - stor3_t eff_id, - cntx_t* cntx, - rntm_t* rntm, - thrinfo_t* thread - ); - // // -- var1n -------------------------------------------------------------------- // -static FUNCPTR_T GENARRAY(ftypes_var1n,gemmsup_ref_var1n); - void bli_gemmsup_ref_var1n ( trans_t trans, @@ -70,67 +46,31 @@ void bli_gemmsup_ref_var1n const obj_t* b, const obj_t* beta, const obj_t* c, - stor3_t eff_id, + stor3_t stor_id, const cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) { -#if 0 - obj_t at, bt; - - bli_obj_alias_to( a, &at ); - bli_obj_alias_to( b, &bt ); - - // Induce transpositions on A and/or B if either object is marked for - // transposition. We can induce "fast" transpositions since they objects - // are guaranteed to not have structure or be packed. 
- if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } - if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } - - const num_t dt = bli_obj_dt( c ); - - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + const num_t dt = bli_obj_dt( c ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); + const dim_t dt_size = bli_dt_size( dt ); - const dim_t k = bli_obj_width( &at ); + bool packa = bli_rntm_pack_a( rntm ); + bool packb = bli_rntm_pack_b( rntm ); - void* buf_a = bli_obj_buffer_at_off( &at ); - const inc_t rs_a = bli_obj_row_stride( &at ); - const inc_t cs_a = bli_obj_col_stride( &at ); + conj_t conja = bli_obj_conj_status( a ); + conj_t conjb = bli_obj_conj_status( b ); - void* buf_b = bli_obj_buffer_at_off( &bt ); - const inc_t rs_b = bli_obj_row_stride( &bt ); - const inc_t cs_b = bli_obj_col_stride( &bt ); - - void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); - -#else - const num_t dt = bli_obj_dt( c ); - - const bool packa = bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); - - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); - - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); dim_t k; - const void* buf_a = bli_obj_buffer_at_off( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - const void* buf_b = bli_obj_buffer_at_off( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -163,120 +103,31 @@ void bli_gemmsup_ref_var1n } void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = 
bli_obj_col_stride( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); -#endif - - // Index into the type combination array to extract the correct - // function pointer. - FUNCPTR_T f = ftypes_var1n[dt]; - #if 1 // Optimize some storage/packing cases by transforming them into others. - // These optimizations are expressed by changing trans and/or eff_id. - bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); + // These optimizations are expressed by changing trans and/or stor_id. + bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &stor_id, cntx ); #endif - if ( bli_is_notrans( trans ) ) - { - // Invoke the function. - f - ( - packa, - packb, - conja, - conjb, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, rs_a, cs_a, - ( void* )buf_b, rs_b, cs_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - eff_id, - ( cntx_t* )cntx, - rntm, - thread - ); - } - else - { - // Invoke the function (transposing the operation). - f - ( - packb, - packa, - conjb, // swap the conj values. - conja, - n, // swap the m and n dimensions. - m, - k, - ( void* )buf_alpha, - ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. - ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. - ( void* )buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. - bli_stor3_trans( eff_id ), // transpose the stor3_t id. 
- ( cntx_t* )cntx, - rntm, - thread - ); - } -} + if ( bli_is_trans( trans ) ) + { + bool packtmp = packa; packa = packb; packb = packtmp; + conj_t conjtmp = conja; conja = conjb; conjb = conjtmp; + dim_t len_tmp = m; m = n; n = len_tmp; + const void* buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp; + inc_t str_tmp = rs_a; rs_a = cs_b; cs_b = str_tmp; + str_tmp = cs_a; cs_a = rs_b; rs_b = str_tmp; + str_tmp = rs_c; rs_c = cs_c; cs_c = str_tmp; + stor_id = bli_stor3_trans( stor_id ); + } -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* If m or n is zero, return immediately. */ \ - if ( bli_zero_dim2( m, n ) ) return; \ -\ - /* If k < 1 or alpha is zero, scale by beta and return. */ \ - if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - PASTEMAC(ch,scalm) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m, n, \ - beta, \ - c, rs_c, cs_c \ - ); \ - } \ - return; \ - } \ -\ /* This transposition of the stor3_t id value is inherent to variant 1. The reason: we assume that variant 2 is the "main" variant. The consequence of this is that we assume that the millikernels that @@ -285,115 +136,96 @@ void PASTEMAC(ch,varname) \ n are assumed to be registered to the "non-primary" group associated with the ("non-primary") anti-preference. Note that this pattern holds regardless of whether the mkernel set has a row or column preference.) - See bli_l3_sup_int.c for a higher-level view of how this choice is made. 
*/ \ - stor_id = bli_stor3_trans( stor_id ); \ -\ - /* Query the context for various blocksizes. */ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ -\ - dim_t KC; \ - if ( packa && packb ) \ - { \ - KC = KC0; \ - } \ - else if ( packb ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else if ( packa ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else /* if ( !packa && !packb ) */ \ - { \ - if ( FALSE ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( m <= MR && n <= NR ) KC = KC0; \ - else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ - else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ - else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ - else KC = (( KC0 / 5 ) / 4 ) * 4; \ - } \ -\ + See bli_l3_sup_int.c for a higher-level view of how this choice is made. */ + stor_id = bli_stor3_trans( stor_id ); + + /* Query the context for various blocksizes. 
*/ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + const dim_t NC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); + const dim_t MC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); + + dim_t KC; + if ( packa && packb ) + { + KC = KC0; + } + else if ( packb ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else if ( packa ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else /* if ( !packa && !packb ) */ + { + if ( FALSE ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( m <= MR && n <= NR ) KC = KC0; + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; + else KC = (( KC0 / 5 ) / 4 ) * 4; + } + /* Nudge NC up to a multiple of MR and MC up to a multiple of NR. NOTE: This is unique to variant 1 (ie: not performed in variant 2) - because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ \ - const dim_t NC = bli_align_dim_to_mult( NC0, MR ); \ - const dim_t MC = bli_align_dim_to_mult( MC0, NR ); \ -\ + because MC % MR == 0 and NC % NR == 0 is already enforced at runtime. */ + const dim_t NC = bli_align_dim_to_mult( NC0, MR ); + const dim_t MC = bli_align_dim_to_mult( MC0, NR ); + /* Query the maximum blocksize for MR, which implies a maximum blocksize - extension for the final iteration. 
*/ \ - const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); \ - const dim_t MRE = MRM - MR; \ -\ - /* Compute partitioning step values for each matrix of each loop. */ \ - const inc_t jcstep_c = rs_c; \ - const inc_t jcstep_a = rs_a; \ -\ - const inc_t pcstep_a = cs_a; \ - const inc_t pcstep_b = rs_b; \ -\ - const inc_t icstep_c = cs_c; \ - const inc_t icstep_b = cs_b; \ -\ - const inc_t jrstep_c = rs_c * MR; \ -\ + extension for the final iteration. */ + const dim_t MRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_MR, cntx ); + const dim_t MRE = MRM - MR; + + /* Compute partitioning step values for each matrix of each loop. */ + const inc_t jcstep_c = rs_c * dt_size; + const inc_t jcstep_a = rs_a * dt_size; + + const inc_t pcstep_a = cs_a * dt_size; + const inc_t pcstep_b = rs_b * dt_size; + + const inc_t icstep_c = cs_c * dt_size; + const inc_t icstep_b = cs_b * dt_size; + + const inc_t jrstep_c = rs_c * MR * dt_size; + /* - const inc_t jrstep_a = rs_a * MR; \ -\ - const inc_t irstep_c = cs_c * NR; \ - const inc_t irstep_b = cs_b * NR; \ - */ \ -\ + const inc_t jrstep_a = rs_a * MR; + + const inc_t irstep_c = cs_c * NR; + const inc_t irstep_b = cs_b * NR; + */ + /* Query the context for the sup microkernel address and cast it to its - function pointer type. */ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ -\ - ctype* a_00 = a; \ - ctype* b_00 = b; \ - ctype* c_00 = c; \ - ctype* alpha_cast = alpha; \ - ctype* beta_cast = beta; \ -\ - /* Make local copies of beta and one scalars to prevent any unnecessary - sharing of cache lines between the cores' caches. */ \ - ctype beta_local = *beta_cast; \ - ctype one_local = *PASTEMAC(ch,1); \ -\ - auxinfo_t aux; \ -\ - /* Parse and interpret the contents of the rntm_t object to properly - set the ways of parallelism for each loop. */ \ - /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ -\ - /* Initialize a mem_t entry for A and B. 
Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. An alternative way of initializing the - mem_t entries is: - - bli_mem_clear( &mem_a ); \ - bli_mem_clear( &mem_b ); \ - */ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ + function pointer type. */ + gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); + + const char* a_00 = buf_a; + const char* b_00 = buf_b; + char* c_00 = buf_c; + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + + auxinfo_t aux; + + mem_t mem_a = BLIS_MEM_INITIALIZER; + mem_t mem_b = BLIS_MEM_INITIALIZER; \ /* Define an array of bszid_t ids, which will act as our substitute for the cntl_t tree. @@ -407,309 +239,288 @@ void PASTEMAC(ch,varname) \ which packa and packb nodes are encountered in the thrinfo tree. That is, this panel-block algorithm partitions an NC x KC submatrix of A to be packed in the 4th loop, and a KC x MC submatrix of B - to be packed in the 3rd loop. */ \ - /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ \ - bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* bszids; \ + to be packed in the 3rd loop. 
*/ + /* 5thloop 4thloop packa 3rdloop packb 2ndloop 1stloop ukrloop */ + bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; + bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; + bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; + bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; + bszid_t* bszids; \ /* Set the bszids pointer to the correct bszids array above based on which - matrices (if any) are being packed. */ \ - if ( packa ) { if ( packb ) bszids = bszids_packab; \ - else bszids = bszids_packa; } \ - else { if ( packb ) bszids = bszids_packb; \ - else bszids = bszids_nopack; } \ -\ - /* Determine whether we are using more than one thread. */ \ - const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ -\ - thrinfo_t* thread_jc = NULL; \ - thrinfo_t* thread_pc = NULL; \ - thrinfo_t* thread_pa = NULL; \ - thrinfo_t* thread_ic = NULL; \ - thrinfo_t* thread_pb = NULL; \ - thrinfo_t* thread_jr = NULL; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jc = bszids; \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ -\ - /* Compute the JC loop thread range for the current thread. */ \ - dim_t jc_start, jc_end; \ - bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); \ - const dim_t m_local = jc_end - jc_start; \ -\ - /* Compute number of primary and leftover components of the JC loop. */ \ - /*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = m_local % NC; \ -\ - /* Loop over the m dimension (NC rows/columns at a time). */ \ - /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ - for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ - { \ - /* Calculate the thread's current JC block dimension. */ \ - const dim_t nc_cur = ( NC <= jc_end - jj ? 
NC : jc_left ); \ -\ - ctype* a_jc = a_00 + jj * jcstep_a; \ - ctype* c_jc = c_00 + jj * jcstep_c; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ -\ - /* Compute the PC loop thread range for the current thread. */ \ - const dim_t pc_start = 0, pc_end = k; \ - const dim_t k_local = k; \ -\ - /* Compute number of primary and leftover components of the PC loop. */ \ - /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ - const dim_t pc_left = k_local % KC; \ -\ - /* Loop over the k dimension (KC rows/columns at a time). */ \ - /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ - for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ - { \ - /* Calculate the thread's current PC block dimension. */ \ - const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ -\ - ctype* a_pc = a_jc + pp * pcstep_a; \ - ctype* b_pc = b_00 + pp * pcstep_b; \ -\ - /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ -\ - ctype* a_use; \ - inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing A. If we won't be packing A, we alias to - the _pc variables so that code further down can unconditionally - reference the _pa variables. Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pa; \ - if ( packa ) { bszids_pa = &bszids_pc[1]; \ - thread_pa = bli_thrinfo_sub_node( thread_pc ); } \ - else { bszids_pa = &bszids_pc[0]; \ - thread_pa = thread_pc; } \ -\ + matrices (if any) are being packed. 
*/ + if ( packa ) { if ( packb ) bszids = bszids_packab; + else bszids = bszids_packa; } + else { if ( packb ) bszids = bszids_packb; + else bszids = bszids_nopack; } +\ + /* Determine whether we are using more than one thread. */ + const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); +\ + thrinfo_t* thread_jc = NULL; + thrinfo_t* thread_pc = NULL; + thrinfo_t* thread_pa = NULL; + thrinfo_t* thread_ic = NULL; + thrinfo_t* thread_pb = NULL; + thrinfo_t* thread_jr = NULL; +\ + /* Grow the thrinfo_t tree. */ + bszid_t* bszids_jc = bszids; + thread_jc = thread; + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); +\ + bszid_t* bszids_pc = &bszids_jc[1]; + thread_pc = bli_thrinfo_sub_node( thread_jc ); + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); + + bszid_t* bszids_pa; + if ( packa ) { bszids_pa = &bszids_pc[1]; + thread_pa = bli_thrinfo_sub_node( thread_pc ); } + else { bszids_pa = &bszids_pc[0]; + thread_pa = thread_pc; } + + bszid_t* bszids_ic = &bszids_pa[1]; + thread_ic = bli_thrinfo_sub_node( thread_pa ); + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); + + bszid_t* bszids_pb; + if ( packb ) { bszids_pb = &bszids_ic[1]; + thread_pb = bli_thrinfo_sub_node( thread_ic ); } + else { bszids_pb = &bszids_ic[0]; + thread_pb = thread_ic; } + + bszid_t* bszids_jr = &bszids_pb[1]; + thread_jr = bli_thrinfo_sub_node( thread_pb ); + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); + + /* Compute the JC loop thread range for the current thread. */ + dim_t jc_start, jc_end; + bli_thread_range_sub( thread_jc, m, MR, FALSE, &jc_start, &jc_end ); + const dim_t m_local = jc_end - jc_start; + + /* Compute number of primary and leftover components of the JC loop. */ + /*const dim_t jc_iter = ( m_local + NC - 1 ) / NC;*/ + const dim_t jc_left = m_local % NC; + + /* Loop over the m dimension (NC rows/columns at a time). 
*/ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) + { + /* Calculate the thread's current JC block dimension. */ + const dim_t nc_cur = ( NC <= jc_end - jj ? NC : jc_left ); + + const char* a_jc = a_00 + jj * jcstep_a; + char* c_jc = c_00 + jj * jcstep_c; + + /* Compute the PC loop thread range for the current thread. */ + const dim_t pc_start = 0, pc_end = k; + const dim_t k_local = k; + + /* Compute number of primary and leftover components of the PC loop. */ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ + const dim_t pc_left = k_local % KC; + + /* Loop over the k dimension (KC rows/columns at a time). */ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) + { + /* Calculate the thread's current PC block dimension. */ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); + + const char* a_pc = a_jc + pp * pcstep_a; + const char* b_pc = b_00 + pp * pcstep_b; + + /* Only apply beta to the first iteration of the pc loop. */ + const void* beta_use = ( pp == 0 ? buf_beta : one ); + + char* a_use; + inc_t rs_a_use, cs_a_use, ps_a_use; + /* Determine the packing buffer and related parameters for matrix A. (If A will not be packed, then a_use will be set to point to a and the _a_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate implementation based on the schema deduced from the stor_id. NOTE: packing matrix A in this panel-block algorithm corresponds - to packing matrix B in the block-panel algorithm. */ \ - PASTEMAC(ch,packm_sup_a) \ - ( \ - packa, \ - BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ \ - stor_id, /* a "panel of B". */ \ - BLIS_NO_TRANSPOSE, \ - NC, KC, /* This "panel of B" is (at most) NC x KC. 
*/ \ - nc_cur, kc_cur, MR, \ - &one_local, \ - a_pc, rs_a, cs_a, \ - &a_use, &rs_a_use, &cs_a_use, \ - &ps_a_use, \ - cntx, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ -\ + to packing matrix B in the block-panel algorithm. */ + bli_packm_sup + ( + packa, + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix A to */ + stor_id, /* a "panel of B". */ + BLIS_NO_TRANSPOSE, + dt, + NC, KC, /* This "panel of B" is (at most) NC x KC. */ + nc_cur, kc_cur, MR, + one, + a_pc, rs_a, cs_a, + ( void** )&a_use, &rs_a_use, &cs_a_use, + &ps_a_use, + cntx, + rntm, + &mem_a, + thread_pa + ); + /* Alias a_use so that it's clear this is our current block of - matrix A. */ \ - ctype* a_pc_use = a_use; \ -\ + matrix A. */ + const char* a_pc_use = a_use; + /* We don't need to embed the panel stride of A within the auxinfo_t object because this variant iterates through A in the jr loop, which occurs here, within the macrokernel, not within the - millikernel. */ \ - /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_ic = &bszids_pa[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ -\ - /* Compute the IC loop thread range for the current thread. */ \ - dim_t ic_start, ic_end; \ - bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); \ - const dim_t n_local = ic_end - ic_start; \ -\ - /* Compute number of primary and leftover components of the IC loop. */ \ - /*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ \ - const dim_t ic_left = n_local % MC; \ -\ - /* Loop over the n dimension (MC rows at a time). */ \ - /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ - for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ - { \ - /* Calculate the thread's current IC block dimension. */ \ - const dim_t mc_cur = ( MC <= ic_end - ii ? 
MC : ic_left ); \ -\ - ctype* b_ic = b_pc + ii * icstep_b; \ - ctype* c_ic = c_jc + ii * icstep_c; \ -\ - ctype* b_use; \ - inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing A. If we won't be packing A, we alias to - the _pc variables so that code further down can unconditionally - reference the _pa variables. Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pb; \ - if ( packb ) { bszids_pb = &bszids_ic[1]; \ - thread_pb = bli_thrinfo_sub_node( thread_ic ); } \ - else { bszids_pb = &bszids_ic[0]; \ - thread_pb = thread_ic; } \ -\ + millikernel. */ + /*bli_auxinfo_set_ps_a( ps_a_use, &aux );*/ + + /* Compute the IC loop thread range for the current thread. */ + dim_t ic_start, ic_end; + bli_thread_range_sub( thread_ic, n, NR, FALSE, &ic_start, &ic_end ); + const dim_t n_local = ic_end - ic_start; + + /* Compute number of primary and leftover components of the IC loop. */ + /*const dim_t ic_iter = ( n_local + MC - 1 ) / MC;*/ + const dim_t ic_left = n_local % MC; + + /* Loop over the n dimension (MC rows at a time). */ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) + { + /* Calculate the thread's current IC block dimension. */ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); + + const char* b_ic = b_pc + ii * icstep_b; + char* c_ic = c_jc + ii * icstep_c; + + char* b_use; + inc_t rs_b_use, cs_b_use, ps_b_use; + /* Determine the packing buffer and related parameters for matrix B. (If B will not be packed, then b_use will be set to point to b and the _b_use strides will be set accordingly.) 
Then call the packm sup variant chooser, which will call the appropriate implementation based on the schema deduced from the stor_id. NOTE: packing matrix B in this panel-block algorithm corresponds - to packing matrix A in the block-panel algorithm. */ \ - PASTEMAC(ch,packm_sup_b) \ - ( \ - packb, \ - BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ \ - stor_id, /* a "block of A". */ \ - BLIS_NO_TRANSPOSE, \ - KC, MC, /* This "block of A" is (at most) KC x MC. */ \ - kc_cur, mc_cur, NR, \ - &one_local, \ - b_ic, rs_b, cs_b, \ - &b_use, &rs_b_use, &cs_b_use, \ - &ps_b_use, \ - cntx, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ + to packing matrix A in the block-panel algorithm. */ + bli_packm_sup + ( + packb, + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix B to */ + stor_id, /* a "block of A". */ + BLIS_NO_TRANSPOSE, + dt, + MC, KC, /* This "block of A" is (at most) KC x MC. */ + mc_cur, kc_cur, NR, + one, + b_ic, cs_b, rs_b, + ( void** )&b_use, &cs_b_use, &rs_b_use, + &ps_b_use, + cntx, + rntm, + &mem_b, + thread_pb + ); + /* Alias b_use so that it's clear this is our current block of - matrix B. */ \ - ctype* b_ic_use = b_use; \ -\ + matrix B. */ + const char* b_ic_use = b_use; + /* Embed the panel stride of B within the auxinfo_t object. The millikernel will query and use this to iterate through - micropanels of B. */ \ - bli_auxinfo_set_ps_b( ps_b_use, &aux ); \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jr = &bszids_pb[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ -\ - /* Compute number of primary and leftover components of the JR loop. */ \ - dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; \ - dim_t jr_left = nc_cur % MR; \ -\ + micropanels of B. */ + bli_auxinfo_set_ps_b( ps_b_use, &aux ); + + /* Compute number of primary and leftover components of the JR loop. 
*/ + dim_t jr_iter = ( nc_cur + MR - 1 ) / MR; + dim_t jr_left = nc_cur % MR; + /* An optimization: allow the last jr iteration to contain up to MRE rows of C and A. (If MRE > MR, the mkernel has agreed to handle these cases.) Note that this prevents us from declaring jr_iter and jr_left as const. NOTE: We forgo this optimization when packing A - since packing an extended edge case is not yet supported. */ \ - if ( !packa && !is_mt ) \ - if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) \ - { \ - jr_iter--; jr_left += MR; \ - } \ -\ - /* Compute the JR loop thread range for the current thread. */ \ - dim_t jr_start, jr_end; \ - bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ -\ - /* Loop over the m dimension (NR columns at a time). */ \ - /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ - for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ - { \ - const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? MR : jr_left ); \ -\ + since packing an extended edge case is not yet supported. */ + if ( !packa && !is_mt ) + if ( MRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= MRE ) + { + jr_iter--; jr_left += MR; + } + + /* Compute the JR loop thread range for the current thread. */ + dim_t jr_start, jr_end; + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); + + /* Loop over the m dimension (NR columns at a time). */ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) + { + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? 
MR : jr_left ); + /* - ctype* a_jr = a_pc + j * jrstep_a; \ - */ \ - ctype* a_jr = a_pc_use + j * ps_a_use; \ - ctype* c_jr = c_ic + j * jrstep_c; \ -\ + ctype* a_jr = a_pc + j * jrstep_a; + */ + const char* a_jr = a_pc_use + j * ps_a_use * dt_size; + char* c_jr = c_ic + j * jrstep_c; + /* - const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; \ - const dim_t ir_left = mc_cur % NR; \ - */ \ -\ - /* Loop over the n dimension (MR rows at a time). */ \ - { \ - /* Invoke the gemmsup millikernel. */ \ - gemmsup_ker \ - ( \ - conja, \ - conjb, \ - nr_cur, /* Notice: nr_cur <= MR. */ \ - mc_cur, /* Recall: mc_cur partitions the n dimension! */ \ - kc_cur, \ - alpha_cast, \ - a_jr, rs_a_use, cs_a_use, \ - b_ic_use, rs_b_use, cs_b_use, \ - beta_use, \ - c_jr, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ + const dim_t ir_iter = ( mc_cur + NR - 1 ) / NR; + const dim_t ir_left = mc_cur % NR; + */ + + /* Loop over the n dimension (MR rows at a time). */ + { + /* Invoke the gemmsup millikernel. */ + gemmsup_ker + ( + conja, + conjb, + nr_cur, /* Notice: nr_cur <= MR. */ + mc_cur, /* Recall: mc_cur partitions the n dimension! */ + kc_cur, + ( void* )buf_alpha, + ( void* )a_jr, rs_a_use, cs_a_use, + ( void* )b_ic_use, rs_b_use, cs_b_use, + ( void* )beta_use, + ( void* )c_jr, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + /* NOTE: This barrier is only needed if we are packing A (since - that matrix is packed within the pc loop of this variant). */ \ - if ( packa ) bli_thread_barrier( thread_pa ); \ - } \ - } \ -\ - /* Release any memory that was acquired for packing matrices A and B. */ \ - PASTEMAC(ch,packm_sup_finalize_mem_a) \ - ( \ - packa, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTEMAC(ch,packm_sup_finalize_mem_b) \ - ( \ - packb, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ + that matrix is packed within the pc loop of this variant). 
*/ + if ( packa ) bli_thread_barrier( thread_pa ); + } + } + + /* Release any memory that was acquired for packing matrices A and B. */ + bli_packm_sup_finalize_mem + ( + packa, + rntm, + &mem_a, + thread_pa + ); + bli_packm_sup_finalize_mem + ( + packb, + rntm, + &mem_b, + thread_pb + ); + /* -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( gemmsup_ref_var1n ) - // // -- var2m -------------------------------------------------------------------- // -static FUNCPTR_T GENARRAY(ftypes_var2m,gemmsup_ref_var2m); - void bli_gemmsup_ref_var2m ( trans_t trans, @@ -718,67 +529,30 @@ void bli_gemmsup_ref_var2m const obj_t* b, const obj_t* beta, const obj_t* c, - stor3_t eff_id, + stor3_t stor_id, const cntx_t* cntx, rntm_t* rntm, thrinfo_t* thread ) { -#if 0 - obj_t at, bt; - - bli_obj_alias_to( a, &at ); - bli_obj_alias_to( b, &bt ); - - // Induce transpositions on A and/or B if either object is marked for - // transposition. We can induce "fast" transpositions since they objects - // are guaranteed to not have structure or be packed. 
- if ( bli_obj_has_trans( &at ) ) { bli_obj_induce_fast_trans( &at ); } - if ( bli_obj_has_trans( &bt ) ) { bli_obj_induce_fast_trans( &bt ); } - - const num_t dt = bli_obj_dt( c ); - - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); - - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); - - const dim_t k = bli_obj_width( &at ); - - void* buf_a = bli_obj_buffer_at_off( &at ); - const inc_t rs_a = bli_obj_row_stride( &at ); - const inc_t cs_a = bli_obj_col_stride( &at ); - - void* buf_b = bli_obj_buffer_at_off( &bt ); - const inc_t rs_b = bli_obj_row_stride( &bt ); - const inc_t cs_b = bli_obj_col_stride( &bt ); - - void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = bli_obj_col_stride( c ); - - void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); - void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); - -#else - const num_t dt = bli_obj_dt( c ); + const num_t dt = bli_obj_dt( c ); + const dim_t dt_size = bli_dt_size( dt ); - const bool packa = bli_rntm_pack_a( rntm ); - const bool packb = bli_rntm_pack_b( rntm ); + bool packa = bli_rntm_pack_a( rntm ); + bool packb = bli_rntm_pack_b( rntm ); - const conj_t conja = bli_obj_conj_status( a ); - const conj_t conjb = bli_obj_conj_status( b ); + conj_t conja = bli_obj_conj_status( a ); + conj_t conjb = bli_obj_conj_status( b ); - const dim_t m = bli_obj_length( c ); - const dim_t n = bli_obj_width( c ); + dim_t m = bli_obj_length( c ); + dim_t n = bli_obj_width( c ); dim_t k; - const void* buf_a = bli_obj_buffer_at_off( a ); + const void* buf_a = bli_obj_buffer_at_off( a ); inc_t rs_a; inc_t cs_a; - const void* buf_b = bli_obj_buffer_at_off( b ); + const void* buf_b = bli_obj_buffer_at_off( b ); inc_t rs_b; inc_t cs_b; @@ -811,513 +585,369 @@ void bli_gemmsup_ref_var2m } void* buf_c = bli_obj_buffer_at_off( c ); - const inc_t rs_c = bli_obj_row_stride( c ); - const inc_t cs_c = 
bli_obj_col_stride( c ); + inc_t rs_c = bli_obj_row_stride( c ); + inc_t cs_c = bli_obj_col_stride( c ); const void* buf_alpha = bli_obj_buffer_for_1x1( dt, alpha ); const void* buf_beta = bli_obj_buffer_for_1x1( dt, beta ); -#endif - - // Index into the type combination array to extract the correct - // function pointer. - FUNCPTR_T f = ftypes_var2m[dt]; - #if 1 // Optimize some storage/packing cases by transforming them into others. - // These optimizations are expressed by changing trans and/or eff_id. - bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &eff_id, cntx ); + // These optimizations are expressed by changing trans and/or stor_id. + bli_gemmsup_ref_var1n2m_opt_cases( dt, &trans, packa, packb, &stor_id, cntx ); #endif - if ( bli_is_notrans( trans ) ) + if ( bli_is_trans( trans ) ) + { + bool packtmp = packa; packa = packb; packb = packtmp; + conj_t conjtmp = conja; conja = conjb; conjb = conjtmp; + dim_t len_tmp = m; m = n; n = len_tmp; + const void* buf_tmp = buf_a; buf_a = buf_b; buf_b = buf_tmp; + inc_t str_tmp = rs_a; rs_a = cs_b; cs_b = str_tmp; + str_tmp = cs_a; cs_a = rs_b; rs_b = str_tmp; + str_tmp = rs_c; rs_c = cs_c; cs_c = str_tmp; + + stor_id = bli_stor3_trans( stor_id ); + } + + /* Query the context for various blocksizes. */ + const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); + const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); + const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); + const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); + const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); + + dim_t KC; + if ( packa && packb ) { - // Invoke the function. 
- f - ( - packa, - packb, - conja, - conjb, - m, - n, - k, - ( void* )buf_alpha, - ( void* )buf_a, rs_a, cs_a, - ( void* )buf_b, rs_b, cs_b, - ( void* )buf_beta, - buf_c, rs_c, cs_c, - eff_id, - ( cntx_t* )cntx, - rntm, - thread - ); + KC = KC0; } - else + else if ( packb ) { - // Invoke the function (transposing the operation). - f - ( - packb, // swap the pack values. - packa, - conjb, // swap the conj values. - conja, - n, // swap the m and n dimensions. - m, - k, - ( void* )buf_alpha, - ( void* )buf_b, cs_b, rs_b, // swap the positions of A and B. - ( void* )buf_a, cs_a, rs_a, // swap the strides of A and B. - ( void* )buf_beta, - buf_c, cs_c, rs_c, // swap the strides of C. - bli_stor3_trans( eff_id ), // transpose the stor3_t id. - ( cntx_t* )cntx, - rntm, - thread - ); + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else if ( packa ) + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( stor_id == BLIS_RCR || + stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; + else KC = KC0; + } + else /* if ( !packa && !packb ) */ + { + if ( stor_id == BLIS_RRR || + stor_id == BLIS_CCC ) KC = KC0; + else if ( stor_id == BLIS_RRC || + stor_id == BLIS_CRC ) KC = KC0; + else if ( m <= MR && n <= NR ) KC = KC0; + else if ( m <= 2*MR && n <= 2*NR ) KC = KC0 / 2; + else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; + else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; + else KC = (( KC0 / 5 ) / 4 ) * 4; } -} - -#undef GENTFUNC -#define GENTFUNC( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ 
- void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t stor_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ) \ -{ \ - const num_t dt = PASTEMAC(ch,type); \ -\ - /* If m or n is zero, return immediately. */ \ - if ( bli_zero_dim2( m, n ) ) return; \ -\ - /* If k < 1 or alpha is zero, scale by beta and return. */ \ - if ( k < 1 || PASTEMAC(ch,eq0)( *(( ctype* )alpha) ) ) \ - { \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - PASTEMAC(ch,scalm) \ - ( \ - BLIS_NO_CONJUGATE, \ - 0, \ - BLIS_NONUNIT_DIAG, \ - BLIS_DENSE, \ - m, n, \ - beta, \ - c, rs_c, cs_c \ - ); \ - } \ - return; \ - } \ -\ - /* Query the context for various blocksizes. */ \ - const dim_t NR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NR, cntx ); \ - const dim_t MR = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MR, cntx ); \ - const dim_t NC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_NC, cntx ); \ - const dim_t MC = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_MC, cntx ); \ - const dim_t KC0 = bli_cntx_get_l3_sup_blksz_def_dt( dt, BLIS_KC, cntx ); \ -\ - dim_t KC; \ - if ( packa && packb ) \ - { \ - KC = KC0; \ - } \ - else if ( packb ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else if ( packa ) \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = (( KC0 / 2 ) / 2 ) * 2; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( stor_id == BLIS_RCR || \ - stor_id == BLIS_CCR ) KC = (( KC0 / 4 ) / 4 ) * 4; \ - else KC = KC0; \ - } \ - else /* if ( !packa && !packb ) */ \ - { \ - if ( stor_id == BLIS_RRR || \ - stor_id == BLIS_CCC ) KC = KC0; \ - else if ( stor_id == BLIS_RRC || \ - stor_id == BLIS_CRC ) KC = KC0; \ - else if ( m <= MR && n <= NR ) KC = KC0; \ - else if ( m 
<= 2*MR && n <= 2*NR ) KC = KC0 / 2; \ - else if ( m <= 3*MR && n <= 3*NR ) KC = (( KC0 / 3 ) / 4 ) * 4; \ - else if ( m <= 4*MR && n <= 4*NR ) KC = KC0 / 4; \ - else KC = (( KC0 / 5 ) / 4 ) * 4; \ - } \ -\ /* Query the maximum blocksize for NR, which implies a maximum blocksize - extension for the final iteration. */ \ - const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); \ - const dim_t NRE = NRM - NR; \ -\ - /* Compute partitioning step values for each matrix of each loop. */ \ - const inc_t jcstep_c = cs_c; \ - const inc_t jcstep_b = cs_b; \ -\ - const inc_t pcstep_a = cs_a; \ - const inc_t pcstep_b = rs_b; \ -\ - const inc_t icstep_c = rs_c; \ - const inc_t icstep_a = rs_a; \ -\ - const inc_t jrstep_c = cs_c * NR; \ -\ + extension for the final iteration. */ + const dim_t NRM = bli_cntx_get_l3_sup_blksz_max_dt( dt, BLIS_NR, cntx ); + const dim_t NRE = NRM - NR; + + /* Compute partitioning step values for each matrix of each loop. */ + const inc_t jcstep_c = cs_c * dt_size; + const inc_t jcstep_b = cs_b * dt_size; + + const inc_t pcstep_a = cs_a * dt_size; + const inc_t pcstep_b = rs_b * dt_size; + + const inc_t icstep_c = rs_c * dt_size; + const inc_t icstep_a = rs_a * dt_size; + + const inc_t jrstep_c = cs_c * NR * dt_size; + /* - const inc_t jrstep_b = cs_b * NR; \ - ( void )jrstep_b; \ -\ - const inc_t irstep_c = rs_c * MR; \ - const inc_t irstep_a = rs_a * MR; \ - */ \ -\ + const inc_t jrstep_b = cs_b * NR; + ( void )jrstep_b; + + const inc_t irstep_c = rs_c * MR; + const inc_t irstep_a = rs_a * MR; + */ + /* Query the context for the sup microkernel address and cast it to its - function pointer type. 
*/ \ - PASTECH(ch,gemmsup_ker_ft) \ - gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); \ -\ - ctype* a_00 = a; \ - ctype* b_00 = b; \ - ctype* c_00 = c; \ - ctype* alpha_cast = alpha; \ - ctype* beta_cast = beta; \ -\ - /* Make local copies of beta and one scalars to prevent any unnecessary - sharing of cache lines between the cores' caches. */ \ - ctype beta_local = *beta_cast; \ - ctype one_local = *PASTEMAC(ch,1); \ -\ - auxinfo_t aux; \ -\ - /* Parse and interpret the contents of the rntm_t object to properly - set the ways of parallelism for each loop. */ \ - /*bli_rntm_set_ways_from_rntm_sup( m, n, k, rntm );*/ \ -\ - /* Initialize a mem_t entry for A and B. Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. An alternative way of initializing the - mem_t entries is: - - bli_mem_clear( &mem_a ); \ - bli_mem_clear( &mem_b ); \ - */ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ -\ + function pointer type. */ + gemmsup_ker_vft gemmsup_ker = bli_cntx_get_l3_sup_ker_dt( dt, stor_id, cntx ); + + const char* a_00 = buf_a; + const char* b_00 = buf_b; + char* c_00 = buf_c; + const void* one = bli_obj_buffer_for_const( dt, &BLIS_ONE ); + + auxinfo_t aux; + + mem_t mem_a = BLIS_MEM_INITIALIZER; + mem_t mem_b = BLIS_MEM_INITIALIZER; + /* Define an array of bszid_t ids, which will act as our substitute for - the cntl_t tree. 
*/ \ - /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ \ - bszid_t bszids_nopack[6] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packa [7] = { BLIS_NC, BLIS_KC, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packb [7] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t bszids_packab[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; \ - bszid_t* bszids; \ -\ - /* Set the bszids pointer to the correct bszids array above based on which - matrices (if any) are being packed. */ \ - if ( packa ) { if ( packb ) bszids = bszids_packab; \ - else bszids = bszids_packa; } \ - else { if ( packb ) bszids = bszids_packb; \ - else bszids = bszids_nopack; } \ -\ - /* Determine whether we are using more than one thread. */ \ - const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); \ -\ - thrinfo_t* thread_jc = NULL; \ - thrinfo_t* thread_pc = NULL; \ - thrinfo_t* thread_pb = NULL; \ - thrinfo_t* thread_ic = NULL; \ - thrinfo_t* thread_pa = NULL; \ - thrinfo_t* thread_jr = NULL; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jc = bszids; \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ -\ - /* Compute the JC loop thread range for the current thread. */ \ - dim_t jc_start, jc_end; \ - bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); \ - const dim_t n_local = jc_end - jc_start; \ -\ - /* Compute number of primary and leftover components of the JC loop. */ \ - /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ \ - const dim_t jc_left = n_local % NC; \ -\ - /* Loop over the n dimension (NC rows/columns at a time). */ \ - /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ \ - for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) \ - { \ - /* Calculate the thread's current JC block dimension. */ \ - const dim_t nc_cur = ( NC <= jc_end - jj ? 
NC : jc_left ); \ -\ - ctype* b_jc = b_00 + jj * jcstep_b; \ - ctype* c_jc = c_00 + jj * jcstep_c; \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_pc = &bszids_jc[1]; \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ -\ - /* Compute the PC loop thread range for the current thread. */ \ - const dim_t pc_start = 0, pc_end = k; \ - const dim_t k_local = k; \ -\ - /* Compute number of primary and leftover components of the PC loop. */ \ - /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ \ - const dim_t pc_left = k_local % KC; \ -\ - /* Loop over the k dimension (KC rows/columns at a time). */ \ - /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ \ - for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) \ - { \ - /* Calculate the thread's current PC block dimension. */ \ - const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); \ -\ - ctype* a_pc = a_00 + pp * pcstep_a; \ - ctype* b_pc = b_jc + pp * pcstep_b; \ -\ - /* Only apply beta to the first iteration of the pc loop. */ \ - ctype* beta_use = ( pp == 0 ? &beta_local : &one_local ); \ -\ - ctype* b_use; \ - inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing B. If we won't be packing B, we alias to - the _pc variables so that code further down can unconditionally - reference the _pb variables. Note that *if* we will be packing - B, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pb; \ - if ( packb ) { bszids_pb = &bszids_pc[1]; \ - thread_pb = bli_thrinfo_sub_node( thread_pc ); } \ - else { bszids_pb = &bszids_pc[0]; \ - thread_pb = thread_pc; } \ -\ + the cntl_t tree. 
*/ + /* 5thloop 4thloop packb 3rdloop packa 2ndloop 1stloop ukrloop */ + bszid_t bszids[8] = { BLIS_NC, BLIS_KC, BLIS_NO_PART, BLIS_MC, BLIS_NO_PART, BLIS_NR, BLIS_MR, BLIS_KR }; + + /* Determine whether we are using more than one thread. */ + const bool is_mt = ( bli_rntm_calc_num_threads( rntm ) > 1 ); + + thrinfo_t* thread_jc = NULL; + thrinfo_t* thread_pc = NULL; + thrinfo_t* thread_pb = NULL; + thrinfo_t* thread_ic = NULL; + thrinfo_t* thread_pa = NULL; + thrinfo_t* thread_jr = NULL; + + /* Grow the thrinfo_t tree. */ + bszid_t* bszids_jc = bszids; + thread_jc = thread; + bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); + + bszid_t* bszids_pc = &bszids_jc[1]; + thread_pc = bli_thrinfo_sub_node( thread_jc ); + bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); + + bszid_t* bszids_pb = &bszids_pc[1]; + thread_pb = bli_thrinfo_sub_node( thread_pc ); + + bszid_t* bszids_ic = &bszids_pb[1]; + thread_ic = bli_thrinfo_sub_node( thread_pb ); + bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); + + bszid_t* bszids_pa = &bszids_ic[1]; + thread_pa = bli_thrinfo_sub_node( thread_ic ); + + bszid_t* bszids_jr = &bszids_pa[1]; + thread_jr = bli_thrinfo_sub_node( thread_pa ); + bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); + + /* Compute the JC loop thread range for the current thread. */ + dim_t jc_start, jc_end; + bli_thread_range_sub( thread_jc, n, NR, FALSE, &jc_start, &jc_end ); + const dim_t n_local = jc_end - jc_start; + + /* Compute number of primary and leftover components of the JC loop. */ + /*const dim_t jc_iter = ( n_local + NC - 1 ) / NC;*/ + const dim_t jc_left = n_local % NC; + + /* Loop over the n dimension (NC rows/columns at a time). */ + /*for ( dim_t jj = 0; jj < jc_iter; jj += 1 )*/ + for ( dim_t jj = jc_start; jj < jc_end; jj += NC ) + { + /* Calculate the thread's current JC block dimension. */ + const dim_t nc_cur = ( NC <= jc_end - jj ? 
NC : jc_left ); + + const char* b_jc = b_00 + jj * jcstep_b; + char* c_jc = c_00 + jj * jcstep_c; + + /* Compute the PC loop thread range for the current thread. */ + const dim_t pc_start = 0, pc_end = k; + const dim_t k_local = k; + + /* Compute number of primary and leftover components of the PC loop. */ + /*const dim_t pc_iter = ( k_local + KC - 1 ) / KC;*/ + const dim_t pc_left = k_local % KC; + + /* Loop over the k dimension (KC rows/columns at a time). */ + /*for ( dim_t pp = 0; pp < pc_iter; pp += 1 )*/ + for ( dim_t pp = pc_start; pp < pc_end; pp += KC ) + { + /* Calculate the thread's current PC block dimension. */ + const dim_t kc_cur = ( KC <= pc_end - pp ? KC : pc_left ); + + const char* a_pc = a_00 + pp * pcstep_a; + const char* b_pc = b_jc + pp * pcstep_b; + + /* Only apply beta to the first iteration of the pc loop. */ + const void* beta_use = ( pp == 0 ? buf_beta : one ); + + char* b_use; + inc_t rs_b_use, cs_b_use, ps_b_use; + /* Determine the packing buffer and related parameters for matrix B. (If B will not be packed, then a_use will be set to point to b and the _b_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. */ \ - PASTEMAC(ch,packm_sup_b) \ - ( \ - packb, \ - BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ \ - stor_id, /* a "panel of B." */ \ - BLIS_NO_TRANSPOSE, \ - KC, NC, /* This "panel of B" is (at most) KC x NC. */ \ - kc_cur, nc_cur, NR, \ - &one_local, \ - b_pc, rs_b, cs_b, \ - &b_use, &rs_b_use, &cs_b_use, \ - &ps_b_use, \ - cntx, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ + implementation based on the schema deduced from the stor_id. */ + bli_packm_sup + ( + packb, + BLIS_BUFFER_FOR_B_PANEL, /* This algorithm packs matrix B to */ + stor_id, /* a "panel of B." */ + BLIS_NO_TRANSPOSE, + dt, + NC, KC, /* This "panel of B" is (at most) KC x NC. 
*/ + nc_cur, kc_cur, NR, + one, + b_pc, cs_b, rs_b, + ( void** )&b_use, &cs_b_use, &rs_b_use, + &ps_b_use, + cntx, + rntm, + &mem_b, + thread_pb + ); + /* Alias b_use so that it's clear this is our current block of - matrix B. */ \ - ctype* b_pc_use = b_use; \ -\ + matrix B. */ + char* b_pc_use = b_use; + /* We don't need to embed the panel stride of B within the auxinfo_t object because this variant iterates through B in the jr loop, which occurs here, within the macrokernel, not within the - millikernel. */ \ - /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_ic = &bszids_pb[1]; \ - thread_ic = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ -\ - /* Compute the IC loop thread range for the current thread. */ \ - dim_t ic_start, ic_end; \ - bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); \ - const dim_t m_local = ic_end - ic_start; \ -\ - /* Compute number of primary and leftover components of the IC loop. */ \ - /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ \ - const dim_t ic_left = m_local % MC; \ -\ - /* Loop over the m dimension (MC rows at a time). */ \ - /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ \ - for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) \ - { \ - /* Calculate the thread's current IC block dimension. */ \ - const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); \ -\ - ctype* a_ic = a_pc + ii * icstep_a; \ - ctype* c_ic = c_jc + ii * icstep_c; \ -\ - ctype* a_use; \ - inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Set the bszid_t array and thrinfo_t pointer based on whether - we will be packing B. If we won't be packing A, we alias to - the _ic variables so that code further down can unconditionally - reference the _pa variables. 
Note that *if* we will be packing - A, the thrinfo_t node will have already been created by a - previous call to bli_thrinfo_grow(), since bszid values of - BLIS_NO_PART cause the tree to grow by two (e.g. to the next - bszid that is a normal bszid_t value). */ \ - bszid_t* bszids_pa; \ - if ( packa ) { bszids_pa = &bszids_ic[1]; \ - thread_pa = bli_thrinfo_sub_node( thread_ic ); } \ - else { bszids_pa = &bszids_ic[0]; \ - thread_pa = thread_ic; } \ -\ + millikernel. */ + /*bli_auxinfo_set_ps_b( ps_b_use, &aux );*/ + + /* Compute the IC loop thread range for the current thread. */ + dim_t ic_start, ic_end; + bli_thread_range_sub( thread_ic, m, MR, FALSE, &ic_start, &ic_end ); + const dim_t m_local = ic_end - ic_start; + + /* Compute number of primary and leftover components of the IC loop. */ + /*const dim_t ic_iter = ( m_local + MC - 1 ) / MC;*/ + const dim_t ic_left = m_local % MC; + + /* Loop over the m dimension (MC rows at a time). */ + /*for ( dim_t ii = 0; ii < ic_iter; ii += 1 )*/ + for ( dim_t ii = ic_start; ii < ic_end; ii += MC ) + { + /* Calculate the thread's current IC block dimension. */ + const dim_t mc_cur = ( MC <= ic_end - ii ? MC : ic_left ); + + const char* a_ic = a_pc + ii * icstep_a; + char* c_ic = c_jc + ii * icstep_c; + + char* a_use; + inc_t rs_a_use, cs_a_use, ps_a_use; + /* Determine the packing buffer and related parameters for matrix A. (If A will not be packed, then a_use will be set to point to a and the _a_use strides will be set accordingly.) Then call the packm sup variant chooser, which will call the appropriate - implementation based on the schema deduced from the stor_id. */ \ - PASTEMAC(ch,packm_sup_a) \ - ( \ - packa, \ - BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ \ - stor_id, /* a "block of A." */ \ - BLIS_NO_TRANSPOSE, \ - MC, KC, /* This "block of A" is (at most) MC x KC. 
*/ \ - mc_cur, kc_cur, MR, \ - &one_local, \ - a_ic, rs_a, cs_a, \ - &a_use, &rs_a_use, &cs_a_use, \ - &ps_a_use, \ - cntx, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ -\ + implementation based on the schema deduced from the stor_id. */ + bli_packm_sup + ( + packa, + BLIS_BUFFER_FOR_A_BLOCK, /* This algorithm packs matrix A to */ + stor_id, /* a "block of A." */ + BLIS_NO_TRANSPOSE, + dt, + MC, KC, /* This "block of A" is (at most) MC x KC. */ + mc_cur, kc_cur, MR, + one, + a_ic, rs_a, cs_a, + ( void** )&a_use, &rs_a_use, &cs_a_use, + &ps_a_use, + cntx, + rntm, + &mem_a, + thread_pa + ); + /* Alias a_use so that it's clear this is our current block of - matrix A. */ \ - ctype* a_ic_use = a_use; \ -\ + matrix A. */ + char* a_ic_use = a_use; + /* Embed the panel stride of A within the auxinfo_t object. The millikernel will query and use this to iterate through - micropanels of A (if needed). */ \ - bli_auxinfo_set_ps_a( ps_a_use, &aux ); \ -\ - /* Grow the thrinfo_t tree. */ \ - bszid_t* bszids_jr = &bszids_pa[1]; \ - thread_jr = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ -\ - /* Compute number of primary and leftover components of the JR loop. */ \ - dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; \ - dim_t jr_left = nc_cur % NR; \ -\ + micropanels of A (if needed). */ + bli_auxinfo_set_ps_a( ps_a_use, &aux ); + + /* Compute number of primary and leftover components of the JR loop. */ + dim_t jr_iter = ( nc_cur + NR - 1 ) / NR; + dim_t jr_left = nc_cur % NR; + /* An optimization: allow the last jr iteration to contain up to NRE columns of C and B. (If NRE > NR, the mkernel has agreed to handle these cases.) Note that this prevents us from declaring jr_iter and jr_left as const. NOTE: We forgo this optimization when packing B - since packing an extended edge case is not yet supported. 
*/ \ - if ( !packb && !is_mt ) \ - if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) \ - { \ - jr_iter--; jr_left += NR; \ - } \ -\ - /* Compute the JR loop thread range for the current thread. */ \ - dim_t jr_start, jr_end; \ - bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); \ -\ - /* Loop over the n dimension (NR columns at a time). */ \ - /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ \ - for ( dim_t j = jr_start; j < jr_end; j += 1 ) \ - { \ - const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); \ -\ + since packing an extended edge case is not yet supported. */ + if ( !packb && !is_mt ) + if ( NRE != 0 && 1 < jr_iter && jr_left != 0 && jr_left <= NRE ) + { + jr_iter--; jr_left += NR; + } + + /* Compute the JR loop thread range for the current thread. */ + dim_t jr_start, jr_end; + bli_thread_range_sub( thread_jr, jr_iter, 1, FALSE, &jr_start, &jr_end ); + + /* Loop over the n dimension (NR columns at a time). */ + /*for ( dim_t j = 0; j < jr_iter; j += 1 )*/ + for ( dim_t j = jr_start; j < jr_end; j += 1 ) + { + const dim_t nr_cur = ( bli_is_not_edge_f( j, jr_iter, jr_left ) ? NR : jr_left ); + /* - ctype* b_jr = b_pc_use + j * jrstep_b; \ - */ \ - ctype* b_jr = b_pc_use + j * ps_b_use; \ - ctype* c_jr = c_ic + j * jrstep_c; \ -\ + ctype* b_jr = b_pc_use + j * jrstep_b; + */ + const char* b_jr = b_pc_use + j * ps_b_use * dt_size; + char* c_jr = c_ic + j * jrstep_c; + /* - const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; \ - const dim_t ir_left = mc_cur % MR; \ - */ \ -\ - /* Loop over the m dimension (MR rows at a time). */ \ - { \ - /* Invoke the gemmsup millikernel. 
*/ \ - gemmsup_ker \ - ( \ - conja, \ - conjb, \ - mc_cur, \ - nr_cur, \ - kc_cur, \ - alpha_cast, \ - a_ic_use, rs_a_use, cs_a_use, \ - b_jr, rs_b_use, cs_b_use, \ - beta_use, \ - c_jr, rs_c, cs_c, \ - &aux, \ - cntx \ - ); \ - } \ - } \ - } \ -\ + const dim_t ir_iter = ( mc_cur + MR - 1 ) / MR; + const dim_t ir_left = mc_cur % MR; + */ + + /* Loop over the m dimension (MR rows at a time). */ + { + /* Invoke the gemmsup millikernel. */ + gemmsup_ker + ( + conja, + conjb, + mc_cur, + nr_cur, + kc_cur, + ( void* )buf_alpha, + ( void* )a_ic_use, rs_a_use, cs_a_use, + ( void* )b_jr, rs_b_use, cs_b_use, + ( void* )beta_use, + ( void* )c_jr, rs_c, cs_c, + &aux, + ( cntx_t* )cntx + ); + } + } + } + /* NOTE: This barrier is only needed if we are packing B (since - that matrix is packed within the pc loop of this variant). */ \ - if ( packb ) bli_thread_barrier( thread_pb ); \ - } \ - } \ -\ - /* Release any memory that was acquired for packing matrices A and B. */ \ - PASTEMAC(ch,packm_sup_finalize_mem_a) \ - ( \ - packa, \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTEMAC(ch,packm_sup_finalize_mem_b) \ - ( \ - packb, \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ -\ + that matrix is packed within the pc loop of this variant). */ + if ( packb ) bli_thread_barrier( thread_pb ); + } + } + + /* Release any memory that was acquired for packing matrices A and B. 
*/ + bli_packm_sup_finalize_mem + ( + packa, + rntm, + &mem_a, + thread_pa + ); + bli_packm_sup_finalize_mem + ( + packb, + rntm, + &mem_b, + thread_pb + ); + /* -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); \ -PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); \ -*/ \ +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: b1", kc_cur, nr_cur, b_jr, rs_b, cs_b, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: a1", mr_cur, kc_cur, a_ir, rs_a, cs_a, "%4.1f", "" ); +PASTEMAC(ch,fprintm)( stdout, "gemmsup_ref_var2: c ", mr_cur, nr_cur, c_ir, rs_c, cs_c, "%4.1f", "" ); +*/ } -INSERT_GENTFUNC_BASIC0( gemmsup_ref_var2m ) - diff --git a/frame/3/bli_l3_sup_vars.h b/frame/3/bli_l3_sup_vars.h index df9a747abd..be6b17f390 100644 --- a/frame/3/bli_l3_sup_vars.h +++ b/frame/3/bli_l3_sup_vars.h @@ -89,32 +89,6 @@ void PASTEMAC(ch,varname) \ INSERT_GENTPROT_BASIC0( gemmsup_ref_var1 ) INSERT_GENTPROT_BASIC0( gemmsup_ref_var2 ) -#undef GENTPROT -#define GENTPROT( ctype, ch, varname ) \ -\ -void PASTEMAC(ch,varname) \ - ( \ - bool packa, \ - bool packb, \ - conj_t conja, \ - conj_t conjb, \ - dim_t m, \ - dim_t n, \ - dim_t k, \ - void* alpha, \ - void* a, inc_t rs_a, inc_t cs_a, \ - void* b, inc_t rs_b, inc_t cs_b, \ - void* beta, \ - void* c, inc_t rs_c, inc_t cs_c, \ - stor3_t eff_id, \ - cntx_t* cntx, \ - rntm_t* rntm, \ - thrinfo_t* thread \ - ); - -INSERT_GENTPROT_BASIC0( gemmsup_ref_var1n ) -INSERT_GENTPROT_BASIC0( gemmsup_ref_var2m ) - // ----------------------------------------------------------------------------- BLIS_INLINE void bli_gemmsup_ref_var1n2m_opt_cases diff --git a/frame/3/gemm/bli_gemm_front.c b/frame/3/gemm/bli_gemm_front.c index 1ae904abf9..5f992bd679 100644 --- a/frame/3/gemm/bli_gemm_front.c +++ b/frame/3/gemm/bli_gemm_front.c @@ 
-53,22 +53,6 @@ void bli_gemm_front obj_t b_local; obj_t c_local; - // If C has a zero dimension, return early. - if ( bli_obj_has_zero_dim( c ) ) - { - return; - } - - // If alpha is zero, or if A or B has a zero dimension, scale C by beta - // and return early. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) || - bli_obj_has_zero_dim( a ) || - bli_obj_has_zero_dim( b ) ) - { - bli_scalm( beta, c ); - return; - } - #if 0 #ifdef BLIS_ENABLE_SMALL_MATRIX // Only handle small problems separately for homogeneous datatypes. diff --git a/frame/3/gemmt/bli_gemmt_front.c b/frame/3/gemmt/bli_gemmt_front.c index e291b5f275..49b32c9762 100644 --- a/frame/3/gemmt/bli_gemmt_front.c +++ b/frame/3/gemmt/bli_gemmt_front.c @@ -53,22 +53,6 @@ void bli_gemmt_front obj_t b_local; obj_t c_local; - // If C has a zero dimension, return early. - if ( bli_obj_has_zero_dim( c ) ) - { - return; - } - - // If alpha is zero, or if A or B has a zero dimension, scale C by beta - // and return early. - if ( bli_obj_equals( alpha, &BLIS_ZERO ) || - bli_obj_has_zero_dim( a ) || - bli_obj_has_zero_dim( b ) ) - { - bli_scalm( beta, c ); - return; - } - // Alias A, B, and C in case we need to apply transformations. bli_obj_alias_to( a, &a_local ); bli_obj_alias_to( b, &b_local ); From 7a8a58de24dd1ea72215108ce72237c79a53e455 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 29 Sep 2022 10:31:16 -0500 Subject: [PATCH 26/32] Fix some bugs in bli_pool.c 1. Add a check for pool exhaustion when freeing blocks. This detects double-free and other bad conditions without segfault. 2. Make sure to copy *all* block pointers when growing the pool size. Previously, checked-out block pointers were not copied, leading to the presence of uninitialized data. 
--- frame/base/bli_pool.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 684b0ef736..2016d4e2b5 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -335,6 +335,9 @@ void bli_pool_checkin_block // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); + // Check for double-free and other conditions which exhaust the memory pool + if ( top_index == 0 ) bli_abort(); + #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_pool_checkin_block(): checking in block %d of size %d " "(align %d, offset %d).\n", @@ -407,10 +410,8 @@ void bli_pool_grow const siz_t top_index = bli_pool_top_index( pool ); // Copy the contents of the old block_ptrs array to the new/resized - // array. Notice that we can begin with top_index since all entries - // from 0 to top_index-1 have been (and are currently) checked out - // to threads. - for ( dim_t i = top_index; i < num_blocks_cur; ++i ) + // array. + for ( dim_t i = 0; i < num_blocks_cur; ++i ) { block_ptrs_new[i] = block_ptrs_cur[i]; } From 989d28250267ff0c532eef9703144ccdcc4de0b7 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Thu, 29 Sep 2022 10:32:52 -0500 Subject: [PATCH 27/32] Add an --enable-asan option. This option (disbaled by default) enables compiling and linking with the Address Sanitizer library (ASan), via the -fsanitize=address flag supported by clang, gcc, and probably others. This flag is included for all files *except* optimized kernels, since it usually reguires an extra register which violates the constraints for many gemm microkernels. 
--- Makefile | 1 + build/config.mk.in | 3 +++ common.mk | 17 +++++++++++++++++ configure | 29 ++++++++++++++++++++++++++++- 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5605dd8fc3..cb86b117d8 100644 --- a/Makefile +++ b/Makefile @@ -1121,6 +1121,7 @@ showconfig: check-env @echo "install libdir: $(INSTALL_LIBDIR)" @echo "install includedir: $(INSTALL_INCDIR)" @echo "install sharedir: $(INSTALL_SHAREDIR)" + @echo "ASan support: $(ENABLE_ASAN)" @echo "debugging status: $(DEBUG_TYPE)" @echo "multithreading status: $(THREADING_MODEL)" @echo "enable BLAS API? $(MK_ENABLE_BLAS)" diff --git a/build/config.mk.in b/build/config.mk.in index 7ef8c6bd00..867293ed46 100644 --- a/build/config.mk.in +++ b/build/config.mk.in @@ -121,6 +121,9 @@ PYTHON := @PYTHON@ CFLAGS_PRESET := @cflags_preset@ LDFLAGS_PRESET := @ldflags_preset@ +# Whether to compile and link the Address Sanitizer library +ENABLE_ASAN := @enable_asan@ + # The level of debugging info to generate. 
DEBUG_TYPE := @debug_type@ diff --git a/common.mk b/common.mk index 33713e9f5f..ab28f68fc6 100644 --- a/common.mk +++ b/common.mk @@ -118,6 +118,7 @@ get-noopt-cxxflags-for = $(strip $(CFLAGS_PRESET) \ get-refinit-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ -DBLIS_CNAME=$(1) \ + $(ASAN_CPPFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ -DBLIS_IN_REF_KERNEL=1 \ @@ -129,6 +130,7 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ $(COMPSIMDFLAGS) \ -DBLIS_CNAME=$(1) \ + $(ASAN_CPPFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ -DBLIS_IN_REF_KERNEL=1 \ @@ -137,12 +139,14 @@ get-refkern-cflags-for = $(strip $(call load-var-for,CROPTFLAGS,$(1)) \ get-config-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ + $(ASAN_CPPFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) get-frame-cflags-for = $(strip $(call load-var-for,COPTFLAGS,$(1)) \ $(call get-noopt-cflags-for,$(1)) \ + $(ASAN_CPPFLAGS) \ $(BUILD_CPPFLAGS) \ $(BUILD_SYMFLAGS) \ ) @@ -553,6 +557,11 @@ ifeq ($(DEBUG_TYPE),sde) LDFLAGS := $(filter-out $(LIBMEMKIND),$(LDFLAGS)) endif +# Never use libmemkind with Intel SDE. +ifeq ($(ENABLE_ASAN),yes) +LDFLAGS += -fsanitize=address +endif + # Specify the shared library's 'soname' field. # NOTE: The flag for creating shared objects is different for Linux and OS X. 
ifeq ($(OS_NAME),Darwin) @@ -786,6 +795,14 @@ $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CXXLANGFLAGS,$(c)) CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L $(foreach c, $(CONFIG_LIST_FAM), $(eval $(call append-var-for,CPPROCFLAGS,$(c)))) +# --- Address Sanitizer flags --- + +ifeq ($(ENABLE_ASAN),yes) +ASAN_CPPFLAGS := -fsanitize=address +else +ASAN_CPPFLAGS := +endif + # --- Threading flags --- # NOTE: We don't have to explicitly omit -pthread when --disable-system is given diff --git a/configure b/configure index a6018edab2..94d96b54af 100755 --- a/configure +++ b/configure @@ -214,6 +214,15 @@ print_usage() echo " Enable (disable by default) output to stdout that traces" echo " the allocation and freeing of memory, including the names" echo " of the functions that triggered the allocation/freeing." + echo " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE." + echo " Please use only for informational/debugging purposes." + echo " " + echo " --enable-asan, --disable-asan" + echo " " + echo " Enable (disable by default) compiling and linking" + echo " framework code with the Address Sanitizer (Asan) library." + echo " Optimized kernels are NOT compiled with ASan support" + echo " due to limitations of register assignment in inline assembly." echo " Enabling this option WILL NEGATIVELY IMPACT PERFORMANCE." echo " Please use only for informational/debugging purposes." echo " " @@ -2438,6 +2447,9 @@ main() debug_type='' debug_flag='' + # A flag indicating whether Address Sanitizer should be used. + enable_asan='no' + # The system flag. enable_system='yes' @@ -2556,6 +2568,12 @@ main() debug_flag=1 debug_type=noopt ;; + enable-asan) + enable_asan='yes' + ;; + disable-asan) + enable_asan='no' + ;; enable-debug=*) debug_flag=1 debug_type=${OPTARG#*=} @@ -3344,6 +3362,14 @@ main() echo "${script_name}: no preset LDFLAGS detected." fi + # Check if the ASan flag was specified. 
+ if [ "x${enable_asan}" = "xyes" ]; then + echo "${script_name}: enabling ASan support (except optimized kernels)." + else + enable_asan='no' + echo "${script_name}: ASan support disabled." + fi + # Check if the debug flag was specified. if [ -n "${debug_flag}" ]; then if [ "x${debug_type}" = "xopt" ]; then @@ -3758,7 +3784,7 @@ main() # Create a #define for the configuration family (config_name). uconf=$(echo ${config_name} | tr '[:lower:]' '[:upper:]') config_name_define="#define BLIS_FAMILY_${uconf}\n" - + # Create a list of #defines, one for each configuration in config_list. config_list_defines="" for conf in ${config_list}; do @@ -3853,6 +3879,7 @@ main() | sed -e "s/@libpthread@/${libpthread_esc}/g" \ | sed -e "s/@cflags_preset@/${cflags_preset_esc}/g" \ | sed -e "s/@ldflags_preset@/${ldflags_preset_esc}/g" \ + | sed -e "s/@enable_asan@/${enable_asan}/g" \ | sed -e "s/@debug_type@/${debug_type}/g" \ | sed -e "s/@enable_system@/${enable_system}/g" \ | sed -e "s/@threading_model@/${threading_model}/g" \ From aef6656ddb1799e27f89b96f872b121e5b871aab Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 3 Oct 2022 10:57:09 -0500 Subject: [PATCH 28/32] Reinstate sanity check in bli_pool_finalize. Reinstate check for checked-out blocks upon finalization. A flag has been added to indicate that the pool is actually under reinitialization (where checked-out blocks are OK), which temporarily disables the check. A memory leak where blocks are not checked back in is now correctly detected upon exit. --- frame/base/bli_apool.c | 2 +- frame/base/bli_pba.c | 6 +++--- frame/base/bli_pool.c | 22 +++++++++------------- frame/base/bli_pool.h | 3 ++- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/frame/base/bli_apool.c b/frame/base/bli_apool.c index a42c7103e5..693e91bf92 100644 --- a/frame/base/bli_apool.c +++ b/frame/base/bli_apool.c @@ -188,7 +188,7 @@ void bli_apool_free_block if ( pool != NULL ) { // Finalize the pool. 
- bli_pool_finalize( pool ); + bli_pool_finalize( pool, FALSE ); #ifdef BLIS_ENABLE_MEM_TRACING printf( "bli_apool_free_block(): pool_t %d: ", ( int )i ); diff --git a/frame/base/bli_pba.c b/frame/base/bli_pba.c index 91532c757b..6b3456dfbe 100644 --- a/frame/base/bli_pba.c +++ b/frame/base/bli_pba.c @@ -386,9 +386,9 @@ void bli_pba_finalize_pools pool_t* pool_c = bli_pba_pool( index_c, pba ); // Finalize the memory pools for A, B, and C. - bli_pool_finalize( pool_a ); - bli_pool_finalize( pool_b ); - bli_pool_finalize( pool_c ); + bli_pool_finalize( pool_a, FALSE ); + bli_pool_finalize( pool_b, FALSE ); + bli_pool_finalize( pool_c, FALSE ); } // ----------------------------------------------------------------------------- diff --git a/frame/base/bli_pool.c b/frame/base/bli_pool.c index 2016d4e2b5..b3c3ee85fa 100644 --- a/frame/base/bli_pool.c +++ b/frame/base/bli_pool.c @@ -115,7 +115,8 @@ void bli_pool_init void bli_pool_finalize ( - pool_t* pool + pool_t* pool, + bool reinit ) { // NOTE: This implementation assumes that either: @@ -129,24 +130,22 @@ void bli_pool_finalize // Query the total number of blocks currently allocated. const siz_t num_blocks = bli_pool_num_blocks( pool ); - // NOTE: This sanity check has been disabled because bli_pool_reinit() - // is currently implemented in terms of bli_pool_finalize() followed by - // bli_pool_init(). If that _reinit() takes place when some blocks are - // checked out, then we would expect top_index != 0, and therefore this - // check is not universally appropriate. -#if 0 // Query the top_index of the pool. const siz_t top_index = bli_pool_top_index( pool ); // Sanity check: The top_index should be zero. - if ( top_index != 0 ) + // NOTE: This sanity check is disabled when called from bli_pool_reinit() + // because it is currently implemented in terms of bli_pool_finalize() followed by + // bli_pool_init(). 
If that _reinit() takes place when some blocks are + // checked out, then we would expect top_index != 0, and therefore this + // check is not universally appropriate. + if ( top_index != 0 && !reinit ) { printf( "bli_pool_finalize(): final top_index == %d (expected 0); block_size: %d.\n", ( int )top_index, ( int )bli_pool_block_size( pool ) ); printf( "bli_pool_finalize(): Implication: not all blocks were checked back in!\n" ); bli_abort(); } -#endif // Query the free() function pointer for the pool. free_ft free_fp = bli_pool_free_fp( pool ); @@ -215,7 +214,7 @@ void bli_pool_reinit // those blocks back into the pool. (This condition can be detected // since the block size is encoded into each pblk, which is copied // upon checkout.) - bli_pool_finalize( pool ); + bli_pool_finalize( pool, TRUE ); // Reinitialize the pool with the new parameters, in particular, // the new block size. @@ -406,9 +405,6 @@ void bli_pool_grow = bli_malloc_intl( block_ptrs_len_new * sizeof( pblk_t ), &r_val ); - // Query the top_index of the pool. - const siz_t top_index = bli_pool_top_index( pool ); - // Copy the contents of the old block_ptrs array to the new/resized // array. for ( dim_t i = 0; i < num_blocks_cur; ++i ) diff --git a/frame/base/bli_pool.h b/frame/base/bli_pool.h index 0b16ae8eea..6f199f7a4c 100644 --- a/frame/base/bli_pool.h +++ b/frame/base/bli_pool.h @@ -228,7 +228,8 @@ void bli_pool_init ); void bli_pool_finalize ( - pool_t* pool + pool_t* pool, + bool reinit ); void bli_pool_reinit ( From 8d3a84ca0993d325c8e3ea95c173dca917963bbf Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Mon, 3 Oct 2022 12:33:44 -0500 Subject: [PATCH 29/32] Fix memory leak in ukr tests. 
--- frame/3/bli_l3_thrinfo.c | 6 +++++- frame/thread/bli_thrinfo.c | 12 ++++++++++++ testsuite/src/test_gemm_ukr.c | 8 ++------ testsuite/src/test_gemmtrsm_ukr.c | 8 ++------ testsuite/src/test_libblis.c | 10 ++++------ testsuite/src/test_libblis.h | 2 +- testsuite/src/test_trsm_ukr.c | 8 ++------ 7 files changed, 28 insertions(+), 26 deletions(-) diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index c350bc69b2..31ec87c61f 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -44,12 +44,16 @@ thrinfo_t* bli_l3_thrinfo_create const cntl_t* cntl ) { + pool_t* pool = NULL; + if ( array != NULL ) + pool = bli_apool_array_elem( id, array ); + // Create the root thrinfo_t node. thrinfo_t* root = bli_thrinfo_create_root ( gl_comm, id, - bli_apool_array_elem( id, array ), + pool, bli_pba_query() ); diff --git a/frame/thread/bli_thrinfo.c b/frame/thread/bli_thrinfo.c index 0d1126ad55..85ddcbab0b 100644 --- a/frame/thread/bli_thrinfo.c +++ b/frame/thread/bli_thrinfo.c @@ -99,6 +99,8 @@ void bli_thrinfo_free thrinfo_t* thrinfo_sub_prenode = bli_thrinfo_sub_prenode( thread ); thrinfo_t* thrinfo_sub_node = bli_thrinfo_sub_node( thread ); pool_t* sba_pool = bli_thread_sba_pool( thread ); + mem_t* cntl_mem_p = bli_thread_mem( thread ); + pba_t* pba = bli_thread_pba( thread ); // Recursively free all children of the current thrinfo_t. if ( thrinfo_sub_prenode != NULL ) @@ -127,6 +129,16 @@ void bli_thrinfo_free printf( "bli_thrinfo_free(): " ); #endif + // Free any allocated memory from the pba. + if ( bli_mem_is_alloc( cntl_mem_p ) ) + { + bli_pba_release + ( + pba, + cntl_mem_p + ); + } + // Free the thrinfo_t struct. 
bli_sba_release( sba_pool, thread ); } diff --git a/testsuite/src/test_gemm_ukr.c b/testsuite/src/test_gemm_ukr.c index 655c9a7586..f3b5f7b520 100644 --- a/testsuite/src/test_gemm_ukr.c +++ b/testsuite/src/test_gemm_ukr.c @@ -231,8 +231,6 @@ void libblis_test_gemm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - array_t* array = bli_sba_checkout_array( 1 ); - // Transpose B to B^T for packing. bli_obj_induce_trans( &b ); @@ -246,8 +244,7 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - array + cntx ); thrinfo_t* thread_b = libblis_test_pobj_create ( @@ -257,8 +254,7 @@ void libblis_test_gemm_ukr_experiment BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - array + cntx ); // Transpose B^T back to B and Bp^T back to Bp. diff --git a/testsuite/src/test_gemmtrsm_ukr.c b/testsuite/src/test_gemmtrsm_ukr.c index 562876792b..cf2b9d0409 100644 --- a/testsuite/src/test_gemmtrsm_ukr.c +++ b/testsuite/src/test_gemmtrsm_ukr.c @@ -283,8 +283,6 @@ void libblis_test_gemmtrsm_ukr_experiment bli_copym( &b11, &c11 ); bli_copym( &c11, &c11_save ); - array_t* array = bli_sba_checkout_array( 1 ); - // Create pack objects for a and b, and pack them to ap and bp, // respectively. thrinfo_t* thread_a = libblis_test_pobj_create @@ -295,8 +293,7 @@ void libblis_test_gemmtrsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - array + cntx ); // Set the diagonal offset of ap. @@ -331,8 +328,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - array + cntx ); // Transpose B^T back to B and Bp^T back to Bp. 
diff --git a/testsuite/src/test_libblis.c b/testsuite/src/test_libblis.c index 80ab263365..068294445e 100644 --- a/testsuite/src/test_libblis.c +++ b/testsuite/src/test_libblis.c @@ -2632,7 +2632,7 @@ void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, c } -thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, array_t* array ) +thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ) { bool does_inv_diag; @@ -2642,12 +2642,10 @@ thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, inv rntm_t rntm; bli_rntm_init( &rntm ); - pool_t* pool = bli_apool_array_elem( 0, array ); - // Create a control tree node for the packing operation. cntl_t* cntl = bli_packm_cntl_create_node ( - pool, + NULL, // pass NULL as the pool so that malloc() is used. NULL, // func ptr is not referenced b/c we don't call via l3 _int(). bmult_id_m, bmult_id_n, @@ -2659,13 +2657,13 @@ thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, inv NULL // no child node needed ); - thrinfo_t* thread = bli_l3_thrinfo_create( 0, &BLIS_SINGLE_COMM, array, &rntm, cntl ); + thrinfo_t* thread = bli_l3_thrinfo_create( 0, &BLIS_SINGLE_COMM, NULL, &rntm, cntl ); // Pack the contents of A to P. bli_packm_blk_var1( a, p, cntx, cntl, thread ); // Free the control tree. - bli_l3_cntl_free( pool, cntl ); + bli_l3_cntl_free( NULL, cntl ); // Return the thread control tree pointer so the caller can free the thrinfo_t and its // mem_t entry later on. 
diff --git a/testsuite/src/test_libblis.h b/testsuite/src/test_libblis.h index 95920b7e51..78135deda1 100644 --- a/testsuite/src/test_libblis.h +++ b/testsuite/src/test_libblis.h @@ -418,7 +418,7 @@ void fill_string_with_n_spaces( char* str, unsigned int n_spaces ); // --- Create object --- void libblis_test_mobj_create( test_params_t* params, num_t dt, trans_t trans, char storage, dim_t m, dim_t n, obj_t* a ); -thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx, array_t* array ); +thrinfo_t* libblis_test_pobj_create( bszid_t bmult_id_m, bszid_t bmult_id_n, invdiag_t inv_diag, pack_t pack_schema, packbuf_t pack_buf, obj_t* a, obj_t* p, cntx_t* cntx ); void libblis_test_vobj_create( test_params_t* params, num_t dt, char storage, dim_t m, obj_t* x ); // --- Randomize/initialize object --- diff --git a/testsuite/src/test_trsm_ukr.c b/testsuite/src/test_trsm_ukr.c index 985f07daf4..df8c2b8eab 100644 --- a/testsuite/src/test_trsm_ukr.c +++ b/testsuite/src/test_trsm_ukr.c @@ -232,8 +232,6 @@ void libblis_test_trsm_ukr_experiment libblis_test_mobj_randomize( params, TRUE, &c ); bli_copym( &c, &c_save ); - array_t* array = bli_sba_checkout_array( 1 ); - // Create pack objects for a and b, and pack them to ap and bp, // respectively. thrinfo_t* thread_a = libblis_test_pobj_create @@ -244,8 +242,7 @@ void libblis_test_trsm_ukr_experiment BLIS_PACKED_ROW_PANELS, BLIS_BUFFER_FOR_A_BLOCK, &a, &ap, - cntx, - array + cntx ); // Set the diagonal offset of ap. @@ -277,8 +274,7 @@ bli_printm( "ap", &ap, "%5.2f", "" ); BLIS_PACKED_COL_PANELS, BLIS_BUFFER_FOR_B_PANEL, &b, &bp, - cntx, - array + cntx ); // Transpose B^T back to B and Bp^T back to Bp. From 4b48a63a6e9e3c78ae4deff4cb7c42c7cb67269e Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 5 Oct 2022 14:27:13 -0500 Subject: [PATCH 30/32] Fix threading bugs in TRSM. 
--- frame/1m/packm/bli_packm_blk_var1.c | 4 ++-- frame/3/bli_l3_thrinfo.c | 13 ++++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/frame/1m/packm/bli_packm_blk_var1.c b/frame/1m/packm/bli_packm_blk_var1.c index 33c6d90d83..71aaeb67f6 100644 --- a/frame/1m/packm/bli_packm_blk_var1.c +++ b/frame/1m/packm/bli_packm_blk_var1.c @@ -159,8 +159,8 @@ void bli_packm_blk_var1 // Query the number of threads and thread ids from the current thread's // packm thrinfo_t node. - const dim_t nt = bli_thread_n_way( thread ); - const dim_t tid = bli_thread_work_id( thread ); + const dim_t nt = bli_thread_num_threads( thread ); + const dim_t tid = bli_thread_thread_id( thread ); // Determine the thread range and increment using the current thread's // packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir() diff --git a/frame/3/bli_l3_thrinfo.c b/frame/3/bli_l3_thrinfo.c index 31ec87c61f..97f70bbf2b 100644 --- a/frame/3/bli_l3_thrinfo.c +++ b/frame/3/bli_l3_thrinfo.c @@ -79,7 +79,18 @@ thrinfo_t* bli_l3_thrinfo_grow if ( sub_prenode != NULL ) { - thrinfo_t* thread_chl = bli_l3_thrinfo_grow( thread_cur, rntm, sub_prenode ); + // A pre-node is only used in the IC loop of trsm. In this case, + // we cannot actually thread in the m dimension due to data dependencies + // and so all parallelism must be moved down to the JR loop. + rntm_t rntm_l = *rntm; + const dim_t ic_nway = bli_rntm_ic_ways( &rntm_l ); + const dim_t jr_nway = bli_rntm_jr_ways( &rntm_l ); + bli_rntm_set_ic_ways_only( 1, &rntm_l ); + bli_rntm_set_jr_ways_only( ic_nway*jr_nway, &rntm_l ); + + // Use thread_par instead of thread_cur since we *don't* want to + // do any parallelism at this level. 
+ thrinfo_t* thread_chl = bli_l3_thrinfo_grow( thread_par, &rntm_l, sub_prenode ); bli_thrinfo_set_sub_prenode( thread_chl, thread_cur ); } From 92c1bf4b6f1af661ddaf26c5ec0b6893d36cd172 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 5 Oct 2022 16:34:38 -0500 Subject: [PATCH 31/32] Export needed public symbols Also, update symbols definition file for Windows. It seems this file was quite out-of-date. --- build/libblis-symbols.def | 1484 +++--------------------------------- frame/3/bli_l3_cntl.h | 1 + frame/3/bli_l3_thrinfo.h | 1 + frame/thread/bli_thrinfo.h | 1 + 4 files changed, 125 insertions(+), 1362 deletions(-) diff --git a/build/libblis-symbols.def b/build/libblis-symbols.def index 8d29d73b25..db20ffbca4 100644 --- a/build/libblis-symbols.def +++ b/build/libblis-symbols.def @@ -1,122 +1,69 @@ EXPORTS bli_abort bli_absqsc -bli_absqsc_check -bli_absqsc_qfp bli_acquire_mij bli_acquire_mpart bli_acquire_mpart_b2t bli_acquire_mpart_br2tl bli_acquire_mpart_l2r -bli_acquire_mpart_l2r_check bli_acquire_mpart_mdim bli_acquire_mpart_mndim bli_acquire_mpart_ndim bli_acquire_mpart_r2l bli_acquire_mpart_t2b -bli_acquire_mpart_t2b_check bli_acquire_mpart_tl2br -bli_acquire_mpart_tl2br_check bli_acquire_vi bli_acquire_vpart_b2f bli_acquire_vpart_f2b bli_addd -bli_addd_check bli_addd_ex -bli_addd_ex_qfp bli_addm -bli_addm_check bli_addm_ex -bli_addm_ex_qfp bli_addsc -bli_addsc_check -bli_addsc_qfp bli_addv -bli_addv_check bli_addv_ex -bli_addv_ex_qfp -bli_adjust_strides bli_align_dim_to_mult bli_align_dim_to_size bli_align_ptr_to_size bli_amaxv -bli_amaxv_check bli_amaxv_ex -bli_amaxv_ex_qfp -bli_apool_alloc_block -bli_apool_array_elem -bli_apool_checkin_array -bli_apool_checkout_array -bli_apool_finalize -bli_apool_free_block -bli_apool_grow -bli_apool_init bli_arch_query_id -bli_arch_set_id -bli_arch_set_id_once bli_arch_string -bli_array_elem -bli_array_finalize -bli_array_init -bli_array_resize -bli_array_set_elem bli_asumv -bli_asumv_check bli_asumv_ex 
-bli_asumv_ex_qfp bli_axpbyv -bli_axpbyv_check bli_axpbyv_ex -bli_axpbyv_ex_qfp bli_axpy2v -bli_axpy2v_check bli_axpy2v_ex -bli_axpy2v_ex_qfp bli_axpyd -bli_axpyd_check bli_axpyd_ex -bli_axpyd_ex_qfp bli_axpyf -bli_axpyf_check bli_axpyf_ex -bli_axpyf_ex_qfp bli_axpym -bli_axpym_check bli_axpym_ex -bli_axpym_ex_qfp bli_axpyv -bli_axpyv_check bli_axpyv_ex -bli_axpyv_ex_qfp bli_blksz_create bli_blksz_create_ed bli_blksz_free bli_blksz_init bli_blksz_init_easy bli_blksz_init_ed -bli_blksz_reduce_def_to -bli_blksz_reduce_max_to bli_cabsqsc bli_caddd bli_caddd_ex bli_caddm bli_caddm_ex -bli_caddm_unb_var1 bli_caddsc bli_caddv bli_caddv_ex -bli_calloc_intl bli_camaxv bli_camaxv_ex bli_castm -bli_castm_check bli_castnzm -bli_castnzm_check bli_castv -bli_castv_check bli_casumv bli_casumv_ex -bli_casumv_unb_var1 bli_caxpbyv bli_caxpbyv_ex bli_caxpy2v @@ -127,33 +74,24 @@ bli_caxpyf bli_caxpyf_ex bli_caxpym bli_caxpym_ex -bli_caxpym_unb_var1 bli_caxpyv bli_caxpyv_ex bli_cccastm bli_cccastnzm bli_cccastv bli_cccopysc -bli_ccgemm_ker_var2_md bli_ccopyd bli_ccopyd_ex bli_ccopym bli_ccopym_ex -bli_ccopym_unb_var1 bli_ccopyv bli_ccopyv_ex -bli_ccpackm_blk_var1_md -bli_ccpackm_cxk_1e_md -bli_ccpackm_cxk_1r_md -bli_ccpackm_struc_cxk_md bli_ccxpbym_md bli_ccxpbym_md_ex -bli_ccxpbym_md_unb_var1 bli_cdcastm bli_cdcastnzm bli_cdcastv bli_cdcopysc -bli_cdgemm_ker_var2_md bli_cdivsc bli_cdotaxpyv bli_cdotaxpyv_ex @@ -165,288 +103,111 @@ bli_cdotxf bli_cdotxf_ex bli_cdotxv bli_cdotxv_ex -bli_cdpackm_blk_var1_md -bli_cdpackm_cxk_1e_md -bli_cdpackm_cxk_1r_md -bli_cdpackm_struc_cxk_md bli_cdxpbym_md bli_cdxpbym_md_ex -bli_cdxpbym_md_unb_var1 +bli_ceqm +bli_ceqsc +bli_ceqv bli_cfprintm bli_cfprintv bli_cgemm -bli_cgemm1m -bli_cgemm3m1 -bli_cgemm3mh -bli_cgemm4m1 -bli_cgemm4mb -bli_cgemm4mb_ker_var2 -bli_cgemm4mh bli_cgemm_ex -bli_cgemm_ker_var2 -bli_cgemm_md_c2r_ref -bli_cgemmtrsm_l_ukernel -bli_cgemmtrsm_u_ukernel -bli_cgemm_ukernel +bli_cgemmt +bli_cgemmt_ex bli_cgemv bli_cgemv_ex 
-bli_cgemv_unb_var1 -bli_cgemv_unb_var2 -bli_cgemv_unf_var1 -bli_cgemv_unf_var2 bli_cger bli_cger_ex -bli_cger_unb_var1 -bli_cger_unb_var2 bli_cgetijm +bli_cgetijv bli_cgetsc -bli_check_alignment_is_mult_of_ptr_size -bli_check_alignment_is_power_of_two -bli_check_conformal_dims -bli_check_consistent_datatypes -bli_check_consistent_object_datatypes -bli_check_consistent_object_precisions -bli_check_consistent_precisions -bli_check_datatype_real_proj_of -bli_check_equal_vector_lengths bli_check_error_code_helper -bli_check_floating_datatype -bli_check_floating_object -bli_check_general_object -bli_check_hermitian_object -bli_check_if_exhausted_pool -bli_check_integer_datatype -bli_check_integer_object -bli_check_level3_dims -bli_check_matrix_object -bli_check_matrix_strides -bli_check_nonconstant_datatype -bli_check_nonconstant_object -bli_check_noninteger_datatype -bli_check_noninteger_object -bli_check_nonunit_diag -bli_check_null_pointer -bli_check_object_alias_of -bli_check_object_buffer -bli_check_object_diag_offset_equals -bli_check_object_length_equals -bli_check_object_real_proj_of -bli_check_object_struc -bli_check_object_valid_datatype -bli_check_object_width_equals -bli_check_packm_schema_on_unpack -bli_check_packv_schema_on_unpack -bli_check_real_datatype -bli_check_real_object -bli_check_real_valued_object -bli_check_scalar_object -bli_check_square_object -bli_check_sufficient_stack_buf_size -bli_check_symmetric_object -bli_check_triangular_object -bli_check_upper_or_lower_object -bli_check_valid_1x3_subpart -bli_check_valid_3x1_subpart -bli_check_valid_3x3_subpart -bli_check_valid_arch_id -bli_check_valid_cntl -bli_check_valid_datatype -bli_check_valid_diag -bli_check_valid_error_level -bli_check_valid_kc_mod_mult -bli_check_valid_malloc_buf -bli_check_valid_mc_mod_mult -bli_check_valid_nc_mod_mult -bli_check_valid_packbuf -bli_check_valid_side -bli_check_valid_trans -bli_check_valid_uplo -bli_check_vector_dim_equals -bli_check_vector_object bli_chemm 
-bli_chemm1m -bli_chemm3m1 -bli_chemm3mh -bli_chemm4m1 -bli_chemm4mh bli_chemm_ex bli_chemv bli_chemv_ex -bli_chemv_unb_var1 -bli_chemv_unb_var2 -bli_chemv_unb_var3 -bli_chemv_unb_var4 -bli_chemv_unf_var1 -bli_chemv_unf_var1a -bli_chemv_unf_var3 -bli_chemv_unf_var3a bli_cher bli_cher2 bli_cher2_ex bli_cher2k -bli_cher2k1m -bli_cher2k3m1 -bli_cher2k3mh -bli_cher2k4m1 -bli_cher2k4mh bli_cher2k_ex -bli_cher2_unb_var1 -bli_cher2_unb_var2 -bli_cher2_unb_var3 -bli_cher2_unb_var4 -bli_cher2_unf_var1 -bli_cher2_unf_var4 bli_cher_ex bli_cherk -bli_cherk1m -bli_cherk3m1 -bli_cherk3mh -bli_cherk4m1 -bli_cherk4mh bli_cherk_ex -bli_cherk_l_ker_var2 -bli_cherk_u_ker_var2 -bli_cher_unb_var1 -bli_cher_unb_var2 bli_cinvertd bli_cinvertd_ex bli_cinvertsc bli_cinvertv bli_cinvertv_ex +bli_cinvscald +bli_cinvscald_ex +bli_cinvscalm +bli_cinvscalm_ex +bli_cinvscalv +bli_cinvscalv_ex bli_clock -bli_clock_helper bli_clock_min_diff bli_cmachval bli_cmkherm bli_cmkherm_ex -bli_cmkherm_unb_var1 bli_cmksymm bli_cmksymm_ex -bli_cmksymm_unb_var1 bli_cmktrim bli_cmktrim_ex -bli_cmktrim_unb_var1 bli_cmulsc bli_cnorm1m bli_cnorm1m_ex -bli_cnorm1m_unb_var1 bli_cnorm1v bli_cnorm1v_ex -bli_cnorm1v_unb_var1 bli_cnormfm bli_cnormfm_ex -bli_cnormfm_unb_var1 bli_cnormfsc bli_cnormfv bli_cnormfv_ex -bli_cnormfv_unb_var1 bli_cnormim bli_cnormim_ex -bli_cnormim_unb_var1 bli_cnormiv bli_cnormiv_ex -bli_cnormiv_unb_var1 -bli_cntl_calc_num_threads_in bli_cntl_clear_node bli_cntl_copy bli_cntl_create_node bli_cntl_free bli_cntl_free_node -bli_cntl_free_wo_thrinfo -bli_cntl_free_w_thrinfo bli_cntl_mark_family -bli_cntx_1m_stage -bli_cntx_3m1_stage -bli_cntx_3mh_stage -bli_cntx_4m1_stage -bli_cntx_4mb_stage -bli_cntx_4mh_stage bli_cntx_clear -bli_cntx_ind_stage -bli_cntx_nat_stage bli_cntx_print bli_cntx_set_blkszs bli_cntx_set_ind_blkszs -bli_cntx_set_l1f_kers -bli_cntx_set_l1v_kers -bli_cntx_set_l3_nat_ukrs -bli_cntx_set_packm_kers +bli_cntx_set_l3_sup_handlers +bli_cntx_set_ukr_prefs +bli_cntx_set_ukrs 
bli_copyd -bli_copyd_check bli_copyd_ex -bli_copyd_ex_qfp bli_copym -bli_copym_check bli_copym_ex -bli_copym_ex_qfp bli_copysc -bli_copysc_check bli_copyv -bli_copyv_check bli_copyv_ex -bli_copyv_ex_qfp -bli_cpackm_blk_var1 -bli_cpackm_cxk -bli_cpackm_cxk_1er -bli_cpackm_cxk_3mis -bli_cpackm_cxk_4mi -bli_cpackm_cxk_rih -bli_cpackm_herm_cxk -bli_cpackm_herm_cxk_1er -bli_cpackm_herm_cxk_3mis -bli_cpackm_herm_cxk_4mi -bli_cpackm_herm_cxk_rih -bli_cpackm_struc_cxk -bli_cpackm_struc_cxk_1er -bli_cpackm_struc_cxk_3mis -bli_cpackm_struc_cxk_4mi -bli_cpackm_struc_cxk_rih -bli_cpackm_tri_cxk -bli_cpackm_tri_cxk_1er -bli_cpackm_tri_cxk_3mis -bli_cpackm_tri_cxk_4mi -bli_cpackm_tri_cxk_rih -bli_cpackm_unb_var1 bli_cprintm -bli_cprintm_ex bli_cprintv -bli_cprintv_ex -bli_cpuid_is_bulldozer -bli_cpuid_is_excavator -bli_cpuid_is_haswell -bli_cpuid_is_knl -bli_cpuid_is_penryn -bli_cpuid_is_piledriver -bli_cpuid_is_sandybridge -bli_cpuid_is_skx -bli_cpuid_is_steamroller -bli_cpuid_is_zen -bli_cpuid_query -bli_cpuid_query_id bli_crandm bli_crandm_ex -bli_crandm_unb_var1 bli_crandnm bli_crandnm_ex -bli_crandnm_unb_var1 bli_crandnv bli_crandnv_ex -bli_crandnv_unb_var1 bli_crandv bli_crandv_ex -bli_crandv_unb_var1 bli_cscal2d bli_cscal2d_ex bli_cscal2m bli_cscal2m_ex -bli_cscal2m_unb_var1 bli_cscal2v bli_cscal2v_ex bli_cscald bli_cscald_ex bli_cscalm bli_cscalm_ex -bli_cscalm_unb_var1 bli_cscalv bli_cscalv_ex bli_cscastm @@ -458,42 +219,29 @@ bli_csetd_ex bli_csetid bli_csetid_ex bli_csetijm +bli_csetijv bli_csetm bli_csetm_ex -bli_csetm_unb_var1 bli_csetsc bli_csetv bli_csetv_ex -bli_csgemm_ker_var2_md bli_cshiftd bli_cshiftd_ex -bli_cspackm_blk_var1_md -bli_cspackm_cxk_1e_md -bli_cspackm_cxk_1r_md -bli_cspackm_struc_cxk_md bli_csqrtsc bli_csubd bli_csubd_ex bli_csubm bli_csubm_ex -bli_csubm_unb_var1 bli_csubsc bli_csubv bli_csubv_ex bli_csumsqv bli_csumsqv_ex -bli_csumsqv_unb_var1 bli_cswapv bli_cswapv_ex bli_csxpbym_md bli_csxpbym_md_ex -bli_csxpbym_md_unb_var1 bli_csymm 
-bli_csymm1m -bli_csymm3m1 -bli_csymm3mh -bli_csymm4m1 -bli_csymm4mh bli_csymm_ex bli_csymv bli_csymv_ex @@ -501,89 +249,39 @@ bli_csyr bli_csyr2 bli_csyr2_ex bli_csyr2k -bli_csyr2k1m -bli_csyr2k3m1 -bli_csyr2k3mh -bli_csyr2k4m1 -bli_csyr2k4mh bli_csyr2k_ex bli_csyr_ex bli_csyrk -bli_csyrk1m -bli_csyrk3m1 -bli_csyrk3mh -bli_csyrk4m1 -bli_csyrk4mh bli_csyrk_ex bli_ctrmm -bli_ctrmm1m bli_ctrmm3 -bli_ctrmm31m -bli_ctrmm33m1 -bli_ctrmm33mh -bli_ctrmm34m1 -bli_ctrmm34mh bli_ctrmm3_ex -bli_ctrmm3m1 -bli_ctrmm4m1 bli_ctrmm_ex -bli_ctrmm_ll_ker_var2 -bli_ctrmm_lu_ker_var2 -bli_ctrmm_rl_ker_var2 -bli_ctrmm_ru_ker_var2 bli_ctrmv bli_ctrmv_ex -bli_ctrmv_unb_var1 -bli_ctrmv_unb_var2 -bli_ctrmv_unf_var1 -bli_ctrmv_unf_var2 bli_ctrsm -bli_ctrsm1m -bli_ctrsm3m1 -bli_ctrsm4m1 bli_ctrsm_ex -bli_ctrsm_ll_ker_var2 -bli_ctrsm_l_ukernel -bli_ctrsm_lu_ker_var2 -bli_ctrsm_rl_ker_var2 -bli_ctrsm_ru_ker_var2 -bli_ctrsm_u_ukernel bli_ctrsv bli_ctrsv_ex -bli_ctrsv_unb_var1 -bli_ctrsv_unb_var2 -bli_ctrsv_unf_var1 -bli_ctrsv_unf_var2 -bli_cunpackm_blk_var1 -bli_cunpackm_cxk -bli_cunpackm_unb_var1 bli_cunzipsc bli_cxpbyd bli_cxpbyd_ex bli_cxpbym bli_cxpbym_ex -bli_cxpbym_unb_var1 bli_cxpbyv bli_cxpbyv_ex bli_czcastm bli_czcastnzm bli_czcastv bli_czcopysc -bli_czgemm_ker_var2_md bli_czipsc -bli_czpackm_blk_var1_md -bli_czpackm_cxk_1e_md -bli_czpackm_cxk_1r_md -bli_czpackm_struc_cxk_md bli_czxpbym_md bli_czxpbym_md_ex -bli_czxpbym_md_unb_var1 bli_dabsqsc bli_daddd bli_daddd_ex bli_daddm bli_daddm_ex -bli_daddm_unb_var1 bli_daddsc bli_daddv bli_daddv_ex @@ -591,7 +289,6 @@ bli_damaxv bli_damaxv_ex bli_dasumv bli_dasumv_ex -bli_dasumv_unb_var1 bli_daxpbyv bli_daxpbyv_ex bli_daxpy2v @@ -602,33 +299,24 @@ bli_daxpyf bli_daxpyf_ex bli_daxpym bli_daxpym_ex -bli_daxpym_unb_var1 bli_daxpyv bli_daxpyv_ex bli_dccastm bli_dccastnzm bli_dccastv bli_dccopysc -bli_dcgemm_ker_var2_md bli_dcopyd bli_dcopyd_ex bli_dcopym bli_dcopym_ex -bli_dcopym_unb_var1 bli_dcopyv bli_dcopyv_ex -bli_dcpackm_blk_var1_md 
-bli_dcpackm_cxk_1e_md -bli_dcpackm_cxk_1r_md -bli_dcpackm_struc_cxk_md bli_dcxpbym_md bli_dcxpbym_md_ex -bli_dcxpbym_md_unb_var1 bli_ddcastm bli_ddcastnzm bli_ddcastv bli_ddcopysc -bli_ddgemm_ker_var2_md bli_ddivsc bli_ddotaxpyv bli_ddotaxpyv_ex @@ -640,183 +328,99 @@ bli_ddotxf bli_ddotxf_ex bli_ddotxv bli_ddotxv_ex -bli_ddpackm_blk_var1_md -bli_ddpackm_cxk_1e_md -bli_ddpackm_cxk_1r_md -bli_ddpackm_struc_cxk_md bli_ddxpbym_md bli_ddxpbym_md_ex -bli_ddxpbym_md_unb_var1 -bli_determine_blocksize -bli_determine_blocksize_b -bli_determine_blocksize_b_sub -bli_determine_blocksize_f -bli_determine_blocksize_f_sub +bli_deqm +bli_deqsc +bli_deqv bli_dfprintm bli_dfprintv bli_dgemm -bli_dgemm1m -bli_dgemm3m1 -bli_dgemm3mh -bli_dgemm4m1 -bli_dgemm4mb -bli_dgemm4mb_ker_var2 -bli_dgemm4mh bli_dgemm_ex -bli_dgemm_ker_var2 -bli_dgemmtrsm_l_ukernel -bli_dgemmtrsm_u_ukernel -bli_dgemm_ukernel +bli_dgemmt +bli_dgemmt_ex bli_dgemv bli_dgemv_ex -bli_dgemv_unb_var1 -bli_dgemv_unb_var2 -bli_dgemv_unf_var1 -bli_dgemv_unf_var2 bli_dger bli_dger_ex -bli_dger_unb_var1 -bli_dger_unb_var2 bli_dgetijm +bli_dgetijv bli_dgetsc bli_dhemm -bli_dhemm1m -bli_dhemm3m1 -bli_dhemm3mh -bli_dhemm4m1 -bli_dhemm4mh bli_dhemm_ex bli_dhemv bli_dhemv_ex -bli_dhemv_unb_var1 -bli_dhemv_unb_var2 -bli_dhemv_unb_var3 -bli_dhemv_unb_var4 -bli_dhemv_unf_var1 -bli_dhemv_unf_var1a -bli_dhemv_unf_var3 -bli_dhemv_unf_var3a bli_dher bli_dher2 bli_dher2_ex bli_dher2k -bli_dher2k1m -bli_dher2k3m1 -bli_dher2k3mh -bli_dher2k4m1 -bli_dher2k4mh bli_dher2k_ex -bli_dher2_unb_var1 -bli_dher2_unb_var2 -bli_dher2_unb_var3 -bli_dher2_unb_var4 -bli_dher2_unf_var1 -bli_dher2_unf_var4 bli_dher_ex bli_dherk -bli_dherk1m -bli_dherk3m1 -bli_dherk3mh -bli_dherk4m1 -bli_dherk4mh bli_dherk_ex -bli_dherk_l_ker_var2 -bli_dherk_u_ker_var2 -bli_dher_unb_var1 -bli_dher_unb_var2 bli_dinvertd bli_dinvertd_ex bli_dinvertsc bli_dinvertv bli_dinvertv_ex +bli_dinvscald +bli_dinvscald_ex +bli_dinvscalm +bli_dinvscalm_ex +bli_dinvscalv 
+bli_dinvscalv_ex bli_divsc -bli_divsc_check -bli_divsc_qfp -bli_dlamch bli_dmachval bli_dmkherm bli_dmkherm_ex -bli_dmkherm_unb_var1 bli_dmksymm bli_dmksymm_ex -bli_dmksymm_unb_var1 bli_dmktrim bli_dmktrim_ex -bli_dmktrim_unb_var1 bli_dmulsc bli_dnorm1m bli_dnorm1m_ex -bli_dnorm1m_unb_var1 bli_dnorm1v bli_dnorm1v_ex -bli_dnorm1v_unb_var1 bli_dnormfm bli_dnormfm_ex -bli_dnormfm_unb_var1 bli_dnormfsc bli_dnormfv bli_dnormfv_ex -bli_dnormfv_unb_var1 bli_dnormim bli_dnormim_ex -bli_dnormim_unb_var1 bli_dnormiv bli_dnormiv_ex -bli_dnormiv_unb_var1 bli_dotaxpyv -bli_dotaxpyv_check bli_dotaxpyv_ex -bli_dotaxpyv_ex_qfp bli_dotv -bli_dotv_check bli_dotv_ex -bli_dotv_ex_qfp bli_dotxaxpyf -bli_dotxaxpyf_check bli_dotxaxpyf_ex -bli_dotxaxpyf_ex_qfp bli_dotxf -bli_dotxf_check bli_dotxf_ex -bli_dotxf_ex_qfp bli_dotxv -bli_dotxv_check bli_dotxv_ex -bli_dotxv_ex_qfp -bli_dpackm_blk_var1 -bli_dpackm_cxk -bli_dpackm_herm_cxk -bli_dpackm_struc_cxk -bli_dpackm_tri_cxk -bli_dpackm_unb_var1 bli_dprintm -bli_dprintm_ex bli_dprintv -bli_dprintv_ex bli_drandm bli_drandm_ex -bli_drandm_unb_var1 bli_drandnm bli_drandnm_ex -bli_drandnm_unb_var1 bli_drandnv bli_drandnv_ex -bli_drandnv_unb_var1 bli_drandv bli_drandv_ex -bli_drandv_unb_var1 bli_dscal2d bli_dscal2d_ex bli_dscal2m bli_dscal2m_ex -bli_dscal2m_unb_var1 bli_dscal2v bli_dscal2v_ex bli_dscald bli_dscald_ex bli_dscalm bli_dscalm_ex -bli_dscalm_unb_var1 bli_dscalv bli_dscalv_ex bli_dscastm @@ -828,42 +432,29 @@ bli_dsetd_ex bli_dsetid bli_dsetid_ex bli_dsetijm +bli_dsetijv bli_dsetm bli_dsetm_ex -bli_dsetm_unb_var1 bli_dsetsc bli_dsetv bli_dsetv_ex -bli_dsgemm_ker_var2_md bli_dshiftd bli_dshiftd_ex -bli_dspackm_blk_var1_md -bli_dspackm_cxk_1e_md -bli_dspackm_cxk_1r_md -bli_dspackm_struc_cxk_md bli_dsqrtsc bli_dsubd bli_dsubd_ex bli_dsubm bli_dsubm_ex -bli_dsubm_unb_var1 bli_dsubsc bli_dsubv bli_dsubv_ex bli_dsumsqv bli_dsumsqv_ex -bli_dsumsqv_unb_var1 bli_dswapv bli_dswapv_ex bli_dsxpbym_md bli_dsxpbym_md_ex -bli_dsxpbym_md_unb_var1 
bli_dsymm -bli_dsymm1m -bli_dsymm3m1 -bli_dsymm3mh -bli_dsymm4m1 -bli_dsymm4mh bli_dsymm_ex bli_dsymv bli_dsymv_ex @@ -871,301 +462,79 @@ bli_dsyr bli_dsyr2 bli_dsyr2_ex bli_dsyr2k -bli_dsyr2k1m -bli_dsyr2k3m1 -bli_dsyr2k3mh -bli_dsyr2k4m1 -bli_dsyr2k4mh bli_dsyr2k_ex bli_dsyr_ex bli_dsyrk -bli_dsyrk1m -bli_dsyrk3m1 -bli_dsyrk3mh -bli_dsyrk4m1 -bli_dsyrk4mh bli_dsyrk_ex bli_dtrmm -bli_dtrmm1m bli_dtrmm3 -bli_dtrmm31m -bli_dtrmm33m1 -bli_dtrmm33mh -bli_dtrmm34m1 -bli_dtrmm34mh bli_dtrmm3_ex -bli_dtrmm3m1 -bli_dtrmm4m1 bli_dtrmm_ex -bli_dtrmm_ll_ker_var2 -bli_dtrmm_lu_ker_var2 -bli_dtrmm_rl_ker_var2 -bli_dtrmm_ru_ker_var2 bli_dtrmv bli_dtrmv_ex -bli_dtrmv_unb_var1 -bli_dtrmv_unb_var2 -bli_dtrmv_unf_var1 -bli_dtrmv_unf_var2 bli_dtrsm -bli_dtrsm1m -bli_dtrsm3m1 -bli_dtrsm4m1 bli_dtrsm_ex -bli_dtrsm_ll_ker_var2 -bli_dtrsm_l_ukernel -bli_dtrsm_lu_ker_var2 -bli_dtrsm_rl_ker_var2 -bli_dtrsm_ru_ker_var2 -bli_dtrsm_u_ukernel bli_dtrsv bli_dtrsv_ex -bli_dtrsv_unb_var1 -bli_dtrsv_unb_var2 -bli_dtrsv_unf_var1 -bli_dtrsv_unf_var2 bli_dt_size -bli_dt_size_check bli_dt_string -bli_dt_string_check -bli_dt_union_check -bli_dunpackm_blk_var1 -bli_dunpackm_cxk -bli_dunpackm_unb_var1 bli_dunzipsc bli_dxpbyd bli_dxpbyd_ex bli_dxpbym bli_dxpbym_ex -bli_dxpbym_unb_var1 bli_dxpbyv bli_dxpbyv_ex bli_dzcastm bli_dzcastnzm bli_dzcastv bli_dzcopysc -bli_dzgemm_ker_var2_md bli_dzipsc -bli_dzpackm_blk_var1_md -bli_dzpackm_cxk_1e_md -bli_dzpackm_cxk_1r_md -bli_dzpackm_struc_cxk_md bli_dzxpbym_md bli_dzxpbym_md_ex -bli_dzxpbym_md_unb_var1 +bli_eqm +bli_eqsc +bli_eqv bli_error_checking_is_enabled bli_error_checking_level bli_error_checking_level_set -bli_error_string_for_code -bli_ffree_align -bli_ffree_noalign bli_finalize -bli_finalize_apis -bli_finalize_auto -bli_finalize_once -bli_find_area_trap_l -bli_fmalloc_align -bli_fmalloc_align_check -bli_fmalloc_noalign -bli_fmalloc_post_check bli_fprintm -bli_fprintm_check -bli_fprintm_ex -bli_fprintm_qfp bli_fprintv -bli_fprintv_check -bli_fprintv_ex 
-bli_fprintv_qfp -bli_free_intl bli_free_user -bli_func_create -bli_func_free -bli_func_init -bli_func_init_null -bli_func_is_null -bli_func_is_null_dt -bli_gcd bli_gemm -bli_gemm1m -bli_gemm3m1 -bli_gemm3mh -bli_gemm4m1 -bli_gemm4mb -bli_gemm4mb_ker_var2 -bli_gemm4mh -bli_gemm_basic_check -bli_gemm_blk_var1 -bli_gemm_blk_var2 -bli_gemm_blk_var3 -bli_gemmbp_cntl_create -bli_gemm_check -bli_gemm_cntl_create -bli_gemm_cntl_create_node -bli_gemm_cntl_free -bli_gemm_determine_kc -bli_gemm_determine_kc_b -bli_gemm_determine_kc_f -bli_gemm_direct bli_gemm_ex -bli_gemm_front -bli_gemmind -bli_gemmind_get_avail -bli_gemm_int -bli_gemm_ker_var2 -bli_gemm_ker_var2_md -bli_gemm_md -bli_gemm_md_ccc -bli_gemm_md_ccr -bli_gemm_md_crc -bli_gemm_md_crr -bli_gemm_md_rcc -bli_gemm_md_rcr -bli_gemm_md_rrc -bli_gemm_md_rrr -bli_gemmnat -bli_gemm_packa -bli_gemm_packb -bli_gemm_prune_unref_mparts_k -bli_gemm_prune_unref_mparts_m -bli_gemm_prune_unref_mparts_n -bli_gemmtrsm_l_ukernel_qfp +bli_gemmt +bli_gemmt_ex bli_gemmtrsm_ukernel -bli_gemmtrsm_u_ukernel_qfp bli_gemm_ukernel -bli_gemm_ukernel_qfp bli_gemv -bli_gemv_check bli_gemv_ex -bli_gemv_ex_qfp -bli_gemv_unb_var1 -bli_gemv_unb_var1_qfp -bli_gemv_unb_var2 -bli_gemv_unb_var2_qfp -bli_gemv_unf_var1 -bli_gemv_unf_var1_qfp -bli_gemv_unf_var2 -bli_gemv_unf_var2_qfp bli_ger -bli_ger_check bli_ger_ex -bli_ger_ex_qfp -bli_ger_unb_var1 -bli_ger_unb_var1_qfp -bli_ger_unb_var2 -bli_ger_unb_var2_qfp bli_getijm +bli_getijv bli_getopt bli_getopt_init_state bli_getsc -bli_getsc_check -bli_getsc_qfp -bli_gks_cntx_l3_nat_ukr_is_ref -bli_gks_finalize -bli_gks_init -bli_gks_init_index bli_gks_init_ref_cntx bli_gks_l3_ukr_impl_string bli_gks_l3_ukr_impl_type -bli_gks_lookup_ind_cntx -bli_gks_lookup_nat_cntx bli_gks_query_cntx -bli_gks_query_cntx_noinit bli_gks_query_ind_cntx bli_gks_query_nat_cntx -bli_gks_register_cntx bli_hemm -bli_hemm1m -bli_hemm3m1 -bli_hemm3mh -bli_hemm4m1 -bli_hemm4mh -bli_hemm_basic_check -bli_hemm_check bli_hemm_ex 
-bli_hemm_front -bli_hemmind -bli_hemmind_get_avail -bli_hemmnat bli_hemv -bli_hemv_check bli_hemv_ex -bli_hemv_ex_qfp -bli_hemv_unb_var1 -bli_hemv_unb_var1_qfp -bli_hemv_unb_var2 -bli_hemv_unb_var2_qfp -bli_hemv_unb_var3 -bli_hemv_unb_var3_qfp -bli_hemv_unb_var4 -bli_hemv_unb_var4_qfp -bli_hemv_unf_var1 -bli_hemv_unf_var1a -bli_hemv_unf_var1a_qfp -bli_hemv_unf_var1_qfp -bli_hemv_unf_var3 -bli_hemv_unf_var3a -bli_hemv_unf_var3a_qfp -bli_hemv_unf_var3_qfp bli_her bli_her2 -bli_her2_check bli_her2_ex -bli_her2_ex_qfp bli_her2k -bli_her2k1m -bli_her2k3m1 -bli_her2k3mh -bli_her2k4m1 -bli_her2k4mh -bli_her2k_basic_check -bli_her2k_check bli_her2k_ex -bli_her2k_front -bli_her2kind -bli_her2kind_get_avail -bli_her2knat -bli_her2_unb_var1 -bli_her2_unb_var1_qfp -bli_her2_unb_var2 -bli_her2_unb_var2_qfp -bli_her2_unb_var3 -bli_her2_unb_var3_qfp -bli_her2_unb_var4 -bli_her2_unb_var4_qfp -bli_her2_unf_var1 -bli_her2_unf_var1_qfp -bli_her2_unf_var4 -bli_her2_unf_var4_qfp -bli_her_check bli_her_ex -bli_her_ex_qfp bli_herk -bli_herk1m -bli_herk3m1 -bli_herk3mh -bli_herk4m1 -bli_herk4mh -bli_herk_basic_check -bli_herk_check -bli_herk_determine_kc -bli_herk_determine_kc_b -bli_herk_determine_kc_f -bli_herk_direct bli_herk_ex -bli_herk_front -bli_herkind -bli_herkind_get_avail -bli_herk_l_ker_var2 -bli_herknat -bli_herk_prune_unref_mparts_k -bli_herk_prune_unref_mparts_m -bli_herk_prune_unref_mparts_n -bli_herk_u_ker_var2 -bli_herk_x_ker_var2 -bli_her_unb_var1 -bli_her_unb_var1_qfp -bli_her_unb_var2 -bli_her_unb_var2_qfp bli_ifprintm bli_ifprintv bli_igetsc @@ -1175,13 +544,8 @@ bli_ind_disable_all_dt bli_ind_disable_dt bli_ind_enable bli_ind_enable_dt -bli_ind_finalize -bli_ind_get_impl_string -bli_ind_init -bli_ind_map_cdt_to_index bli_ind_oper_enable_only bli_ind_oper_find_avail -bli_ind_oper_get_avail bli_ind_oper_get_avail_impl_string bli_ind_oper_is_impl bli_info_get_blas_int_type_size @@ -1189,13 +553,15 @@ bli_info_get_enable_blas bli_info_get_enable_cblas 
bli_info_get_enable_memkind bli_info_get_enable_openmp +bli_info_get_enable_openmp_as_default bli_info_get_enable_pba_pools bli_info_get_enable_pthreads +bli_info_get_enable_pthreads_as_default bli_info_get_enable_sandbox bli_info_get_enable_sba_pools -bli_info_get_enable_stay_auto_init bli_info_get_enable_threading bli_info_get_gemm_impl_string +bli_info_get_gemmt_impl_string bli_info_get_gemmtrsm_l_ukr_impl_string bli_info_get_gemmtrsm_u_ukr_impl_string bli_info_get_gemm_ukr_impl_string @@ -1209,7 +575,14 @@ bli_info_get_int_type_size_str bli_info_get_max_type_size bli_info_get_num_fp_types bli_info_get_page_size -bli_info_get_pool_addr_align_size +bli_info_get_pool_addr_align_size_a +bli_info_get_pool_addr_align_size_b +bli_info_get_pool_addr_align_size_c +bli_info_get_pool_addr_align_size_gen +bli_info_get_pool_addr_offset_size_a +bli_info_get_pool_addr_offset_size_b +bli_info_get_pool_addr_offset_size_c +bli_info_get_pool_addr_offset_size_gen bli_info_get_simd_align_size bli_info_get_simd_num_registers bli_info_get_simd_size @@ -1227,152 +600,57 @@ bli_info_get_trsm_l_ukr_impl_string bli_info_get_trsm_u_ukr_impl_string bli_info_get_version_str bli_init -bli_init_apis -bli_init_auto -bli_init_once bli_invertd -bli_invertd_check bli_invertd_ex -bli_invertd_ex_qfp bli_invertsc -bli_invertsc_check -bli_invertsc_qfp bli_invertv -bli_invertv_check bli_invertv_ex -bli_invertv_ex_qfp -bli_ipow +bli_invscald +bli_invscald_ex +bli_invscalm +bli_invscalm_ex +bli_invscalv +bli_invscalv_ex bli_iprintm -bli_iprintm_ex bli_iprintv -bli_iprintv_ex bli_isetsc -bli_l0_xsc_check -bli_l0_xx2sc_check -bli_l0_xxsc_check -bli_l1d_ax_check -bli_l1d_axy_check -bli_l1d_x_check -bli_l1d_xy_check -bli_l1m_ax_check -bli_l1m_axy_check -bli_l1m_xy_check -bli_l1v_axby_check -bli_l1v_ax_check -bli_l1v_axy_check -bli_l1v_dot_check -bli_l1v_xby_check -bli_l1v_x_check -bli_l1v_xi_check -bli_l1v_xy_check -bli_l3_basic_check -bli_l3_cntl_create_if bli_l3_cntl_free -bli_l3_determine_kc 
-bli_l3_direct -bli_l3_ind_oper_enable_only -bli_l3_ind_oper_find_avail -bli_l3_ind_oper_get_enable -bli_l3_ind_oper_get_func -bli_l3_ind_oper_set_enable -bli_l3_ind_oper_set_enable_all -bli_l3_ind_set_enable_dt -bli_l3_packm -bli_l3_prune_unref_mparts_k -bli_l3_prune_unref_mparts_m -bli_l3_prune_unref_mparts_n -bli_l3_thread_decorator -bli_l3_thread_entry -bli_l3_thrinfo_create_root -bli_l3_thrinfo_free -bli_l3_thrinfo_free_paths -bli_l3_thrinfo_init_single -bli_l3_thrinfo_print_gemm_paths -bli_l3_thrinfo_print_trsm_paths -bli_lcm -bli_lsame +bli_l3_thrinfo_create bli_machval -bli_malloc_intl bli_malloc_user -bli_mbool_create -bli_mbool_free -bli_mbool_init -bli_pba_acquire_m -bli_pba_compute_pool_block_sizes -bli_pba_compute_pool_block_sizes_dt -bli_pba_finalize -bli_pba_finalize_pools -bli_pba_init -bli_pba_init_pools -bli_pba_pool_size -bli_pba_query -bli_pba_release -bli_memsys_finalize -bli_memsys_init bli_mkherm -bli_mkherm_check bli_mkherm_ex -bli_mkherm_ex_qfp bli_mksymm -bli_mksymm_check bli_mksymm_ex -bli_mksymm_ex_qfp bli_mktrim -bli_mktrim_check bli_mktrim_ex -bli_mktrim_ex_qfp bli_mulsc -bli_mulsc_check -bli_mulsc_qfp -bli_next_prime_factor bli_norm1m -bli_norm1m_check bli_norm1m_ex -bli_norm1m_ex_qfp bli_norm1v -bli_norm1v_check bli_norm1v_ex -bli_norm1v_ex_qfp bli_normfm -bli_normfm_check bli_normfm_ex -bli_normfm_ex_qfp bli_normfsc -bli_normfsc_check -bli_normfsc_qfp bli_normfv -bli_normfv_check bli_normfv_ex -bli_normfv_ex_qfp bli_normim -bli_normim_check bli_normim_ex -bli_normim_ex_qfp bli_normiv -bli_normiv_check bli_normiv_ex -bli_normiv_ex_qfp bli_obj_alloc_buffer -bli_obj_alloc_buffer_check bli_obj_attach_buffer -bli_obj_attach_buffer_check bli_obj_create bli_obj_create_1x1 bli_obj_create_1x1_with_attached_buffer -bli_obj_create_check bli_obj_create_conf_to -bli_obj_create_const_check -bli_obj_create_scalar_check bli_obj_create_with_attached_buffer bli_obj_create_without_buffer -bli_obj_create_without_buffer_check bli_obj_equals bli_obj_free 
-bli_obj_free_check bli_obj_imag_equals bli_obj_imag_is_zero bli_obj_print -bli_obj_print_check bli_obj_scalar_apply_scalar bli_obj_scalar_attach bli_obj_scalar_cast_to @@ -1382,21 +660,16 @@ bli_obj_scalar_has_nonzero_imag bli_obj_scalar_init_detached bli_obj_scalar_init_detached_copy_of bli_obj_scalar_reset -bli_packm_acquire_mpart_l2r -bli_packm_acquire_mpart_t2b -bli_packm_acquire_mpart_tl2br +bli_pack_get_pack_a +bli_pack_get_pack_b +bli_packm_alloc +bli_packm_alloc_ex bli_packm_blk_var1 -bli_packm_blk_var1_md bli_packm_cntl_create_node bli_packm_init -bli_packm_init_check -bli_packm_init_pack -bli_packm_int -bli_packm_int_check -bli_packm_offset_to_panel_for -bli_packm_thrinfo_init -bli_packm_thrinfo_init_single -bli_packm_unb_var1 +bli_packm_scalar +bli_pack_set_pack_a +bli_pack_set_pack_b bli_param_map_blis_to_char_conj bli_param_map_blis_to_char_diag bli_param_map_blis_to_char_dt @@ -1414,33 +687,11 @@ bli_param_map_char_to_blis_dt bli_param_map_char_to_blis_side bli_param_map_char_to_blis_trans bli_param_map_char_to_blis_uplo -bli_param_map_netlib_to_blis_diag -bli_param_map_netlib_to_blis_side -bli_param_map_netlib_to_blis_trans -bli_param_map_netlib_to_blis_uplo -bli_partition_2x2 -bli_pblk_print -bli_pool_alloc_block -bli_pool_checkin_block -bli_pool_checkout_block -bli_pool_finalize -bli_pool_free_block -bli_pool_grow -bli_pool_init -bli_pool_print -bli_pool_reinit -bli_pool_shrink -bli_prime_factorization +bli_pba_query bli_printm -bli_printm_ex -bli_print_msg bli_printv -bli_printv_ex bli_projm -bli_projm_check bli_projv -bli_projv_check -bli_prune_unref_mparts bli_pthread_barrier_destroy bli_pthread_barrier_init bli_pthread_barrier_wait @@ -1457,30 +708,22 @@ bli_pthread_mutex_trylock bli_pthread_mutex_unlock bli_pthread_once bli_randm -bli_randm_check bli_randm_ex -bli_randm_ex_qfp bli_randnm -bli_randnm_check bli_randnm_ex -bli_randnm_ex_qfp bli_randnv -bli_randnv_check bli_randnv_ex -bli_randnv_ex_qfp bli_randv -bli_randv_check bli_randv_ex 
-bli_randv_ex_qfp -bli_rntm_print +bli_rntm_init_from_global +bli_rntm_set_num_threads +bli_rntm_set_ways bli_rntm_set_ways_for_op -bli_rntm_set_ways_from_rntm bli_sabsqsc bli_saddd bli_saddd_ex bli_saddm bli_saddm_ex -bli_saddm_unb_var1 bli_saddsc bli_saddv bli_saddv_ex @@ -1488,7 +731,6 @@ bli_samaxv bli_samaxv_ex bli_sasumv bli_sasumv_ex -bli_sasumv_unb_var1 bli_saxpbyv bli_saxpbyv_ex bli_saxpy2v @@ -1499,65 +741,36 @@ bli_saxpyf bli_saxpyf_ex bli_saxpym bli_saxpym_ex -bli_saxpym_unb_var1 bli_saxpyv bli_saxpyv_ex -bli_sba_acquire -bli_sba_checkin_array -bli_sba_checkout_array -bli_sba_finalize -bli_sba_init -bli_sba_query -bli_sba_release -bli_sba_rntm_set_pool bli_scal2d -bli_scal2d_check bli_scal2d_ex -bli_scal2d_ex_qfp bli_scal2m -bli_scal2m_check bli_scal2m_ex -bli_scal2m_ex_qfp bli_scal2v -bli_scal2v_check bli_scal2v_ex -bli_scal2v_ex_qfp bli_scald -bli_scald_check bli_scald_ex -bli_scald_ex_qfp bli_scalm -bli_scalm_check bli_scalm_ex -bli_scalm_ex_qfp bli_scalv -bli_scalv_check bli_scalv_ex -bli_scalv_ex_qfp bli_sccastm bli_sccastnzm bli_sccastv bli_sccopysc -bli_scgemm_ker_var2_md bli_scopyd bli_scopyd_ex bli_scopym bli_scopym_ex -bli_scopym_unb_var1 bli_scopyv bli_scopyv_ex -bli_scpackm_blk_var1_md -bli_scpackm_cxk_1e_md -bli_scpackm_cxk_1r_md -bli_scpackm_struc_cxk_md bli_scxpbym_md bli_scxpbym_md_ex -bli_scxpbym_md_unb_var1 bli_sdcastm bli_sdcastnzm bli_sdcastv bli_sdcopysc -bli_sdgemm_ker_var2_md bli_sdivsc bli_sdotaxpyv bli_sdotaxpyv_ex @@ -1569,187 +782,107 @@ bli_sdotxf bli_sdotxf_ex bli_sdotxv bli_sdotxv_ex -bli_sdpackm_blk_var1_md -bli_sdpackm_cxk_1e_md -bli_sdpackm_cxk_1r_md -bli_sdpackm_struc_cxk_md bli_sdxpbym_md bli_sdxpbym_md_ex -bli_sdxpbym_md_unb_var1 +bli_seqm +bli_seqsc +bli_seqv bli_setd -bli_setd_check bli_setd_ex -bli_setd_ex_qfp bli_setid -bli_setid_check bli_setid_ex -bli_setid_ex_qfp bli_setijm +bli_setijv bli_setim bli_setiv bli_setm -bli_setm_check bli_setm_ex -bli_setm_ex_qfp bli_setrm bli_setrv bli_setsc -bli_setsc_check 
-bli_setsc_qfp bli_setv -bli_setv_check bli_setv_ex -bli_setv_ex_qfp bli_sfprintm bli_sfprintv bli_sgemm -bli_sgemm1m -bli_sgemm3m1 -bli_sgemm3mh -bli_sgemm4m1 -bli_sgemm4mb -bli_sgemm4mb_ker_var2 -bli_sgemm4mh bli_sgemm_ex -bli_sgemm_ker_var2 -bli_sgemmtrsm_l_ukernel -bli_sgemmtrsm_u_ukernel -bli_sgemm_ukernel +bli_sgemmt +bli_sgemmt_ex bli_sgemv bli_sgemv_ex -bli_sgemv_unb_var1 -bli_sgemv_unb_var2 -bli_sgemv_unf_var1 -bli_sgemv_unf_var2 bli_sger bli_sger_ex -bli_sger_unb_var1 -bli_sger_unb_var2 bli_sgetijm +bli_sgetijv bli_sgetsc bli_shemm -bli_shemm1m -bli_shemm3m1 -bli_shemm3mh -bli_shemm4m1 -bli_shemm4mh bli_shemm_ex bli_shemv bli_shemv_ex -bli_shemv_unb_var1 -bli_shemv_unb_var2 -bli_shemv_unb_var3 -bli_shemv_unb_var4 -bli_shemv_unf_var1 -bli_shemv_unf_var1a -bli_shemv_unf_var3 -bli_shemv_unf_var3a bli_sher bli_sher2 bli_sher2_ex bli_sher2k -bli_sher2k1m -bli_sher2k3m1 -bli_sher2k3mh -bli_sher2k4m1 -bli_sher2k4mh bli_sher2k_ex -bli_sher2_unb_var1 -bli_sher2_unb_var2 -bli_sher2_unb_var3 -bli_sher2_unb_var4 -bli_sher2_unf_var1 -bli_sher2_unf_var4 bli_sher_ex bli_sherk -bli_sherk1m -bli_sherk3m1 -bli_sherk3mh -bli_sherk4m1 -bli_sherk4mh bli_sherk_ex -bli_sherk_l_ker_var2 -bli_sherk_u_ker_var2 -bli_sher_unb_var1 -bli_sher_unb_var2 bli_shiftd -bli_shiftd_check bli_shiftd_ex -bli_shiftd_ex_qfp bli_sinvertd bli_sinvertd_ex bli_sinvertsc bli_sinvertv bli_sinvertv_ex -bli_slamch +bli_sinvscald +bli_sinvscald_ex +bli_sinvscalm +bli_sinvscalm_ex +bli_sinvscalv +bli_sinvscalv_ex bli_sleep bli_smachval bli_smkherm bli_smkherm_ex -bli_smkherm_unb_var1 bli_smksymm bli_smksymm_ex -bli_smksymm_unb_var1 bli_smktrim bli_smktrim_ex -bli_smktrim_unb_var1 bli_smulsc bli_snorm1m bli_snorm1m_ex -bli_snorm1m_unb_var1 bli_snorm1v bli_snorm1v_ex -bli_snorm1v_unb_var1 bli_snormfm bli_snormfm_ex -bli_snormfm_unb_var1 bli_snormfsc bli_snormfv bli_snormfv_ex -bli_snormfv_unb_var1 bli_snormim bli_snormim_ex -bli_snormim_unb_var1 bli_snormiv bli_snormiv_ex -bli_snormiv_unb_var1 
-bli_spackm_blk_var1 -bli_spackm_cxk -bli_spackm_herm_cxk -bli_spackm_struc_cxk -bli_spackm_tri_cxk -bli_spackm_unb_var1 bli_sprintm -bli_sprintm_ex bli_sprintv -bli_sprintv_ex bli_sqrtsc -bli_sqrtsc_check -bli_sqrtsc_qfp bli_srandm bli_srandm_ex -bli_srandm_unb_var1 bli_srandnm bli_srandnm_ex -bli_srandnm_unb_var1 bli_srandnv bli_srandnv_ex -bli_srandnv_unb_var1 bli_srandv bli_srandv_ex -bli_srandv_unb_var1 bli_sscal2d bli_sscal2d_ex bli_sscal2m bli_sscal2m_ex -bli_sscal2m_unb_var1 bli_sscal2v bli_sscal2v_ex bli_sscald bli_sscald_ex bli_sscalm bli_sscalm_ex -bli_sscalm_unb_var1 bli_sscalv bli_sscalv_ex bli_sscastm @@ -1761,42 +894,29 @@ bli_ssetd_ex bli_ssetid bli_ssetid_ex bli_ssetijm +bli_ssetijv bli_ssetm bli_ssetm_ex -bli_ssetm_unb_var1 bli_ssetsc bli_ssetv bli_ssetv_ex -bli_ssgemm_ker_var2_md bli_sshiftd bli_sshiftd_ex -bli_sspackm_blk_var1_md -bli_sspackm_cxk_1e_md -bli_sspackm_cxk_1r_md -bli_sspackm_struc_cxk_md bli_ssqrtsc bli_ssubd bli_ssubd_ex bli_ssubm bli_ssubm_ex -bli_ssubm_unb_var1 bli_ssubsc bli_ssubv bli_ssubv_ex bli_ssumsqv bli_ssumsqv_ex -bli_ssumsqv_unb_var1 bli_sswapv bli_sswapv_ex bli_ssxpbym_md bli_ssxpbym_md_ex -bli_ssxpbym_md_unb_var1 bli_ssymm -bli_ssymm1m -bli_ssymm3m1 -bli_ssymm3mh -bli_ssymm4m1 -bli_ssymm4mh bli_ssymm_ex bli_ssymv bli_ssymv_ex @@ -1804,330 +924,99 @@ bli_ssyr bli_ssyr2 bli_ssyr2_ex bli_ssyr2k -bli_ssyr2k1m -bli_ssyr2k3m1 -bli_ssyr2k3mh -bli_ssyr2k4m1 -bli_ssyr2k4mh bli_ssyr2k_ex bli_ssyr_ex bli_ssyrk -bli_ssyrk1m -bli_ssyrk3m1 -bli_ssyrk3mh -bli_ssyrk4m1 -bli_ssyrk4mh bli_ssyrk_ex -bli_string_mkupper bli_strmm -bli_strmm1m bli_strmm3 -bli_strmm31m -bli_strmm33m1 -bli_strmm33mh -bli_strmm34m1 -bli_strmm34mh bli_strmm3_ex -bli_strmm3m1 -bli_strmm4m1 bli_strmm_ex -bli_strmm_ll_ker_var2 -bli_strmm_lu_ker_var2 -bli_strmm_rl_ker_var2 -bli_strmm_ru_ker_var2 bli_strmv bli_strmv_ex -bli_strmv_unb_var1 -bli_strmv_unb_var2 -bli_strmv_unf_var1 -bli_strmv_unf_var2 bli_strsm -bli_strsm1m -bli_strsm3m1 -bli_strsm4m1 bli_strsm_ex 
-bli_strsm_ll_ker_var2 -bli_strsm_l_ukernel -bli_strsm_lu_ker_var2 -bli_strsm_rl_ker_var2 -bli_strsm_ru_ker_var2 -bli_strsm_u_ukernel bli_strsv bli_strsv_ex -bli_strsv_unb_var1 -bli_strsv_unb_var2 -bli_strsv_unf_var1 -bli_strsv_unf_var2 bli_subd -bli_subd_check bli_subd_ex -bli_subd_ex_qfp bli_subm -bli_subm_check bli_subm_ex -bli_subm_ex_qfp bli_subsc -bli_subsc_check -bli_subsc_qfp bli_subv -bli_subv_check bli_subv_ex -bli_subv_ex_qfp bli_sumsqv -bli_sumsqv_check bli_sumsqv_ex -bli_sumsqv_ex_qfp -bli_sunpackm_blk_var1 -bli_sunpackm_cxk -bli_sunpackm_unb_var1 bli_sunzipsc bli_swapv -bli_swapv_check bli_swapv_ex -bli_swapv_ex_qfp bli_sxpbyd bli_sxpbyd_ex bli_sxpbym bli_sxpbym_ex -bli_sxpbym_unb_var1 bli_sxpbyv bli_sxpbyv_ex bli_symm -bli_symm1m -bli_symm3m1 -bli_symm3mh -bli_symm4m1 -bli_symm4mh -bli_symm_check bli_symm_ex -bli_symm_front -bli_symmind -bli_symmind_get_avail -bli_symmnat bli_symv -bli_symv_check bli_symv_ex -bli_symv_ex_qfp bli_syr bli_syr2 -bli_syr2_check bli_syr2_ex -bli_syr2_ex_qfp bli_syr2k -bli_syr2k1m -bli_syr2k3m1 -bli_syr2k3mh -bli_syr2k4m1 -bli_syr2k4mh -bli_syr2k_check bli_syr2k_ex -bli_syr2k_front -bli_syr2kind -bli_syr2kind_get_avail -bli_syr2knat -bli_syr_check bli_syr_ex -bli_syr_ex_qfp bli_syrk -bli_syrk1m -bli_syrk3m1 -bli_syrk3mh -bli_syrk4m1 -bli_syrk4mh -bli_syrk_check bli_syrk_ex -bli_syrk_front -bli_syrkind -bli_syrkind_get_avail -bli_syrknat bli_szcastm bli_szcastnzm bli_szcastv bli_szcopysc -bli_szgemm_ker_var2_md bli_szipsc -bli_szpackm_blk_var1_md -bli_szpackm_cxk_1e_md -bli_szpackm_cxk_1r_md -bli_szpackm_struc_cxk_md bli_szxpbym_md bli_szxpbym_md_ex -bli_szxpbym_md_unb_var1 bli_thrcomm_barrier -bli_thrcomm_barrier_atomic bli_thrcomm_bcast -bli_thrcomm_cleanup -bli_thrcomm_create -bli_thrcomm_free -bli_thrcomm_init -bli_thread_finalize -bli_thread_get_env bli_thread_get_ic_nt bli_thread_get_ir_nt bli_thread_get_jc_nt bli_thread_get_jr_nt bli_thread_get_num_threads bli_thread_get_pc_nt -bli_thread_init -bli_thread_init_rntm 
-bli_thread_init_rntm_from_env -bli_thread_range_b2t -bli_thread_range_l2r -bli_thread_range_mdim -bli_thread_range_ndim -bli_thread_range_r2l +bli_thread_get_thread_impl +bli_thread_get_thread_impl_str bli_thread_range_sub -bli_thread_range_t2b -bli_thread_range_weighted_b2t -bli_thread_range_weighted_l2r -bli_thread_range_weighted_r2l -bli_thread_range_weighted_sub -bli_thread_range_weighted_t2b -bli_thread_range_width_l bli_thread_set_num_threads bli_thread_set_num_threads_ +bli_thread_set_thread_impl bli_thread_set_ways bli_thread_set_ways_ -bli_thrinfo_create -bli_thrinfo_create_for_cntl -bli_thrinfo_create_for_cntl_prenode bli_thrinfo_free -bli_thrinfo_grow -bli_thrinfo_init -bli_thrinfo_init_single -bli_thrinfo_rgrow -bli_thrinfo_rgrow_prenode bli_trmm -bli_trmm1m bli_trmm3 -bli_trmm31m -bli_trmm33m1 -bli_trmm33mh -bli_trmm34m1 -bli_trmm34mh bli_trmm3_ex -bli_trmm3_front -bli_trmm3ind -bli_trmm3ind_get_avail -bli_trmm3m1 -bli_trmm3nat -bli_trmm4m1 -bli_trmm_check -bli_trmm_determine_kc -bli_trmm_determine_kc_b -bli_trmm_determine_kc_f -bli_trmm_direct bli_trmm_ex -bli_trmm_front -bli_trmmind -bli_trmmind_get_avail -bli_trmm_ll_ker_var2 -bli_trmm_lu_ker_var2 -bli_trmmnat -bli_trmm_prune_unref_mparts_k -bli_trmm_prune_unref_mparts_m -bli_trmm_prune_unref_mparts_n -bli_trmm_rl_ker_var2 -bli_trmm_ru_ker_var2 -bli_trmm_xx_ker_var2 bli_trmv -bli_trmv_check bli_trmv_ex -bli_trmv_ex_qfp -bli_trmv_unb_var1 -bli_trmv_unb_var1_qfp -bli_trmv_unb_var2 -bli_trmv_unb_var2_qfp -bli_trmv_unf_var1 -bli_trmv_unf_var1_qfp -bli_trmv_unf_var2 -bli_trmv_unf_var2_qfp bli_trsm -bli_trsm1m -bli_trsm3m1 -bli_trsm4m1 -bli_trsm_blk_var1 -bli_trsm_blk_var2 -bli_trsm_blk_var3 -bli_trsm_check -bli_trsm_cntl_create -bli_trsm_cntl_create_node -bli_trsm_cntl_free -bli_trsm_determine_kc -bli_trsm_determine_kc_b -bli_trsm_determine_kc_f -bli_trsm_direct bli_trsm_ex -bli_trsm_front -bli_trsmind -bli_trsmind_get_avail -bli_trsm_int -bli_trsm_l_cntl_create -bli_trsm_ll_ker_var2 
-bli_trsm_l_ukernel_qfp -bli_trsm_lu_ker_var2 -bli_trsmnat -bli_trsm_packa -bli_trsm_packb -bli_trsm_prune_unref_mparts_k -bli_trsm_prune_unref_mparts_m -bli_trsm_prune_unref_mparts_n -bli_trsm_r_cntl_create -bli_trsm_rl_ker_var2 -bli_trsm_ru_ker_var2 bli_trsm_ukernel -bli_trsm_u_ukernel_qfp -bli_trsm_xx_ker_var2 bli_trsv -bli_trsv_check bli_trsv_ex -bli_trsv_ex_qfp -bli_trsv_unb_var1 -bli_trsv_unb_var1_qfp -bli_trsv_unb_var2 -bli_trsv_unb_var2_qfp -bli_trsv_unf_var1 -bli_trsv_unf_var1_qfp -bli_trsv_unf_var2 -bli_trsv_unf_var2_qfp -bli_unpackm_blk_var1 -bli_unpackm_cntl_create_node -bli_unpackm_int -bli_unpackm_int_check -bli_unpackm_unb_var1 bli_unzipsc -bli_unzipsc_check -bli_unzipsc_qfp -bli_utilm_fprint_check -bli_utilm_mkhst_check -bli_utilm_norm_check -bli_utilm_rand_check -bli_utilv_norm_check -bli_utilv_sumsqv_check -bli_utilv_xa_check bli_xpbyd -bli_xpbyd_check bli_xpbyd_ex -bli_xpbyd_ex_qfp bli_xpbym -bli_xpbym_check bli_xpbym_ex -bli_xpbym_ex_qfp bli_xpbym_md bli_xpbym_md_ex -bli_xpbym_md_ex_qfp2 bli_xpbyv -bli_xpbyv_check bli_xpbyv_ex -bli_xpbyv_ex_qfp -bli_xxmv_check -bli_xxr_check bli_zabsqsc bli_zaddd bli_zaddd_ex bli_zaddm bli_zaddm_ex -bli_zaddm_unb_var1 bli_zaddsc bli_zaddv bli_zaddv_ex @@ -2135,7 +1024,6 @@ bli_zamaxv bli_zamaxv_ex bli_zasumv bli_zasumv_ex -bli_zasumv_unb_var1 bli_zaxpbyv bli_zaxpbyv_ex bli_zaxpy2v @@ -2146,33 +1034,24 @@ bli_zaxpyf bli_zaxpyf_ex bli_zaxpym bli_zaxpym_ex -bli_zaxpym_unb_var1 bli_zaxpyv bli_zaxpyv_ex bli_zccastm bli_zccastnzm bli_zccastv bli_zccopysc -bli_zcgemm_ker_var2_md bli_zcopyd bli_zcopyd_ex bli_zcopym bli_zcopym_ex -bli_zcopym_unb_var1 bli_zcopyv bli_zcopyv_ex -bli_zcpackm_blk_var1_md -bli_zcpackm_cxk_1e_md -bli_zcpackm_cxk_1r_md -bli_zcpackm_struc_cxk_md bli_zcxpbym_md bli_zcxpbym_md_ex -bli_zcxpbym_md_unb_var1 bli_zdcastm bli_zdcastnzm bli_zdcastv bli_zdcopysc -bli_zdgemm_ker_var2_md bli_zdivsc bli_zdotaxpyv bli_zdotaxpyv_ex @@ -2184,174 +1063,89 @@ bli_zdotxf bli_zdotxf_ex bli_zdotxv bli_zdotxv_ex 
-bli_zdpackm_blk_var1_md -bli_zdpackm_cxk_1e_md -bli_zdpackm_cxk_1r_md -bli_zdpackm_struc_cxk_md bli_zdxpbym_md bli_zdxpbym_md_ex -bli_zdxpbym_md_unb_var1 +bli_zeqm +bli_zeqsc +bli_zeqv bli_zfprintm bli_zfprintv bli_zgemm -bli_zgemm1m -bli_zgemm3m1 -bli_zgemm3mh -bli_zgemm4m1 -bli_zgemm4mb -bli_zgemm4mb_ker_var2 -bli_zgemm4mh bli_zgemm_ex -bli_zgemm_ker_var2 -bli_zgemm_md_c2r_ref -bli_zgemmtrsm_l_ukernel -bli_zgemmtrsm_u_ukernel -bli_zgemm_ukernel +bli_zgemmt +bli_zgemmt_ex bli_zgemv bli_zgemv_ex -bli_zgemv_unb_var1 -bli_zgemv_unb_var2 -bli_zgemv_unf_var1 -bli_zgemv_unf_var2 bli_zger bli_zger_ex -bli_zger_unb_var1 -bli_zger_unb_var2 bli_zgetijm +bli_zgetijv bli_zgetsc bli_zhemm -bli_zhemm1m -bli_zhemm3m1 -bli_zhemm3mh -bli_zhemm4m1 -bli_zhemm4mh bli_zhemm_ex bli_zhemv bli_zhemv_ex -bli_zhemv_unb_var1 -bli_zhemv_unb_var2 -bli_zhemv_unb_var3 -bli_zhemv_unb_var4 -bli_zhemv_unf_var1 -bli_zhemv_unf_var1a -bli_zhemv_unf_var3 -bli_zhemv_unf_var3a bli_zher bli_zher2 bli_zher2_ex bli_zher2k -bli_zher2k1m -bli_zher2k3m1 -bli_zher2k3mh -bli_zher2k4m1 -bli_zher2k4mh bli_zher2k_ex -bli_zher2_unb_var1 -bli_zher2_unb_var2 -bli_zher2_unb_var3 -bli_zher2_unb_var4 -bli_zher2_unf_var1 -bli_zher2_unf_var4 bli_zher_ex bli_zherk -bli_zherk1m -bli_zherk3m1 -bli_zherk3mh -bli_zherk4m1 -bli_zherk4mh bli_zherk_ex -bli_zherk_l_ker_var2 -bli_zherk_u_ker_var2 -bli_zher_unb_var1 -bli_zher_unb_var2 bli_zinvertd bli_zinvertd_ex bli_zinvertsc bli_zinvertv bli_zinvertv_ex +bli_zinvscald +bli_zinvscald_ex +bli_zinvscalm +bli_zinvscalm_ex +bli_zinvscalv +bli_zinvscalv_ex bli_zipsc -bli_zipsc_check -bli_zipsc_qfp bli_zmachval bli_zmkherm bli_zmkherm_ex -bli_zmkherm_unb_var1 bli_zmksymm bli_zmksymm_ex -bli_zmksymm_unb_var1 bli_zmktrim bli_zmktrim_ex -bli_zmktrim_unb_var1 bli_zmulsc bli_znorm1m bli_znorm1m_ex -bli_znorm1m_unb_var1 bli_znorm1v bli_znorm1v_ex -bli_znorm1v_unb_var1 bli_znormfm bli_znormfm_ex -bli_znormfm_unb_var1 bli_znormfsc bli_znormfv bli_znormfv_ex -bli_znormfv_unb_var1 bli_znormim 
bli_znormim_ex -bli_znormim_unb_var1 bli_znormiv bli_znormiv_ex -bli_znormiv_unb_var1 -bli_zpackm_blk_var1 -bli_zpackm_cxk -bli_zpackm_cxk_1er -bli_zpackm_cxk_3mis -bli_zpackm_cxk_4mi -bli_zpackm_cxk_rih -bli_zpackm_herm_cxk -bli_zpackm_herm_cxk_1er -bli_zpackm_herm_cxk_3mis -bli_zpackm_herm_cxk_4mi -bli_zpackm_herm_cxk_rih -bli_zpackm_struc_cxk -bli_zpackm_struc_cxk_1er -bli_zpackm_struc_cxk_3mis -bli_zpackm_struc_cxk_4mi -bli_zpackm_struc_cxk_rih -bli_zpackm_tri_cxk -bli_zpackm_tri_cxk_1er -bli_zpackm_tri_cxk_3mis -bli_zpackm_tri_cxk_4mi -bli_zpackm_tri_cxk_rih -bli_zpackm_unb_var1 bli_zprintm -bli_zprintm_ex bli_zprintv -bli_zprintv_ex bli_zrandm bli_zrandm_ex -bli_zrandm_unb_var1 bli_zrandnm bli_zrandnm_ex -bli_zrandnm_unb_var1 bli_zrandnv bli_zrandnv_ex -bli_zrandnv_unb_var1 bli_zrandv bli_zrandv_ex -bli_zrandv_unb_var1 bli_zscal2d bli_zscal2d_ex bli_zscal2m bli_zscal2m_ex -bli_zscal2m_unb_var1 bli_zscal2v bli_zscal2v_ex bli_zscald bli_zscald_ex bli_zscalm bli_zscalm_ex -bli_zscalm_unb_var1 bli_zscalv bli_zscalv_ex bli_zscastm @@ -2363,42 +1157,29 @@ bli_zsetd_ex bli_zsetid bli_zsetid_ex bli_zsetijm +bli_zsetijv bli_zsetm bli_zsetm_ex -bli_zsetm_unb_var1 bli_zsetsc bli_zsetv bli_zsetv_ex -bli_zsgemm_ker_var2_md bli_zshiftd bli_zshiftd_ex -bli_zspackm_blk_var1_md -bli_zspackm_cxk_1e_md -bli_zspackm_cxk_1r_md -bli_zspackm_struc_cxk_md bli_zsqrtsc bli_zsubd bli_zsubd_ex bli_zsubm bli_zsubm_ex -bli_zsubm_unb_var1 bli_zsubsc bli_zsubv bli_zsubv_ex bli_zsumsqv bli_zsumsqv_ex -bli_zsumsqv_unb_var1 bli_zswapv bli_zswapv_ex bli_zsxpbym_md bli_zsxpbym_md_ex -bli_zsxpbym_md_unb_var1 bli_zsymm -bli_zsymm1m -bli_zsymm3m1 -bli_zsymm3mh -bli_zsymm4m1 -bli_zsymm4mh bli_zsymm_ex bli_zsymv bli_zsymv_ex @@ -2406,85 +1187,37 @@ bli_zsyr bli_zsyr2 bli_zsyr2_ex bli_zsyr2k -bli_zsyr2k1m -bli_zsyr2k3m1 -bli_zsyr2k3mh -bli_zsyr2k4m1 -bli_zsyr2k4mh bli_zsyr2k_ex bli_zsyr_ex bli_zsyrk -bli_zsyrk1m -bli_zsyrk3m1 -bli_zsyrk3mh -bli_zsyrk4m1 -bli_zsyrk4mh bli_zsyrk_ex bli_ztrmm 
-bli_ztrmm1m bli_ztrmm3 -bli_ztrmm31m -bli_ztrmm33m1 -bli_ztrmm33mh -bli_ztrmm34m1 -bli_ztrmm34mh bli_ztrmm3_ex -bli_ztrmm3m1 -bli_ztrmm4m1 bli_ztrmm_ex -bli_ztrmm_ll_ker_var2 -bli_ztrmm_lu_ker_var2 -bli_ztrmm_rl_ker_var2 -bli_ztrmm_ru_ker_var2 bli_ztrmv bli_ztrmv_ex -bli_ztrmv_unb_var1 -bli_ztrmv_unb_var2 -bli_ztrmv_unf_var1 -bli_ztrmv_unf_var2 bli_ztrsm -bli_ztrsm1m -bli_ztrsm3m1 -bli_ztrsm4m1 bli_ztrsm_ex -bli_ztrsm_ll_ker_var2 -bli_ztrsm_l_ukernel -bli_ztrsm_lu_ker_var2 -bli_ztrsm_rl_ker_var2 -bli_ztrsm_ru_ker_var2 -bli_ztrsm_u_ukernel bli_ztrsv bli_ztrsv_ex -bli_ztrsv_unb_var1 -bli_ztrsv_unb_var2 -bli_ztrsv_unf_var1 -bli_ztrsv_unf_var2 -bli_zunpackm_blk_var1 -bli_zunpackm_cxk -bli_zunpackm_unb_var1 bli_zunzipsc bli_zxpbyd bli_zxpbyd_ex bli_zxpbym bli_zxpbym_ex -bli_zxpbym_unb_var1 bli_zxpbyv bli_zxpbyv_ex bli_zzcastm bli_zzcastnzm bli_zzcastv bli_zzcopysc -bli_zzgemm_ker_var2_md bli_zzipsc -bli_zzpackm_blk_var1_md -bli_zzpackm_cxk_1e_md -bli_zzpackm_cxk_1r_md -bli_zzpackm_struc_cxk_md bli_zzxpbym_md bli_zzxpbym_md_ex -bli_zzxpbym_md_unb_var1 sasum_ sasumsub_ +saxpby_ saxpy_ scabs1_ scasum_ @@ -2498,6 +1231,8 @@ sdsdot_ sdsdotsub_ sgbmv_ sgemm_ +sgemm_batch_ +sgemmt_ sgemv_ sger_ snrm2_ @@ -2528,6 +1263,7 @@ strsm_ strsv_ dasum_ dasumsub_ +daxpby_ daxpy_ dcabs1_ dcopy_ @@ -2535,6 +1271,8 @@ ddot_ ddotsub_ dgbmv_ dgemm_ +dgemm_batch_ +dgemmt_ dgemv_ dger_ dnrm2_ @@ -2569,6 +1307,7 @@ dzasum_ dzasumsub_ dznrm2_ dznrm2sub_ +caxpby_ caxpy_ ccopy_ cdotc_ @@ -2577,6 +1316,9 @@ cdotu_ cdotusub_ cgbmv_ cgemm_ +cgemm3m_ +cgemm_batch_ +cgemmt_ cgemv_ cgerc_ cgeru_ @@ -2606,6 +1348,7 @@ ctrmm_ ctrmv_ ctrsm_ ctrsv_ +zaxpby_ zaxpy_ zcopy_ zdotc_ @@ -2616,6 +1359,9 @@ zdrot_ zdscal_ zgbmv_ zgemm_ +zgemm3m_ +zgemm_batch_ +zgemmt_ zgemv_ zgerc_ zgeru_ @@ -2651,12 +1397,16 @@ isamax_ isamaxsub_ izamax_ izamaxsub_ +cblas_caxpby cblas_caxpy cblas_ccopy cblas_cdotc_sub cblas_cdotu_sub cblas_cgbmv cblas_cgemm +cblas_cgemm3m +cblas_cgemm_batch +cblas_cgemmt cblas_cgemv cblas_cgerc 
cblas_cgeru @@ -2685,11 +1435,14 @@ cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_dasum +cblas_daxpby cblas_daxpy cblas_dcopy cblas_ddot cblas_dgbmv cblas_dgemm +cblas_dgemm_batch +cblas_dgemmt cblas_dgemv cblas_dger cblas_dnrm2 @@ -2725,6 +1478,7 @@ cblas_idamax cblas_isamax cblas_izamax cblas_sasum +cblas_saxpby cblas_saxpy cblas_scasum cblas_scnrm2 @@ -2733,6 +1487,8 @@ cblas_sdot cblas_sdsdot cblas_sgbmv cblas_sgemm +cblas_sgemm_batch +cblas_sgemmt cblas_sgemv cblas_sger cblas_snrm2 @@ -2761,6 +1517,7 @@ cblas_strmv cblas_strsm cblas_strsv cblas_xerbla +cblas_zaxpby cblas_zaxpy cblas_zcopy cblas_zdotc_sub @@ -2768,6 +1525,9 @@ cblas_zdotu_sub cblas_zdscal cblas_zgbmv cblas_zgemm +cblas_zgemm3m +cblas_zgemm_batch +cblas_zgemmt cblas_zgemv cblas_zgerc cblas_zgeru diff --git a/frame/3/bli_l3_cntl.h b/frame/3/bli_l3_cntl.h index cbee0e7f78..337f82c319 100644 --- a/frame/3/bli_l3_cntl.h +++ b/frame/3/bli_l3_cntl.h @@ -51,6 +51,7 @@ void bli_l3_cntl_create_if cntl_t** cntl_use ); +BLIS_EXPORT_BLIS void bli_l3_cntl_free ( pool_t* pool, diff --git a/frame/3/bli_l3_thrinfo.h b/frame/3/bli_l3_thrinfo.h index 0c00662466..35e26ec01b 100644 --- a/frame/3/bli_l3_thrinfo.h +++ b/frame/3/bli_l3_thrinfo.h @@ -70,6 +70,7 @@ // ----------------------------------------------------------------------------- +BLIS_EXPORT_BLIS thrinfo_t* bli_l3_thrinfo_create ( dim_t id, diff --git a/frame/thread/bli_thrinfo.h b/frame/thread/bli_thrinfo.h index 70ece0764b..d7d8190bce 100644 --- a/frame/thread/bli_thrinfo.h +++ b/frame/thread/bli_thrinfo.h @@ -226,6 +226,7 @@ thrinfo_t* bli_thrinfo_create pba_t* pba ); +BLIS_EXPORT_BLIS void bli_thrinfo_free ( thrinfo_t* thread From 32895533ed15e5ece4d3aa45f3a1dde4b8c503e2 Mon Sep 17 00:00:00 2001 From: Devin Matthews Date: Wed, 5 Oct 2022 21:37:29 -0500 Subject: [PATCH 32/32] Fix the gemmlike sandbox. 
--- sandbox/gemmlike/bli_gemm_ex.c | 2 +- sandbox/gemmlike/bls_gemm.c | 15 +-- sandbox/gemmlike/bls_gemm.h | 2 +- sandbox/gemmlike/bls_gemm_bp_var1.c | 92 ++----------------- sandbox/gemmlike/bls_l3_packm_a.c | 61 +++--------- sandbox/gemmlike/bls_l3_packm_a.h | 21 ----- sandbox/gemmlike/bls_l3_packm_b.c | 61 +++--------- sandbox/gemmlike/bls_l3_packm_b.h | 21 ----- sandbox/gemmlike/bls_l3_packm_var.h | 2 +- sandbox/gemmlike/bls_l3_packm_var1.c | 11 +-- sandbox/gemmlike/bls_l3_packm_var2.c | 11 +-- sandbox/gemmlike/bls_l3_packm_var3.c | 11 +-- sandbox/gemmlike/thread/bls_l3_decor_openmp.c | 35 ++----- .../gemmlike/thread/bls_l3_decor_pthreads.c | 36 ++------ sandbox/gemmlike/thread/bls_l3_decor_single.c | 50 ++-------- 15 files changed, 75 insertions(+), 356 deletions(-) diff --git a/sandbox/gemmlike/bli_gemm_ex.c b/sandbox/gemmlike/bli_gemm_ex.c index fe220e6031..00d4448c6b 100644 --- a/sandbox/gemmlike/bli_gemm_ex.c +++ b/sandbox/gemmlike/bli_gemm_ex.c @@ -52,7 +52,7 @@ void bli_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); diff --git a/sandbox/gemmlike/bls_gemm.c b/sandbox/gemmlike/bls_gemm.c index 1e567a114b..5ce2dcc2f1 100644 --- a/sandbox/gemmlike/bls_gemm.c +++ b/sandbox/gemmlike/bls_gemm.c @@ -33,6 +33,7 @@ */ #include "blis.h" +#include "thread/bls_l3_decor.h" // // -- Define the gemm-like operation's object API ------------------------------ @@ -67,7 +68,7 @@ void bls_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ) { bli_init_once(); @@ -75,8 +76,8 @@ void bls_gemm_ex // Initialize a local runtime with global settings if necessary. Note // that in the case that a runtime is passed in, we make a local copy. 
rntm_t rntm_l; - if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); rntm = &rntm_l; } - else { rntm_l = *rntm; rntm = &rntm_l; } + if ( rntm == NULL ) { bli_rntm_init_from_global( &rntm_l ); } + else { rntm_l = *rntm; } // Set the .pack_a and .pack_b fields to TRUE. This is only needed because // this sandbox uses bli_thrinfo_sup_grow(), which calls @@ -87,8 +88,8 @@ void bls_gemm_ex // while this sandbox implementation executes (and it also reinforces the // fact that we *are* indeed packing A and B, albeit not in the sup context // originally envisioned for the .pack_a and .pack_b fields). - bli_rntm_set_pack_a( TRUE, rntm ); - bli_rntm_set_pack_b( TRUE, rntm ); + bli_rntm_set_pack_a( TRUE, &rntm_l ); + bli_rntm_set_pack_b( TRUE, &rntm_l ); // Obtain a valid (native) context from the gks if necessary. // NOTE: This must be done before calling the _check() function, since @@ -166,7 +167,7 @@ void bls_gemm_ex bli_obj_length( &c_local ), bli_obj_width( &c_local ), bli_obj_width( &a_local ), - rntm + &rntm_l ); // Spawn threads (if applicable), where bls_gemm_int() is the thread entry @@ -182,7 +183,7 @@ void bls_gemm_ex ( obj_t* )beta, ( obj_t* )&c_local, ( cntx_t* )cntx, - rntm + &rntm_l ); } diff --git a/sandbox/gemmlike/bls_gemm.h b/sandbox/gemmlike/bls_gemm.h index d01c6647ee..7380f02add 100644 --- a/sandbox/gemmlike/bls_gemm.h +++ b/sandbox/gemmlike/bls_gemm.h @@ -53,7 +53,7 @@ void bls_gemm_ex const obj_t* beta, const obj_t* c, const cntx_t* cntx, - rntm_t* rntm + const rntm_t* rntm ); // diff --git a/sandbox/gemmlike/bls_gemm_bp_var1.c b/sandbox/gemmlike/bls_gemm_bp_var1.c index 1e3e5ea03f..dac38bab0b 100644 --- a/sandbox/gemmlike/bls_gemm_bp_var1.c +++ b/sandbox/gemmlike/bls_gemm_bp_var1.c @@ -186,42 +186,13 @@ void PASTECH2(bls_,ch,varname) \ \ auxinfo_t aux; \ \ - /* Initialize a mem_t entry for A and B. Strictly speaking, this is only - needed for the matrix we will be packing (if any), but we do it - unconditionally to be safe. 
*/ \ - mem_t mem_a = BLIS_MEM_INITIALIZER; \ - mem_t mem_b = BLIS_MEM_INITIALIZER; \ -\ - /* Define an array of bszid_t ids, which will act as our substitute for - the cntl_t tree. */ \ - bszid_t bszids[8] = { BLIS_NC, /* 5th loop */ \ - BLIS_KC, /* 4th loop */ \ - BLIS_NO_PART, /* pack B */ \ - BLIS_MC, /* 3rd loop */ \ - BLIS_NO_PART, /* pack A */ \ - BLIS_NR, /* 2nd loop */ \ - BLIS_MR, /* 1st loop */ \ - BLIS_KR }; /* microkernel loop */ \ -\ - bszid_t* restrict bszids_jc = &bszids[0]; \ - bszid_t* restrict bszids_pc = &bszids[1]; \ - /*bszid_t* restrict bszids_pb = &bszids[2];*/ \ - bszid_t* restrict bszids_ic = &bszids[3]; \ - /*bszid_t* restrict bszids_pa = &bszids[4];*/ \ - bszid_t* restrict bszids_jr = &bszids[5]; \ - /*bszid_t* restrict bszids_ir = &bszids[6];*/ \ -\ - thrinfo_t* restrict thread_jc = NULL; \ - thrinfo_t* restrict thread_pc = NULL; \ - thrinfo_t* restrict thread_pb = NULL; \ - thrinfo_t* restrict thread_ic = NULL; \ - thrinfo_t* restrict thread_pa = NULL; \ - thrinfo_t* restrict thread_jr = NULL; \ - thrinfo_t* restrict thread_ir = NULL; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_jc = thread; \ - bli_thrinfo_sup_grow( rntm, bszids_jc, thread_jc ); \ + thrinfo_t* restrict thread_jc = thread; \ + thrinfo_t* restrict thread_pc = bli_thrinfo_sub_node( thread_jc ); \ + thrinfo_t* restrict thread_pb = bli_thrinfo_sub_node( thread_pc ); \ + thrinfo_t* restrict thread_ic = bli_thrinfo_sub_node( thread_pb ); \ + thrinfo_t* restrict thread_pa = bli_thrinfo_sub_node( thread_ic ); \ + thrinfo_t* restrict thread_jr = bli_thrinfo_sub_node( thread_pa ); \ + thrinfo_t* restrict thread_ir = bli_thrinfo_sub_node( thread_jr ); \ \ /* Compute the JC loop thread range for the current thread. 
*/ \ dim_t jc_start, jc_end; \ @@ -240,10 +211,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* restrict b_jc = b_00 + jj * jcstep_b; \ ctype* restrict c_jc = c_00 + jj * jcstep_c; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_pc = bli_thrinfo_sub_node( thread_jc ); \ - bli_thrinfo_sup_grow( rntm, bszids_pc, thread_pc ); \ \ /* Compute the PC loop thread range for the current thread. */ \ const dim_t pc_start = 0, pc_end = k; \ @@ -267,14 +234,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* b_use; \ inc_t rs_b_use, cs_b_use, ps_b_use; \ -\ - /* Identify the current thrinfo_t node. Note that the thrinfo_t - node will have already been created by a previous call to - bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART - cause the tree to grow by two (e.g. to the next bszid that is - a normal bszid_t value). */ \ - thread_pb = bli_thrinfo_sub_node( thread_pc ); \ - /*bli_thrinfo_sup_grow( rntm, bszids_pb, thread_pb );*/ \ \ /* Determine the packing buffer and related parameters for matrix B. Then call the packm implementation. */ \ @@ -288,18 +247,12 @@ void PASTECH2(bls_,ch,varname) \ &b_use, &rs_b_use, &cs_b_use, \ &ps_b_use, \ cntx, \ - rntm, \ - &mem_b, \ thread_pb \ ); \ \ /* Alias b_use so that it's clear this is our current block of matrix B. */ \ ctype* restrict b_pc_use = b_use; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_ic = bli_thrinfo_sub_node( thread_pb ); \ - bli_thrinfo_sup_grow( rntm, bszids_ic, thread_ic ); \ \ /* Compute the IC loop thread range for the current thread. */ \ dim_t ic_start, ic_end; \ @@ -321,14 +274,6 @@ void PASTECH2(bls_,ch,varname) \ \ ctype* a_use; \ inc_t rs_a_use, cs_a_use, ps_a_use; \ -\ - /* Identify the current thrinfo_t node. Note that the thrinfo_t - node will have already been created by a previous call to - bli_thrinfo_sup_grow() since bszid_t values of BLIS_NO_PART - cause the tree to grow by two (e.g. 
to the next bszid that is - a normal bszid_t value). */ \ - thread_pa = bli_thrinfo_sub_node( thread_ic ); \ - /*bli_thrinfo_sup_grow( rntm, bszids_pa, thread_pa );*/ \ \ /* Determine the packing buffer and related parameters for matrix A. Then call the packm implementation. */ \ @@ -342,18 +287,12 @@ void PASTECH2(bls_,ch,varname) \ &a_use, &rs_a_use, &cs_a_use, \ &ps_a_use, \ cntx, \ - rntm, \ - &mem_a, \ thread_pa \ ); \ \ /* Alias a_use so that it's clear this is our current block of matrix A. */ \ ctype* restrict a_ic_use = a_use; \ -\ - /* Identify the current thrinfo_t node and then grow the tree. */ \ - thread_jr = bli_thrinfo_sub_node( thread_pa ); \ - bli_thrinfo_sup_grow( rntm, bszids_jr, thread_jr ); \ \ /* Query the number of threads and thread ids for the JR loop. NOTE: These values are only needed when computing the next @@ -381,9 +320,6 @@ void PASTECH2(bls_,ch,varname) \ /* Assume for now that our next panel of B to be the current panel of B. */ \ ctype* restrict b2 = b_jr; \ -\ - /* Identify the current thrinfo_t node. */ \ - thread_ir = bli_thrinfo_sub_node( thread_jr ); \ \ /* Query the number of threads and thread ids for the IR loop. NOTE: These values are only needed when computing the next @@ -449,20 +385,6 @@ void PASTECH2(bls_,ch,varname) \ bli_thread_barrier( thread_pb ); \ } \ } \ -\ - /* Release any memory that was acquired for packing matrices A and B. 
*/ \ - PASTECH2(bls_,ch,packm_finalize_mem_a) \ - ( \ - rntm, \ - &mem_a, \ - thread_pa \ - ); \ - PASTECH2(bls_,ch,packm_finalize_mem_b) \ - ( \ - rntm, \ - &mem_b, \ - thread_pb \ - ); \ \ /* PASTEMAC(ch,fprintm)( stdout, "gemm_bp_var1: a1_packed", mr_cur, kc_cur, a_ir, rs_a_use, cs_a_use, "%5.2f", "" ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_a.c b/sandbox/gemmlike/bls_l3_packm_a.c index 97c437f980..326f83b0cd 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.c +++ b/sandbox/gemmlike/bls_l3_packm_a.c @@ -43,8 +43,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -65,12 +63,14 @@ void PASTECH2(bls_,ch,opname) \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * m_pack * k_pack; \ +\ + mem_t* mem = bli_thread_mem( thread ); \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, then we need to acquire a block from the packed block allocator. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thread_am_chief( thread ) ) \ { \ /* Acquire directly to the chief thread's mem_t that was passed in. It needs to be that mem_t struct, and not a local (temporary) @@ -81,7 +81,7 @@ void PASTECH2(bls_,ch,opname) \ again, I prefer to keep barriers to a minimum.) */ \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thread_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -90,13 +90,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) 
*/ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thread_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -115,7 +115,7 @@ void PASTECH2(bls_,ch,opname) \ \ if ( mem_size < size_needed ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thread_am_chief( thread ) ) \ { \ /* The chief thread releases the existing block associated with the mem_t, and then re-acquires a new block, saving @@ -125,12 +125,12 @@ void PASTECH2(bls_,ch,opname) \ (temporary) mem_t. */ \ bli_pba_release \ ( \ - rntm, \ + bli_thread_pba( thread ), \ mem \ ); \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thread_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -139,13 +139,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thread_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -165,39 +165,6 @@ GENTFUNC( scomplex, c, packm_init_mem_a ) GENTFUNC( dcomplex, z, packm_init_mem_a ) -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ) \ -{ \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. 
*/ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ -} - -//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_a ) -GENTFUNC( float, s, packm_finalize_mem_a ) -GENTFUNC( double, d, packm_finalize_mem_a ) -GENTFUNC( scomplex, c, packm_finalize_mem_a ) -GENTFUNC( dcomplex, z, packm_finalize_mem_a ) - - #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ @@ -267,8 +234,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -282,8 +247,6 @@ void PASTECH2(bls_,ch,opname) \ ( \ m_alloc, k_alloc, mr, \ cntx, \ - rntm, \ - mem, \ thread \ ); \ \ @@ -295,7 +258,7 @@ void PASTECH2(bls_,ch,opname) \ &m_max, &k_max, \ p, rs_p, cs_p, \ &pd_p, ps_p, \ - mem \ + bli_thread_mem( thread ) \ ); \ \ /* Pack matrix A to the destination buffer chosen above. Here, the packed diff --git a/sandbox/gemmlike/bls_l3_packm_a.h b/sandbox/gemmlike/bls_l3_packm_a.h index 201a24efae..2ab53dcbf2 100644 --- a/sandbox/gemmlike/bls_l3_packm_a.h +++ b/sandbox/gemmlike/bls_l3_packm_a.h @@ -41,8 +41,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t k, \ dim_t mr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ @@ -53,23 +51,6 @@ GENTPROT( scomplex, c, packm_init_mem_a ) GENTPROT( dcomplex, z, packm_init_mem_a ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -//INSERT_GENTPROT_BASIC0( packm_finalize_mem_a ) -GENTPROT( float, s, packm_finalize_mem_a ) -GENTPROT( double, d, packm_finalize_mem_a ) -GENTPROT( scomplex, c, packm_finalize_mem_a ) -GENTPROT( dcomplex, z, packm_finalize_mem_a ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ @@ -109,8 +90,6 @@ 
void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_b.c b/sandbox/gemmlike/bls_l3_packm_b.c index 37dbe22191..4ebe1062ba 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.c +++ b/sandbox/gemmlike/bls_l3_packm_b.c @@ -43,8 +43,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -65,12 +63,14 @@ void PASTECH2(bls_,ch,opname) \ \ /* Compute the size of the memory block eneded. */ \ siz_t size_needed = sizeof( ctype ) * k_pack * n_pack; \ +\ + mem_t* mem = bli_thread_mem( thread ); \ \ /* Check the mem_t entry provided by the caller. If it is unallocated, then we need to acquire a block from the packed block allocator. */ \ if ( bli_mem_is_unalloc( mem ) ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thread_am_chief( thread ) ) \ { \ /* Acquire directly to the chief thread's mem_t that was passed in. It needs to be that mem_t struct, and not a local (temporary) @@ -81,7 +81,7 @@ void PASTECH2(bls_,ch,opname) \ again, I prefer to keep barriers to a minimum.) */ \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thread_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -90,13 +90,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) 
*/ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thread_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -115,7 +115,7 @@ void PASTECH2(bls_,ch,opname) \ \ if ( mem_size < size_needed ) \ { \ - if ( bli_thread_am_ochief( thread ) ) \ + if ( bli_thread_am_chief( thread ) ) \ { \ /* The chief thread releases the existing block associated with the mem_t, and then re-acquires a new block, saving @@ -125,12 +125,12 @@ void PASTECH2(bls_,ch,opname) \ (temporary) mem_t. */ \ bli_pba_release \ ( \ - rntm, \ + bli_thread_pba( thread ), \ mem \ ); \ bli_pba_acquire_m \ ( \ - rntm, \ + bli_thread_pba( thread ), \ size_needed, \ pack_buf_type, \ mem \ @@ -139,13 +139,13 @@ void PASTECH2(bls_,ch,opname) \ \ /* Broadcast the address of the chief thread's passed-in mem_t to all threads. */ \ - mem_t* mem_p = bli_thread_broadcast( rntm, thread, mem ); \ + mem_t* mem_p = bli_thread_broadcast( thread, mem ); \ \ /* Non-chief threads: Copy the contents of the chief thread's passed-in mem_t to the passed-in mem_t for this thread. (The chief thread already has the mem_t, so it does not need to perform any copy.) */ \ - if ( !bli_thread_am_ochief( thread ) ) \ + if ( !bli_thread_am_chief( thread ) ) \ { \ *mem = *mem_p; \ } \ @@ -165,39 +165,6 @@ GENTFUNC( scomplex, c, packm_init_mem_b ) GENTFUNC( dcomplex, z, packm_init_mem_b ) -#undef GENTFUNC -#define GENTFUNC( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ) \ -{ \ - if ( thread != NULL ) \ - if ( bli_thread_am_ochief( thread ) ) \ - { \ - /* Check the mem_t entry provided by the caller. Only proceed if it - is allocated, which it should be. 
*/ \ - if ( bli_mem_is_alloc( mem ) ) \ - { \ - bli_pba_release \ - ( \ - rntm, \ - mem \ - ); \ - } \ - } \ -} - -//INSERT_GENTFUNC_BASIC0( packm_finalize_mem_b ) -GENTFUNC( float, s, packm_finalize_mem_b ) -GENTFUNC( double, d, packm_finalize_mem_b ) -GENTFUNC( scomplex, c, packm_finalize_mem_b ) -GENTFUNC( dcomplex, z, packm_finalize_mem_b ) - - #undef GENTFUNC #define GENTFUNC( ctype, ch, opname ) \ \ @@ -267,8 +234,6 @@ void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ) \ { \ @@ -282,8 +247,6 @@ void PASTECH2(bls_,ch,opname) \ ( \ k_alloc, n_alloc, nr, \ cntx, \ - rntm, \ - mem, \ thread \ ); \ \ @@ -295,7 +258,7 @@ void PASTECH2(bls_,ch,opname) \ &k_max, &n_max, \ p, rs_p, cs_p, \ &pd_p, ps_p, \ - mem \ + bli_thread_mem( thread ) \ ); \ \ /* Pack matrix B to the destination buffer chosen above. Here, the packed diff --git a/sandbox/gemmlike/bls_l3_packm_b.h b/sandbox/gemmlike/bls_l3_packm_b.h index 728d21aed5..791cf9b712 100644 --- a/sandbox/gemmlike/bls_l3_packm_b.h +++ b/sandbox/gemmlike/bls_l3_packm_b.h @@ -41,8 +41,6 @@ void PASTECH2(bls_,ch,opname) \ dim_t n, \ dim_t nr, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ @@ -53,23 +51,6 @@ GENTPROT( scomplex, c, packm_init_mem_b ) GENTPROT( dcomplex, z, packm_init_mem_b ) -#undef GENTPROT -#define GENTPROT( ctype, ch, opname ) \ -\ -void PASTECH2(bls_,ch,opname) \ - ( \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ - thrinfo_t* restrict thread \ - ); \ - -//INSERT_GENTPROT_BASIC0( packm_finalize_mem_b ) -GENTPROT( float, s, packm_finalize_mem_b ) -GENTPROT( double, d, packm_finalize_mem_b ) -GENTPROT( scomplex, c, packm_finalize_mem_b ) -GENTPROT( dcomplex, z, packm_finalize_mem_b ) - - #undef GENTPROT #define GENTPROT( ctype, ch, opname ) \ \ @@ -109,8 +90,6 @@ 
void PASTECH2(bls_,ch,opname) \ ctype** restrict p, inc_t* restrict rs_p, inc_t* restrict cs_p, \ inc_t* restrict ps_p, \ cntx_t* restrict cntx, \ - rntm_t* restrict rntm, \ - mem_t* restrict mem, \ thrinfo_t* restrict thread \ ); \ diff --git a/sandbox/gemmlike/bls_l3_packm_var.h b/sandbox/gemmlike/bls_l3_packm_var.h index 98300536bc..4c6db2cac6 100644 --- a/sandbox/gemmlike/bls_l3_packm_var.h +++ b/sandbox/gemmlike/bls_l3_packm_var.h @@ -41,7 +41,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ diff --git a/sandbox/gemmlike/bls_l3_packm_var1.c b/sandbox/gemmlike/bls_l3_packm_var1.c index c0649a9ec4..263ee8bbeb 100644 --- a/sandbox/gemmlike/bls_l3_packm_var1.c +++ b/sandbox/gemmlike/bls_l3_packm_var1.c @@ -43,7 +43,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -73,11 +73,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thread_num_threads( thread ); \ + const dim_t tid = bli_thread_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). 
*/ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var2.c b/sandbox/gemmlike/bls_l3_packm_var2.c index 8d2b90cac1..b3dddd72ac 100644 --- a/sandbox/gemmlike/bls_l3_packm_var2.c +++ b/sandbox/gemmlike/bls_l3_packm_var2.c @@ -43,7 +43,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -73,11 +73,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thread_num_threads( thread ); \ + const dim_t tid = bli_thread_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/bls_l3_packm_var3.c b/sandbox/gemmlike/bls_l3_packm_var3.c index 5ea80ff424..62c1a895fb 100644 --- a/sandbox/gemmlike/bls_l3_packm_var3.c +++ b/sandbox/gemmlike/bls_l3_packm_var3.c @@ -45,7 +45,7 @@ \ void PASTECH2(bls_,ch,varname) \ ( \ - trans_t transc, \ + conj_t conjc, \ pack_t schema, \ dim_t m, \ dim_t n, \ @@ -75,11 +75,6 @@ void PASTECH2(bls_,ch,varname) \ inc_t incc; \ inc_t ldc; \ inc_t ldp; \ - conj_t conjc; \ -\ -\ - /* Extract the conjugation bit from the transposition argument. */ \ - conjc = bli_extract_conj( transc ); \ \ /* Create flags to incidate row or column storage. 
Note that the schema bit that encodes row or column is describing the form of @@ -126,8 +121,8 @@ void PASTECH2(bls_,ch,varname) \ \ /* Query the number of threads and thread ids from the current thread's packm thrinfo_t node. */ \ - const dim_t nt = bli_thread_n_way( thread ); \ - const dim_t tid = bli_thread_work_id( thread ); \ + const dim_t nt = bli_thread_num_threads( thread ); \ + const dim_t tid = bli_thread_thread_id( thread ); \ \ /* Suppress warnings in case tid isn't used (ie: as in slab partitioning). */ \ ( void )nt; \ diff --git a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c index 9c29ef27e7..d8ad17e94e 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_openmp.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_openmp.c @@ -62,44 +62,25 @@ void bls_l3_thread_decorator_openmp // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); - // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); - + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, BLIS_OPENMP, n_threads ); _Pragma( "omp parallel num_threads(n_threads)" ) { // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; // Query the thread's id from OpenMP. 
const dim_t tid = omp_get_thread_num(); // Check for a somewhat obscure OpenMP thread-mistmatch issue. - bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, rntm_p ); - - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; + bli_l3_thread_decorator_thread_check( n_threads, tid, gl_comm, &rntm_l ); // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + pool_t* pool = bli_apool_array_elem( tid, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, &rntm_l ); func ( @@ -109,12 +90,12 @@ void bls_l3_thread_decorator_openmp beta, c, cntx, - rntm_p, - thread + &rntm_l, + bli_thrinfo_sub_node( thread ) ); // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); } // We shouldn't free the global communicator since it was already freed diff --git a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c index 95d0e968ec..9f57dc4e61 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_pthreads.c @@ -76,19 +76,11 @@ void* bls_l3_thread_entry( void* data_void ) // Create a thread-local copy of the master thread's rntm_t. This is // necessary since we want each thread to be able to track its own // small block pool_t as it executes down the function stack. - rntm_t rntm_l = *rntm; - rntm_t* restrict rntm_p = &rntm_l; + rntm_t rntm_l = *rntm; - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. 
- // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - bli_sba_rntm_set_pool( tid, array, rntm_p ); - - thrinfo_t* thread = NULL; - - // Create the root node of the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); + // Create the root node of the thread's thrinfo_t structure. + pool_t* pool = bli_apool_array_elem( tid, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, &rntm_l ); func ( @@ -98,12 +90,12 @@ void* bls_l3_thread_entry( void* data_void ) beta, c, cntx, - rntm_p, - thread + &rntm_l, + bli_thrinfo_sub_node( thread ) ); // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); + bli_thrinfo_free( thread ); return NULL; } @@ -132,20 +124,10 @@ void bls_l3_thread_decorator_pthreads // with an internal lock to ensure only one application thread accesses // the sba at a time. bli_sba_checkout_array() will also automatically // resize the array_t, if necessary. - array_t* restrict array = bli_sba_checkout_array( n_threads ); - - // Access the pool_t* for thread 0 and embed it into the rntm. We do - // this up-front only so that we have the rntm_t.sba_pool field - // initialized and ready for the global communicator creation below. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. This will be - // inherited by all of the child threads when they make local copies of - // the rntm below. - bli_pba_rntm_set_pba( rntm ); + array_t* array = bli_sba_checkout_array( n_threads ); // Allocate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); + thrcomm_t* gl_comm = bli_thrcomm_create( NULL, BLIS_POSIX, n_threads ); // Allocate an array of pthread objects and auxiliary data structs to pass // to the thread entry functions. 
diff --git a/sandbox/gemmlike/thread/bls_l3_decor_single.c b/sandbox/gemmlike/thread/bls_l3_decor_single.c index b5f5a66692..118712a062 100644 --- a/sandbox/gemmlike/thread/bls_l3_decor_single.c +++ b/sandbox/gemmlike/thread/bls_l3_decor_single.c @@ -62,50 +62,16 @@ void bls_l3_thread_decorator_single // resize the array_t, if necessary. array_t* array = bli_sba_checkout_array( n_threads ); - // Access the pool_t* for thread 0 and embed it into the rntm. - bli_sba_rntm_set_pool( 0, array, rntm ); - - // Set the packing block allocator field of the rntm. - bli_pba_rntm_set_pba( rntm ); - -#ifndef SKIP_THRINFO_TREE // Allcoate a global communicator for the root thrinfo_t structures. - thrcomm_t* gl_comm = bli_thrcomm_create( rntm, n_threads ); -#endif - + thrcomm_t* gl_comm = &BLIS_SINGLE_COMM; { - // NOTE: We don't need to create another copy of the rntm_t since - // it was already copied in one of the high-level oapi functions. - rntm_t* rntm_p = rntm; - // There is only one thread id (for the thief thread). const dim_t tid = 0; - // Use the thread id to access the appropriate pool_t* within the - // array_t, and use it to set the sba_pool field within the rntm_t. - // If the pool_t* element within the array_t is NULL, it will first - // be allocated/initialized. - // NOTE: This is commented out because, in the single-threaded case, - // this is redundant since it's already been done above. - //bli_sba_rntm_set_pool( tid, array, rntm_p ); - -#ifndef SKIP_THRINFO_TREE - thrinfo_t* thread = NULL; - - // Create the root node of the thread's thrinfo_t structure. - bli_l3_sup_thrinfo_create_root( tid, gl_comm, rntm_p, &thread ); -#else - // This optimization allows us to use one of the global thrinfo_t - // objects for single-threaded execution rather than grow one from - // scratch. 
The key is that bli_thrinfo_sup_grow(), which is called - // from within the variants, will immediately return if it detects - // that the thrinfo_t* passed into it is either - // &BLIS_GEMM_SINGLE_THREADED or &BLIS_PACKM_SINGLE_THREADED. - thrinfo_t* thread = &BLIS_GEMM_SINGLE_THREADED; - - ( void )tid; -#endif + // Create the root node of the thread's thrinfo_t structure. + pool_t* pool = bli_apool_array_elem( tid, array ); + thrinfo_t* thread = bli_l3_sup_thrinfo_create( tid, gl_comm, pool, rntm ); func ( @@ -115,14 +81,12 @@ void bls_l3_thread_decorator_single beta, c, cntx, - rntm_p, - thread + rntm, + bli_thrinfo_sub_node( thread ) ); -#ifndef SKIP_THRINFO_TREE // Free the current thread's thrinfo_t structure. - bli_l3_sup_thrinfo_free( rntm_p, thread ); -#endif + bli_thrinfo_free( thread ); } // We shouldn't free the global communicator since it was already freed