From 9e80f85dcac94bedaea6857772a26864aab207e3 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 6 Dec 2016 07:54:14 +0100
Subject: [PATCH 01/41] Add a strb method to read from a file.

---
 src/util/strb.c | 18 +++++++++++++++++-
 src/util/strb.h |  7 +++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/util/strb.c b/src/util/strb.c
index b202b5065c..273aa8fa6e 100644
--- a/src/util/strb.c
+++ b/src/util/strb.c
@@ -1,5 +1,7 @@
-
+#include <errno.h>
 #include <stdarg.h>
+#include <unistd.h>
+
 #include "util/strb.h"
 
 strb *strb_alloc(size_t i) {
@@ -55,3 +57,17 @@ void strb_appendf(strb *sb, const char *f, ...) {
   va_end(ap);
   sb->l += s;
 }
+
+void strb_read(strb *sb, int fd, size_t sz) {
+  ssize_t res;
+  char *b;
+  if (strb_ensure(sb, sz)) return;
+  b = sb->s + sb->l;
+  sb->l += sz;
+  while (sz) {
+    res = read(fd, b, sz);
+    if (res == -1 && !(errno == EAGAIN || errno == EINTR)) { strb_seterror(sb); return; }
+    sz -= (size_t)res;
+    b += (size_t)res;
+  }
+}
diff --git a/src/util/strb.h b/src/util/strb.h
index b2f18449d7..490031969d 100644
--- a/src/util/strb.h
+++ b/src/util/strb.h
@@ -161,6 +161,13 @@ static inline void strb_appendb(strb *sb, strb *sb2) {
  */
 GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...);
 
+/*
+ * Reads from the file specified by the given file descriptor.
+ *
+ * A read error will place the strb in error mode.
+ */
+GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz);
+
 /*
  * Returns a C string from the content of the strb.
  *

From ea6dfb6b82f8c5e6a274658331f94567f76ce91f Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 7 Dec 2016 06:49:00 +0100
Subject: [PATCH 02/41] Import skein, stripping away everything but
 skein512-512 and the portable goo.

---
 src/util/CMakeLists.txt |   1 +
 src/util/skein.c        | 309 ++++++++++++++++++++++++++++++++++++++++
 src/util/skein.h        | 145 +++++++++++++++++++
 3 files changed, 455 insertions(+)
 create mode 100644 src/util/skein.c
 create mode 100644 src/util/skein.h

diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt
index 61a603b44a..5c21cc3e90 100644
--- a/src/util/CMakeLists.txt
+++ b/src/util/CMakeLists.txt
@@ -2,4 +2,5 @@ set_rel(UTIL_SRC
 strb.c
 xxhash.c
 integerfactoring.c
+skein.c
 )
diff --git a/src/util/skein.c b/src/util/skein.c
new file mode 100644
index 0000000000..50285ea2c9
--- /dev/null
+++ b/src/util/skein.c
@@ -0,0 +1,309 @@
+/***********************************************************************
+**
+** Implementation of the Skein hash function.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+************************************************************************/
+
+#include <string.h>      /* get the memcpy/memset functions */
+#include "skein.h"       /* get the Skein API definitions   */
+
+#define MK_64 SKEIN_MK_64
+
+/* blkSize =  512 bits. hashSize =  512 bits */
+static const u64b_t SKEIN_512_IV_512[] =
+  {
+    MK_64(0x4903ADFF,0x749C51CE),
+    MK_64(0x0D95DE39,0x9746DF03),
+    MK_64(0x8FD19341,0x27C79BCE),
+    MK_64(0x9A255629,0xFF352CB1),
+    MK_64(0x5DB62599,0xDF6CA7B0),
+    MK_64(0xEABE394C,0xA9D5C3F4),
+    MK_64(0x991112C7,0x1A75B523),
+    MK_64(0xAE18A40B,0x660FCC33)
+  };
+
+static void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) {
+  size_t n;
+
+  for (n = 0; n < bCnt; n++)
+    dst[n] = (u08b_t)(src[n>>3] >> (8*(n&7)));
+}
+
+static void Skein_Get64_LSB_First(u64b_t *dst, const u08b_t *src,
+                                  size_t wCnt) {
+  size_t n;
+
+  for (n=0; n<8*wCnt; n+=8)
+    dst[n/8] = (((u64b_t) src[n  ])) +
+      (((u64b_t) src[n+1]) <<  8) +
+      (((u64b_t) src[n+2]) << 16) +
+      (((u64b_t) src[n+3]) << 24) +
+      (((u64b_t) src[n+4]) << 32) +
+      (((u64b_t) src[n+5]) << 40) +
+      (((u64b_t) src[n+6]) << 48) +
+      (((u64b_t) src[n+7]) << 56) ;
+}
+
+static u64b_t Skein_Swap64(u64b_t in) {
+  u64b_t o;
+  u08b_t *out = (u08b_t *)&o;
+  out[7] = in >> 56;
+  out[6] = in >> 48;
+  out[5] = in >> 40;
+  out[4] = in >> 32;
+  out[3] = in >> 24;
+  out[2] = in >> 16;
+  out[1] = in >> 8;
+  out[0] = in;
+  return o;
+}
+
+/*****************************************************************/
+/* Function to process blkCnt (nonzero) full block(s) of data. */
+#define BLK_BITS        (WCNT*64)               /* some useful definitions for \
+                                                   code here */
+#define KW_TWK_BASE     (0)
+#define KW_KEY_BASE     (3)
+#define ks              (kw + KW_KEY_BASE)
+#define ts              (kw + KW_TWK_BASE)
+
+#define RotL_64(x,N)    (((x) << (N)) | ((x) >> (64-(N))))
+
+static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const u08b_t *blkPtr,
+                             size_t blkCnt, size_t byteCntAdd) {
+  enum {
+      WCNT = SKEIN_512_STATE_WORDS
+  };
+#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)
+
+  u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
+  u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
+  u64b_t  w [WCNT];                           /* local copy of input block */
+
+  Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
+  ts[0] = ctx->h.T[0];
+  ts[1] = ctx->h.T[1];
+  do  {
+        /* this implementation only supports 2**64 input bytes (no carry out here) */
+    ts[0] += byteCntAdd;                    /* update processed length */
+
+    /* precompute the key schedule for this block */
+    ks[0] = ctx->X[0];
+    ks[1] = ctx->X[1];
+    ks[2] = ctx->X[2];
+    ks[3] = ctx->X[3];
+    ks[4] = ctx->X[4];
+    ks[5] = ctx->X[5];
+    ks[6] = ctx->X[6];
+    ks[7] = ctx->X[7];
+    ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+      ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+
+    ts[2] = ts[0] ^ ts[1];
+
+    Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
+
+    X0   = w[0] + ks[0];                    /* do the first full key injection */
+    X1   = w[1] + ks[1];
+    X2   = w[2] + ks[2];
+    X3   = w[3] + ks[3];
+    X4   = w[4] + ks[4];
+    X5   = w[5] + ks[5] + ts[0];
+    X6   = w[6] + ks[6] + ts[1];
+    X7   = w[7] + ks[7];
+
+    blkPtr += SKEIN_512_BLOCK_BYTES;
+
+    /* run the rounds */
+#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
+    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
+    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \
+
+#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
+    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)
+
+#define I512(R)                                                     \
+    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
+    X1   += ks[((R)+2) % 9];                                        \
+    X2   += ks[((R)+3) % 9];                                        \
+    X3   += ks[((R)+4) % 9];                                        \
+    X4   += ks[((R)+5) % 9];                                        \
+    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
+    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
+    X7   += ks[((R)+8) % 9] +     (R)+1;
+
+    {
+
+#define R512_8_rounds(R)  /* do 8 full rounds */  \
+        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
+        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
+        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
+        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
+        I512(2*(R));                              \
+        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
+        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
+        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
+        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
+        I512(2*(R)+1);        /* and key injection */
+
+      R512_8_rounds( 0);
+
+#define R512_Unroll_R(NN) (SKEIN_512_ROUNDS_TOTAL/8 > (NN))
+
+  #if   R512_Unroll_R( 1)
+      R512_8_rounds( 1);
+  #endif
+  #if   R512_Unroll_R( 2)
+      R512_8_rounds( 2);
+  #endif
+  #if   R512_Unroll_R( 3)
+      R512_8_rounds( 3);
+  #endif
+  #if   R512_Unroll_R( 4)
+      R512_8_rounds( 4);
+  #endif
+  #if   R512_Unroll_R( 5)
+      R512_8_rounds( 5);
+  #endif
+  #if   R512_Unroll_R( 6)
+      R512_8_rounds( 6);
+  #endif
+  #if   R512_Unroll_R( 7)
+      R512_8_rounds( 7);
+  #endif
+  #if   R512_Unroll_R( 8)
+      R512_8_rounds( 8);
+  #endif
+  #if   R512_Unroll_R( 9)
+      R512_8_rounds( 9);
+  #endif
+  #if   R512_Unroll_R(10)
+      R512_8_rounds(10);
+  #endif
+  #if   R512_Unroll_R(11)
+      R512_8_rounds(11);
+  #endif
+  #if   R512_Unroll_R(12)
+      R512_8_rounds(12);
+  #endif
+  #if   R512_Unroll_R(13)
+      R512_8_rounds(13);
+  #endif
+  #if   R512_Unroll_R(14)
+      R512_8_rounds(14);
+  #endif
+    }
+
+    /* do the final "feedforward" xor, update context chaining vars */
+    ctx->X[0] = X0 ^ w[0];
+    ctx->X[1] = X1 ^ w[1];
+    ctx->X[2] = X2 ^ w[2];
+    ctx->X[3] = X3 ^ w[3];
+    ctx->X[4] = X4 ^ w[4];
+    ctx->X[5] = X5 ^ w[5];
+    ctx->X[6] = X6 ^ w[6];
+    ctx->X[7] = X7 ^ w[7];
+
+    ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+  }
+  while (--blkCnt);
+  ctx->h.T[0] = ts[0];
+  ctx->h.T[1] = ts[1];
+}
+
+/*****************************************************************/
+/*     512-bit Skein                                             */
+/*****************************************************************/
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* init the context for a straight hashing operation  */
+int Skein_512_Init(Skein_512_Ctxt_t *ctx) {
+  ctx->h.hashBitLen = 512;         /* output hash bit count */
+  memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X));
+
+  /* Set up to process the data message portion of the hash (default) */
+  Skein_Start_New_Type(ctx,MSG);              /* T0=0, T1= MSG type */
+
+  return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* process the input bytes */
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg,
+                     size_t msgByteCnt) {
+  size_t n;
+
+  Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+  /* process full blocks, if any */
+  if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) {
+    if (ctx->h.bCnt) {                              /* finish up any buffered message data */
+      n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
+      if (n) {
+        Skein_assert(n < msgByteCnt);         /* check on our logic here */
+        memcpy(&ctx->b[ctx->h.bCnt],msg,n);
+        msgByteCnt  -= n;
+        msg         += n;
+        ctx->h.bCnt += n;
+      }
+      Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+      Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+      ctx->h.bCnt = 0;
+    }
+    /* now process any remaining full blocks, directly from input message data */
+    if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {
+      n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES;   /* number of full blocks to process */
+      Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES);
+      msgByteCnt -= n * SKEIN_512_BLOCK_BYTES;
+      msg        += n * SKEIN_512_BLOCK_BYTES;
+    }
+    Skein_assert(ctx->h.bCnt == 0);
+  }
+
+  /* copy any remaining source message data bytes into b[] */
+  if (msgByteCnt) {
+    Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+    memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+    ctx->h.bCnt += msgByteCnt;
+  }
+
+  return SKEIN_SUCCESS;
+}
+
+/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
+/* finalize the hash computation and output the result */
+int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) {
+  size_t i,n,byteCnt;
+  u64b_t X[SKEIN_512_STATE_WORDS],ctr;
+  Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL);    /* catch uninitialized context */
+
+  ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
+  if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)            /* zero pad b[] if necessary */
+    memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+
+  Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+
+  /* now output the result */
+  byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
+  /* run Threefish in "counter mode" to generate output */
+  memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+  memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
+  for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) {
+    ctr = Skein_Swap64((u64b_t) i);  /* counter value laid out LSB-first in memory */
+    memcpy(ctx->b,&ctr,sizeof(ctr)); /* build the counter block; b[] is a byte array, so copy rather than store through a u64b_t pointer */
+    Skein_Start_New_Type(ctx,OUT_FINAL);
+    Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+    n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
+    if (n >= SKEIN_512_BLOCK_BYTES)
+      n  = SKEIN_512_BLOCK_BYTES;
+    Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n);   /* "output" the ctr mode bytes */
+    memcpy(ctx->X,X,sizeof(X));   /* restore the counter mode key for next time */
+  }
+  return SKEIN_SUCCESS;
+}
diff --git a/src/util/skein.h b/src/util/skein.h
new file mode 100644
index 0000000000..f21e64409a
--- /dev/null
+++ b/src/util/skein.h
@@ -0,0 +1,145 @@
+#ifndef _SKEIN_H_
+#define _SKEIN_H_     1
+/**************************************************************************
+**
+** Interface declarations and internal definitions for Skein hashing.
+**
+** Source code author: Doug Whiting, 2008.
+**
+** This algorithm and source code is released to the public domain.
+**
+***************************************************************************
+**
+** The following compile-time switches may be defined to control some
+** tradeoffs between speed, code size, error checking, and security.
+**
+** The "default" note explains what happens when the switch is not defined.
+**
+**  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
+**                            code. If not defined, most error checking
+**                            is disabled (for performance). Otherwise,
+**                            the switch value is interpreted as:
+**                                0: use assert()      to flag errors
+**                                1: return SKEIN_FAIL to flag errors
+**
+***************************************************************************/
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#include <stddef.h>                          /* get size_t definition */
+#include <gpuarray/config.h>
+typedef unsigned int uint_t;
+typedef uint8_t  u08b_t;
+typedef uint64_t u64b_t;
+
+enum {
+  SKEIN_SUCCESS         =      0,          /* return codes from Skein calls */
+  SKEIN_FAIL            =      1
+};
+
+#define  SKEIN_MODIFIER_WORDS  ( 2)     /* number of modifier (tweak) words */
+
+#define  SKEIN_512_STATE_WORDS ( 8)
+
+#define  SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
+#define  SKEIN_512_STATE_BITS  (64*SKEIN_512_STATE_WORDS)
+#define  SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
+
+typedef struct {
+  size_t  hashBitLen;                        /* size of hash result, in bits */
+  size_t  bCnt;                          /* current byte count in buffer b[] */
+  u64b_t  T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */
+} Skein_Ctxt_Hdr_t;
+
+typedef struct {                     /* 512-bit Skein hash context structure */
+  Skein_Ctxt_Hdr_t h;                     /* common header context variables */
+  u64b_t  X[SKEIN_512_STATE_WORDS];                    /* chaining variables */
+  u08b_t  b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
+} Skein_512_Ctxt_t;
+
+/*   Skein APIs for (incremental) "straight hashing" */
+int  Skein_512_Init  (Skein_512_Ctxt_t *ctx);
+int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
+int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+
+/*****************************************************************
+** "Internal" Skein definitions
+**    -- not needed for sequential hashing API, but will be
+**           helpful for other uses of Skein (e.g., tree hash mode).
+**    -- included here so that they can be shared between
+**           reference and optimized code.
+******************************************************************/
+
+/* tweak word T[1]: bit field starting positions */
+#define SKEIN_T1_BIT(BIT)       ((BIT) - 64)            /* offset 64 because it's the second word  */
+
+#define SKEIN_T1_POS_BLK_TYPE   SKEIN_T1_BIT(120)       /* bits 120..125: type field               */
+#define SKEIN_T1_POS_FIRST      SKEIN_T1_BIT(126)       /* bits 126     : first block flag         */
+#define SKEIN_T1_POS_FINAL      SKEIN_T1_BIT(127)       /* bit  127     : final block flag         */
+
+/* tweak word T[1]: flag bit definition(s) */
+#define SKEIN_T1_FLAG_FIRST     (((u64b_t)  1 ) << SKEIN_T1_POS_FIRST)
+#define SKEIN_T1_FLAG_FINAL     (((u64b_t)  1 ) << SKEIN_T1_POS_FINAL)
+
+/* tweak word T[1]: block type field */
+#define SKEIN_BLK_TYPE_MSG      (48)              /* message processing */
+#define SKEIN_BLK_TYPE_OUT      (63)                    /* output stage */
+
+#define SKEIN_T1_BLK_TYPE(T)   (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+#define SKEIN_T1_BLK_TYPE_MSG   SKEIN_T1_BLK_TYPE(MSG) /* message processing */
+#define SKEIN_T1_BLK_TYPE_OUT   SKEIN_T1_BLK_TYPE(OUT)       /* output stage */
+
+#define SKEIN_T1_BLK_TYPE_OUT_FINAL       (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define SKEIN_MK_64(hi32,lo32)  ((lo32) + (((u64b_t) (hi32)) << 32))
+#define SKEIN_KS_PARITY         SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
+
+/*
+**   Skein macros for setting tweak words, etc.
+**/
+#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal)    {(ctxPtr)->h.T[TWK_NUM] = (tVal);}
+
+#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0)
+#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1)
+
+/* set both tweak words at once */
+#define Skein_Set_T0_T1(ctxPtr,T0,T1)         \
+    {                                           \
+    Skein_Set_T0(ctxPtr,(T0));                  \
+    Skein_Set_T1(ctxPtr,(T1));                  \
+    }
+
+/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */
+#define Skein_Start_New_Type(ctxPtr,BLK_TYPE)                         \
+  { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; }
+
+/**************************************************
+** "Internal" Skein definitions for error checking
+***************************************************/
+
+#include <assert.h>
+#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /*  caller  error */
+#define Skein_assert(x)         assert(x)                     /* internal error */
+
+/*****************************************************************
+** Skein block function constants (shared across Ref and Opt code)
+******************************************************************/
+enum {
+  /* Skein_512 round rotation constants */
+  R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37,
+  R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42,
+  R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39,
+  R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56,
+  R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24,
+  R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17,
+  R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43,
+  R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22,
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* ifndef _SKEIN_H_ */

From 0cb043f2fcf72396c85088dfb36897553353d777 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 7 Dec 2016 12:21:55 +0100
Subject: [PATCH 03/41] Add a function to do all the steps.

---
 src/util/skein.c | 8 ++++++++
 src/util/skein.h | 1 +
 2 files changed, 9 insertions(+)

diff --git a/src/util/skein.c b/src/util/skein.c
index 50285ea2c9..51362e5efb 100644
--- a/src/util/skein.c
+++ b/src/util/skein.c
@@ -307,3 +307,11 @@ int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) {
   }
   return SKEIN_SUCCESS;
 }
+
+int Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal) {
+  Skein_512_Ctxt_t ctx;
+  if (Skein_512_Init(&ctx)) return SKEIN_FAIL;
+  if (Skein_512_Update(&ctx, msg, msgByteCnt)) return SKEIN_FAIL;
+  if (Skein_512_Final(&ctx, hashVal)) return SKEIN_FAIL;
+  return SKEIN_SUCCESS;
+}
diff --git a/src/util/skein.h b/src/util/skein.h
index f21e64409a..89d7ebf209 100644
--- a/src/util/skein.h
+++ b/src/util/skein.h
@@ -63,6 +63,7 @@ typedef struct {                     /* 512-bit Skein hash context structure */
 int  Skein_512_Init  (Skein_512_Ctxt_t *ctx);
 int  Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt);
 int  Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal);
+int  Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal);
 
 /*****************************************************************
 ** "Internal" Skein definitions

From dd06f091bbc518883a5e117a931ef9b9008aaecc Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 8 Dec 2016 21:06:31 +0100
Subject: [PATCH 04/41] Add strb_write().

---
 src/util/strb.c | 24 +++++++++++++++++++++++-
 src/util/strb.h |  7 +++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/util/strb.c b/src/util/strb.c
index 273aa8fa6e..15cd496c4f 100644
--- a/src/util/strb.c
+++ b/src/util/strb.c
@@ -66,8 +66,30 @@ void strb_read(strb *sb, int fd, size_t sz) {
   sb->l += sz;
   while (sz) {
     res = read(fd, b, sz);
-    if (res == -1 && !(errno == EAGAIN || errno == EINTR)) { strb_seterror(sb); return; }
+    if (res <= 0) { /* 0 is EOF before sz bytes arrived: fail instead of looping forever */
+      if (res == -1 && (errno == EAGAIN || errno == EINTR))
+        continue; /* transient failure: retry the same read */
+      strb_seterror(sb);
+      return;
+    }
     sz -= (size_t)res;
     b += (size_t)res;
   }
 }
+
+int strb_write(int fd, strb *sb) { /* writes all sb->l bytes of sb->s to fd; 0 on success, -1 on failure */
+  ssize_t res;
+  size_t l = sb->l;
+  char *b = sb->s;
+  while (l) {
+    res = write(fd, b, l);
+    if (res <= 0) { /* a zero-byte write makes no progress: fail rather than spin */
+      if (res == -1 && (errno == EAGAIN || errno == EINTR))
+        continue; /* transient failure: retry the same write */
+      return -1;
+    }
+    l -= (size_t)res; /* short write: advance past what was accepted and go again */
+    b += (size_t)res;
+  }
+  return 0;
+}
diff --git a/src/util/strb.h b/src/util/strb.h
index 490031969d..3fc1071ea4 100644
--- a/src/util/strb.h
+++ b/src/util/strb.h
@@ -168,6 +168,13 @@ GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...);
  */
 GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz);
 
+/*
+ * Write the content of an strb to the specified file descriptor.
+ *
+ * Write errors will be signaled by a nonzero return value.
+ */
+GPUARRAY_LOCAL int strb_write(int fd, strb *sb);
+
 /*
  * Returns a C string from the content of the strb.
  *

From c59ddae81487b2d71196f0627d205cbb7dc6714b Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 8 Dec 2016 21:55:58 +0100
Subject: [PATCH 05/41] Fix wrong export type.

---
 src/gpuarray/buffer.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h
index d6d3dd8a09..800756a072 100644
--- a/src/gpuarray/buffer.h
+++ b/src/gpuarray/buffer.h
@@ -328,9 +328,9 @@ GPUARRAY_PUBLIC int gpudata_move(gpudata *dst, size_t dstoff,
  * \returns the new buffer in dst_ctx or NULL if no efficient way to
  *          transfer could be found.
  */
-GPUARRAY_LOCAL int gpudata_transfer(gpudata *dst, size_t dstoff,
-                                    gpudata *src, size_t srcoff,
-                                    size_t sz);
+GPUARRAY_PUBLIC int gpudata_transfer(gpudata *dst, size_t dstoff,
+                                     gpudata *src, size_t srcoff,
+                                     size_t sz);
 
 /**
  * Transfer data from a buffer to memory.

From b12013bf410ea1d7fb6b1a4665a6dc353b530b4f Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 8 Dec 2016 22:02:39 +0100
Subject: [PATCH 06/41] Remove GPUARRAY_LOCAL, it is not needed since we
 default to visibility=hidden

---
 src/gpuarray/config.h                |  6 +-----
 src/gpuarray_blas_cuda_cublas.c      |  2 +-
 src/gpuarray_blas_opencl_clblas.c    |  2 +-
 src/gpuarray_blas_opencl_clblast.c   |  2 +-
 src/gpuarray_buffer_cuda.c           |  3 +--
 src/gpuarray_buffer_opencl.c         |  3 +--
 src/gpuarray_collectives_cuda_nccl.c |  2 +-
 src/private.h                        | 28 ++++++++++++++--------------
 src/private_config.h.in              |  6 +++---
 src/private_cuda.h                   | 17 ++++++++---------
 src/private_opencl.h                 |  8 ++++----
 src/util/strb.h                      | 12 ++++++------
 src/util/xxhash.h                    |  8 ++++----
 13 files changed, 46 insertions(+), 53 deletions(-)

diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h
index f8fc86a01d..571f81cfe6 100644
--- a/src/gpuarray/config.h
+++ b/src/gpuarray/config.h
@@ -12,19 +12,15 @@
   #else
    #define GPUARRAY_PUBLIC __declspec(dllimport)
   #endif
-  #define GPUARRAY_LOCAL
  #else
   #if __GNUC__ >= 4
    #define GPUARRAY_PUBLIC __attribute__((visibility ("default")))
-   #define GPUARRAY_LOCAL  __attribute__((visibility ("hidden")))
   #else
-   #define GPUARRAY_PUBLIC
-   #define GPUARRAY_LOCAL
+   #error "Don't know how to export symbols on this platform"
   #endif
  #endif
 #else
  #define GPUARRAY_PUBLIC
- #define GPUARRAY_LOCAL
 #endif
 
 #ifdef _MSC_VER
diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c
index 6d4648e232..a7b91ed87e 100644
--- a/src/gpuarray_blas_cuda_cublas.c
+++ b/src/gpuarray_blas_cuda_cublas.c
@@ -1640,7 +1640,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
   return GA_NO_ERROR;
 }
 
-GPUARRAY_LOCAL gpuarray_blas_ops cublas_ops = {
+gpuarray_blas_ops cublas_ops = {
   setup,
   teardown,
   error,
diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c
index 2041710735..8ee019afb7 100644
--- a/src/gpuarray_blas_opencl_clblas.c
+++ b/src/gpuarray_blas_opencl_clblas.c
@@ -491,7 +491,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha,
   return GA_NO_ERROR;
 }
 
-GPUARRAY_LOCAL gpuarray_blas_ops clblas_ops = {
+gpuarray_blas_ops clblas_ops = {
   setup,
   teardown,
   error,
diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c
index 78cca10f20..4a5369e56e 100644
--- a/src/gpuarray_blas_opencl_clblast.c
+++ b/src/gpuarray_blas_opencl_clblast.c
@@ -525,7 +525,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha,
   return GA_NO_ERROR;
 }
 
-GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = {
+gpuarray_blas_ops clblast_ops = {
   setup,
   teardown,
   error,
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 447400f277..1c883deaa7 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -39,7 +39,7 @@ STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcme
 
 static CUresult err;
 
-GPUARRAY_LOCAL const gpuarray_buffer_ops cuda_ops;
+const gpuarray_buffer_ops cuda_ops;
 
 static void cuda_freekernel(gpukernel *);
 static int cuda_property(gpucontext *, gpudata *, gpukernel *, int, void *);
@@ -1689,7 +1689,6 @@ static const char *cuda_error(gpucontext *c) {
   return errstr;
 }
 
-GPUARRAY_LOCAL
 const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count,
                                       cuda_get_device_count,
                                       cuda_init,
diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c
index adf34a3825..84bcb6584d 100644
--- a/src/gpuarray_buffer_opencl.c
+++ b/src/gpuarray_buffer_opencl.c
@@ -28,7 +28,7 @@ static cl_int err;
 #define CHKFAIL(v) if (err != CL_SUCCESS) FAIL(v, GA_IMPL_ERROR)
 
 
-GPUARRAY_LOCAL const gpuarray_buffer_ops opencl_ops;
+const gpuarray_buffer_ops opencl_ops;
 
 static int cl_property(gpucontext *c, gpudata *b, gpukernel *k, int p, void *r);
 static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags,
@@ -1448,7 +1448,6 @@ static const char *cl_error(gpucontext *c) {
   }
 }
 
-GPUARRAY_LOCAL
 const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count,
                                         cl_get_device_count,
                                         cl_init,
diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c
index a0f6d12060..e382cfa066 100644
--- a/src/gpuarray_collectives_cuda_nccl.c
+++ b/src/gpuarray_collectives_cuda_nccl.c
@@ -455,6 +455,6 @@ static int all_gather(gpudata* src, size_t offsrc, gpudata* dest,
  * linked in \ref gpuarray_buffer_cuda.c, in order to fill a /ref gpucontext's
  * comm_ops.
  */
-GPUARRAY_LOCAL gpuarray_comm_ops nccl_ops = {
+gpuarray_comm_ops nccl_ops = {
     comm_new, comm_free,  generate_clique_id, get_count, get_rank,
     reduce,   all_reduce, reduce_scatter,     broadcast, all_gather};
diff --git a/src/private.h b/src/private.h
index 0513df8605..7405cf8995 100644
--- a/src/private.h
+++ b/src/private.h
@@ -256,26 +256,26 @@ static inline void *memdup(const void *p, size_t s) {
   return res;
 }
 
-GPUARRAY_LOCAL int GpuArray_is_c_contiguous(const GpuArray *a);
-GPUARRAY_LOCAL int GpuArray_is_f_contiguous(const GpuArray *a);
-GPUARRAY_LOCAL int GpuArray_is_aligned(const GpuArray *a);
+int GpuArray_is_c_contiguous(const GpuArray *a);
+int GpuArray_is_f_contiguous(const GpuArray *a);
+int GpuArray_is_aligned(const GpuArray *a);
 
-GPUARRAY_LOCAL extern const gpuarray_type scalar_types[];
-GPUARRAY_LOCAL extern const gpuarray_type vector_types[];
+extern const gpuarray_type scalar_types[];
+extern const gpuarray_type vector_types[];
 
 /*
  * This function generates the kernel code to perform indexing on var id
  * from planar index 'i' using the dimensions and strides provided.
  */
-GPUARRAY_LOCAL void gpuarray_elem_perdim(strb *sb, unsigned int nd,
-                                         const size_t *dims,
-                                         const ssize_t *str,
-                                         const char *id);
-
-GPUARRAY_LOCAL void gpukernel_source_with_line_numbers(unsigned int count,
-                                                       const char **news,
-                                                       size_t *newl,
-                                                       strb *src);
+void gpuarray_elem_perdim(strb *sb, unsigned int nd,
+                          const size_t *dims,
+                          const ssize_t *str,
+                          const char *id);
+
+void gpukernel_source_with_line_numbers(unsigned int count,
+                                        const char **news,
+                                        size_t *newl,
+                                        strb *src);
 
 static inline uint16_t float_to_half(float value) {
 #define ga__shift 13
diff --git a/src/private_config.h.in b/src/private_config.h.in
index 23db862c4f..c3cd3a0195 100644
--- a/src/private_config.h.in
+++ b/src/private_config.h.in
@@ -39,12 +39,12 @@ extern "C" {
 #define nelems(a) (sizeof(a)/sizeof(a[0]))
 
 #ifndef HAVE_MKSTEMP
-GPUARRAY_LOCAL int mkstemp(char *path);
+int mkstemp(char *path);
 #endif
 
 #ifndef HAVE_STRL
-GPUARRAY_LOCAL size_t strlcpy(char *dst, const char *src, size_t size);
-GPUARRAY_LOCAL size_t strlcat(char *dst, const char *src, size_t size);
+size_t strlcpy(char *dst, const char *src, size_t size);
+size_t strlcat(char *dst, const char *src, size_t size);
 #endif
 
 #ifdef __cplusplus
diff --git a/src/private_cuda.h b/src/private_cuda.h
index 6fab1597ac..da6f60ad7a 100644
--- a/src/private_cuda.h
+++ b/src/private_cuda.h
@@ -93,10 +93,10 @@ STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext),
 
 #define ARCH_PREFIX "compute_"
 
-GPUARRAY_LOCAL cuda_context *cuda_make_ctx(CUcontext ctx, int flags);
-GPUARRAY_LOCAL CUstream cuda_get_stream(cuda_context *ctx);
-GPUARRAY_LOCAL void cuda_enter(cuda_context *ctx);
-GPUARRAY_LOCAL void cuda_exit(cuda_context *ctx);
+cuda_context *cuda_make_ctx(CUcontext ctx, int flags);
+CUstream cuda_get_stream(cuda_context *ctx);
+void cuda_enter(cuda_context *ctx);
+void cuda_exit(cuda_context *ctx);
 
 struct _gpudata {
   CUdeviceptr ptr;
@@ -115,11 +115,10 @@ struct _gpudata {
 #endif
 };
 
-GPUARRAY_LOCAL gpudata *cuda_make_buf(cuda_context *c, CUdeviceptr p,
-                                      size_t sz);
-GPUARRAY_LOCAL size_t cuda_get_sz(gpudata *g);
-GPUARRAY_LOCAL int cuda_wait(gpudata *, int);
-GPUARRAY_LOCAL int cuda_record(gpudata *, int);
+gpudata *cuda_make_buf(cuda_context *c, CUdeviceptr p, size_t sz);
+size_t cuda_get_sz(gpudata *g);
+int cuda_wait(gpudata *, int);
+int cuda_record(gpudata *, int);
 
 /* private flags are in the upper 16 bits */
 #define CUDA_WAIT_READ  0x10000
diff --git a/src/private_opencl.h b/src/private_opencl.h
index e40242d57e..2a523f5bda 100644
--- a/src/private_opencl.h
+++ b/src/private_opencl.h
@@ -67,9 +67,9 @@ struct _gpukernel {
 #endif
 };
 
-GPUARRAY_LOCAL cl_ctx *cl_make_ctx(cl_context ctx, int flags);
-GPUARRAY_LOCAL cl_command_queue cl_get_stream(gpucontext *ctx);
-GPUARRAY_LOCAL gpudata *cl_make_buf(gpucontext *c, cl_mem buf);
-GPUARRAY_LOCAL cl_mem cl_get_buf(gpudata *g);
+cl_ctx *cl_make_ctx(cl_context ctx, int flags);
+cl_command_queue cl_get_stream(gpucontext *ctx);
+gpudata *cl_make_buf(gpucontext *c, cl_mem buf);
+cl_mem cl_get_buf(gpudata *g);
 
 #endif
diff --git a/src/util/strb.h b/src/util/strb.h
index 3fc1071ea4..267941417e 100644
--- a/src/util/strb.h
+++ b/src/util/strb.h
@@ -39,14 +39,14 @@ typedef struct _strb {
  *
  * Returns NULL on error.
  */
-GPUARRAY_LOCAL strb *strb_alloc(size_t s);
+strb *strb_alloc(size_t s);
 
 /*
  * Frees an strb that was dynamically allocated.
  *
  * Don't call this for stack of global declarations, see strb_clear() instead.
  */
-GPUARRAY_LOCAL void strb_free(strb *);
+void strb_free(strb *);
 
 /*
  * Return a pointer to a dynamically allocated strb with a default
@@ -96,7 +96,7 @@ static inline void strb_clear(strb *sb) {
  * This should almost never be called directly.  Use strb_ensure()
  * instead.
  */
-GPUARRAY_LOCAL int strb_grow(strb *, size_t s);
+int strb_grow(strb *, size_t s);
 
 /*
  * Make sure there is space to store at least `s` bytes of data after
@@ -159,21 +159,21 @@ static inline void strb_appendb(strb *sb, strb *sb2) {
  *
  * A format error will place the strb in error mode.
  */
-GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...);
+void strb_appendf(strb *, const char *f, ...);
 
 /*
  * Reads from the file specified by the given file descriptor.
  *
  * A read error will place the strb in error mode.
  */
-GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz);
+void strb_read(strb *, int fd, size_t sz);
 
 /*
  * Write the content of an strb to the specified file descriptor.
  *
  * Write errors will be signaled by a nonzero return value.
  */
-GPUARRAY_LOCAL int strb_write(int fd, strb *sb);
+int strb_write(int fd, strb *sb);
 
 /*
  * Returns a C string from the content of the strb.
diff --git a/src/util/xxhash.h b/src/util/xxhash.h
index c33938234d..6403b9e6f9 100644
--- a/src/util/xxhash.h
+++ b/src/util/xxhash.h
@@ -106,7 +106,7 @@ They will be automatically translated by this header.
 *  Simple Hash Functions
 *****************************/
 
-GPUARRAY_LOCAL unsigned int XXH32 (const void* input, size_t length, unsigned seed);
+unsigned int XXH32 (const void* input, size_t length, unsigned seed);
 
 /*
 XXH32() :
@@ -129,9 +129,9 @@ These structures allow static allocation of XXH states.
 States must then be initialized using XXH32_reset() before first use.
 */
 
-GPUARRAY_LOCAL XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
-GPUARRAY_LOCAL XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
-GPUARRAY_LOCAL unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
+XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned seed);
+XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+unsigned int  XXH32_digest (const XXH32_state_t* statePtr);
 
 /*
 These functions calculate the xxHash of an input provided in multiple smaller packets,

From 339ac160824a3c916614f8133b1526f39a5aead6 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 8 Dec 2016 22:04:47 +0100
Subject: [PATCH 07/41] Remove some useless stuff in xxhash.

---
 src/util/xxhash.c | 64 +----------------------------------------------
 src/util/xxhash.h | 28 ---------------------
 2 files changed, 1 insertion(+), 91 deletions(-)

diff --git a/src/util/xxhash.c b/src/util/xxhash.c
index 58101b0902..bd2447ca16 100644
--- a/src/util/xxhash.c
+++ b/src/util/xxhash.c
@@ -31,39 +31,6 @@ You can contact the author at :
 - xxHash source repository : https://github.com/Cyan4973/xxHash
 */
 
-
-/**************************************
-*  Tuning parameters
-**************************************/
-/* XXH_FORCE_MEMORY_ACCESS
- * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
- * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
- * The below switch allow to select different access method for improved performance.
- * Method 0 (default) : use `memcpy()`. Safe and portable.
- * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
- *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
- * Method 2 : direct access. This method is portable but violate C standard.
- *            It can generate buggy code on targets which generate assembly depending on alignment.
- *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
- * See http://stackoverflow.com/a/32095106/646947 for details.
- * Prefer these methods in priority order (0 > 1 > 2)
- */
-#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
-#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
-#    define XXH_FORCE_MEMORY_ACCESS 2
-#  elif defined(__INTEL_COMPILER) || \
-  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
-#    define XXH_FORCE_MEMORY_ACCESS 1
-#  endif
-#endif
-
-/* XXH_ACCEPT_NULL_INPUT_POINTER :
- * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
- * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
- * By default, this option is disabled. To enable it, uncomment below define :
- */
-/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
-
 /* XXH_FORCE_NATIVE_FORMAT :
  * By default, xxHash library provides endian-independant Hash values, based on little-endian convention.
  * Results are therefore identical for little-endian and big-endian CPU.
@@ -72,7 +39,7 @@ You can contact the author at :
  * to improve speed for Big-endian CPU.
  * This option has no impact on Little_Endian CPU.
  */
-#define XXH_FORCE_NATIVE_FORMAT 0
+#define XXH_FORCE_NATIVE_FORMAT 1
 
 /* XXH_USELESS_ALIGN_BRANCH :
  * This is a minor performance trick, only useful with lots of very small keys.
@@ -132,25 +99,6 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp
 #endif
 
 
-#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
-
-/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
-static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
-
-#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
-
-/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
-/* currently only defined for gcc and icc */
-typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
-
-static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
-
-#else
-
-/* portable and safe solution. Generally efficient.
- * see : http://stackoverflow.com/a/32095106/646947
- */
-
 static U32 XXH_read32(const void* memPtr)
 {
     U32 val;
@@ -158,8 +106,6 @@ static U32 XXH_read32(const void* memPtr)
     return val;
 }
 
-#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS
-
 
 /******************************************
 *  Compiler-specific Functions and Macros
@@ -243,14 +189,6 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH
     U32 h32;
 #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
 
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (p==NULL)
-    {
-        len=0;
-        bEnd=p=(const BYTE*)(size_t)16;
-    }
-#endif
-
     if (len>=16)
     {
         const BYTE* const limit = bEnd - 16;
diff --git a/src/util/xxhash.h b/src/util/xxhash.h
index 6403b9e6f9..1d11a095fb 100644
--- a/src/util/xxhash.h
+++ b/src/util/xxhash.h
@@ -74,34 +74,6 @@ extern "C" {
 typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 
 
-/*****************************
-*  Namespace Emulation
-*****************************/
-/* Motivations :
-
-If you need to include xxHash into your library,
-but wish to avoid xxHash symbols to be present on your library interface
-in an effort to avoid potential name collision if another library also includes xxHash,
-
-you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash
-with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values).
-
-Note that no change is required within the calling program :
-it can still call xxHash functions using their regular name.
-They will be automatically translated by this header.
-*/
-#ifdef XXH_NAMESPACE
-#  define XXH_CAT(A,B) A##B
-#  define XXH_NAME2(A,B) XXH_CAT(A,B)
-#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
-#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
-#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
-#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
-#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
-#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
-#endif
-
-
 /*****************************
 *  Simple Hash Functions
 *****************************/

From 7ca34ed68908a348d3c40d80205556d369f321e0 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 8 Dec 2016 22:46:22 +0100
Subject: [PATCH 08/41] Disk cache implementation.

---
 src/CMakeLists.txt |   1 +
 src/cache/disk.c   | 301 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 302 insertions(+)
 create mode 100644 src/cache/disk.c

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 02e32eccd4..0ed776533d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -35,6 +35,7 @@ endmacro()
 set(_GPUARRAY_SRC
 cache/lru.c
 cache/twoq.c
+cache/disk.c
 gpuarray_types.c
 gpuarray_error.c
 gpuarray_util.c
diff --git a/src/cache/disk.c b/src/cache/disk.c
new file mode 100644
index 0000000000..59e6f1ba42
--- /dev/null
+++ b/src/cache/disk.c
@@ -0,0 +1,301 @@
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+
+#include "cache.h"
+#include "private_config.h"
+#include "util/strb.h"
+#include "util/skein.h"
+
+#define HEXP_LEN (128 + 2)
+
+typedef int (*kwrite_fn)(strb *res, cache_key_t key);
+typedef int (*vwrite_fn)(strb *res, cache_value_t val);
+typedef cache_key_t (*kread_fn)(const strb *b);
+typedef cache_value_t (*vread_fn)(const strb *b);
+
+typedef struct _disk_cache {
+  cache c;
+  cache * mem;
+  /*
+   * `c` is the generic cache header; it is the first member because
+   * disk_cache pointers are cast to and from `cache *`.  `mem` is the
+   * in-memory cache that is consulted before the directory.
+   *
+   * The callbacks below convert keys and values to and from the bytes
+   * stored in the entry files.
+   */
+  kwrite_fn kwrite; /* appends the serialized key to an strb, nonzero on error */
+  vwrite_fn vwrite; /* appends the serialized value to an strb */
+  kread_fn kread;   /* rebuilds a key from its bytes, NULL is handled as a failure */
+  vread_fn vread;   /* rebuilds a value from its bytes, NULL is handled as a failure */
+  int dirfd;        /* descriptor of the cache directory, base of the *at() calls */
+} disk_cache;
+
+
+/*
+ * Decode the 8 bytes at `in` as a big-endian unsigned 64-bit value.
+ *
+ * Each byte is converted to unsigned char before being widened: plain
+ * char may be signed, in which case a byte >= 0x80 would be
+ * sign-extended and set all the higher bits of the result.
+ */
+static unsigned long long ntohull(const char *in) {
+  unsigned long long res = 0;
+  int i;
+
+  for (i = 0; i < 8; i++)
+    res = (res << 8) | (unsigned char)in[i];
+  return res;
+}
+
+/*
+ * Store `in` at `out` as 8 bytes, most significant byte first.
+ */
+static void htonull(unsigned long long in, char *out) {
+  char *p = out;
+  int shift = 56;
+
+  while (shift >= 0) {
+    *p++ = in >> shift;
+    shift -= 8;
+  }
+}
+
+/*
+ * mkstemp() relative to the directory descriptor `dfd`.
+ *
+ * `template` must end with "XXXXXX"; those six characters are
+ * overwritten in place with the generated name.
+ *
+ * Returns a read/write descriptor for the newly created file, or -1
+ * with errno set (EINVAL for a bad template, EEXIST when every attempt
+ * collided, otherwise whatever openat() reported).
+ */
+static int mkstempat(int dfd, char *template) {
+  static const char letters[] =
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
+  size_t length;
+  char *XXXXXX;
+  struct timeval tv;
+  unsigned long long randnum, working, step;
+  int i, tries, fd;
+
+  length = strlen(template);
+  if (length < 6) {
+    errno = EINVAL;
+    return -1;
+  }
+  XXXXXX = template + length - 6;
+  if (strcmp(XXXXXX, "XXXXXX") != 0) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* This is kind of crappy, but the point is to not step on each
+     other's feet */
+  gettimeofday(&tv, NULL);
+  randnum = ((unsigned long long) tv.tv_usec << 16) ^ tv.tv_sec ^ getpid();
+
+  /* The stride between attempts must not be 0 (which happens for
+     tv_usec < 1024 without the "| 1"), otherwise every retry would
+     generate the same name again. */
+  step = ((tv.tv_usec >> 10) & 0xfff) | 1;
+
+  for (tries = 0; tries < TMP_MAX; tries++) {
+    for (working = randnum, i = 0; i < 6; i++) {
+      XXXXXX[i] = letters[working % 62];
+      working /= 62;
+    }
+    fd = openat(dfd, template, O_RDWR | O_CREAT | O_EXCL, 0600);
+    /* Only a name collision is worth another attempt */
+    if (fd >= 0 || (errno != EEXIST && errno != EISDIR))
+      return fd;
+
+    randnum += step;
+  }
+  errno = EEXIST;
+  return -1;
+}
+
+/*
+ * Compute the path, relative to the cache directory, of the entry for
+ * `key`: the hex digits of the Skein_512() hash of the serialized key
+ * with a '/' inserted after the first four digits.
+ *
+ * `out` must have room for HEXP_LEN bytes (128 digits, the '/' and the
+ * terminating NUL).
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int key_path(disk_cache *c, const cache_key_t key, char *out) {
+  strb kb = STRB_STATIC_INIT;
+  unsigned char hash[64];
+  int i, res = -1;
+
+  if (c->kwrite(&kb, key)) goto done;
+  if (Skein_512((unsigned char *)kb.s, kb.l, hash)) goto done;
+  /* 4 digits + '/' + 4 digits is 9 characters; snprintf() needs one
+     more byte for the NUL and returns the count without it. */
+  if (snprintf(out, 10, "%02x%02x/%02x%02x",
+               hash[0], hash[1], hash[2], hash[3]) != 9)
+    goto done;
+  /* Because of the '/', hash byte i starts at offset 2*i + 1 */
+  for (i = 4; i < 64; i += 4) {
+    if (snprintf(out+(i * 2 + 1), 9, "%02x%02x%02x%02x",
+                 hash[i], hash[i+1], hash[i+2], hash[i+3]) != 8)
+      goto done;
+  }
+  res = 0;
+ done:
+  /* The serialized key is only needed to compute the hash */
+  strb_clear(&kb);
+  return res;
+}
+
+/*
+ * Store the (k, v) pair as a file in the cache directory.
+ *
+ * File layout: two 8-byte big-endian lengths (key, then value)
+ * followed by the serialized key and the serialized value.  The data
+ * is written to a temporary file which is then renamed to the name
+ * given by key_path(), so a partially written entry never shows up
+ * under its final name.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int write_entry(disk_cache *c, const cache_key_t k,
+                       const cache_value_t v) {
+  char hexp[HEXP_LEN];
+  char tmp_path[] = "tmp.XXXXXXXX";
+  strb b = STRB_STATIC_INIT;
+  size_t kl, vl;
+  int fd, err;
+
+  if (key_path(c, k, hexp)) return -1;
+
+  /* strb_ensure() returns nonzero on failure */
+  if (strb_ensure(&b, 16)) {
+    strb_clear(&b);
+    return -1;
+  }
+  /* Reserve the header, it is filled once the lengths are known */
+  b.l = 16;
+  c->kwrite(&b, k);
+  kl = b.l - 16;
+  c->vwrite(&b, v);
+  vl = b.l - kl - 16;
+  /* Check for errors before writing through b.s */
+  if (strb_error(&b)) {
+    strb_clear(&b);
+    return -1;
+  }
+  htonull(kl, b.s);
+  htonull(vl, b.s + 8);
+
+  /* The entry lives in a subdirectory named after the first four
+     digits of the hash; renameat() fails if it does not exist. */
+  hexp[4] = '\0';
+  err = mkdirat(c->dirfd, hexp, 0777);
+  hexp[4] = '/';
+  if (err && errno != EEXIST) {
+    strb_clear(&b);
+    return -1;
+  }
+
+  fd = mkstempat(c->dirfd, tmp_path);
+  if (fd == -1) {
+    strb_clear(&b);
+    return -1;
+  }
+
+  err = strb_write(fd, &b);
+  strb_clear(&b);
+  /* A failing close() can be the only report of a failed write */
+  if (close(fd))
+    err = -1;
+  if (err) {
+    unlinkat(c->dirfd, tmp_path, 0);
+    return -1;
+  }
+
+  if (renameat(c->dirfd, tmp_path, c->dirfd, hexp)) {
+    unlinkat(c->dirfd, tmp_path, 0);
+    return -1;
+  }
+
+  return 0;
+}
+
+/*
+ * Look for the entry of `key` in the cache directory.
+ *
+ * The file named by key_path() is read and the key it contains is
+ * compared to `key` with the keq callback, since two keys may share
+ * the same path.
+ *
+ * On a match, the key read from the file is stored in `*_k` (or freed
+ * if `_k` is NULL) and, if `_v` is not NULL, the value is read into
+ * `*_v`.
+ *
+ * Returns 1 if the entry was found and 0 in all other cases.
+ */
+static int find_entry(disk_cache *c, const cache_key_t key,
+                      cache_key_t *_k, cache_value_t *_v) {
+  struct stat st;
+  strb b = STRB_STATIC_INIT;
+  char *ts;
+  unsigned long long fkl, fvl, avail;
+  size_t kl, vl;
+  cache_key_t k = NULL;
+  char hexp[HEXP_LEN];
+  int fd;
+
+  if (key_path(c, key, hexp)) return 0;
+
+  fd = openat(c->dirfd, hexp, O_RDONLY);
+
+  if (fd == -1) return 0;
+
+  /* S_ISREG() compares the whole file type field, a plain bit test
+     against S_IFREG also accepts other types.  Anything shorter than
+     the 16-byte header cannot be an entry. */
+  if (fstat(fd, &st) || !S_ISREG(st.st_mode) || st.st_size < 16) {
+    close(fd);
+    return 0;
+  }
+
+  strb_read(&b, fd, st.st_size);
+  close(fd);
+
+  if (strb_error(&b) || b.l < 16) {
+    strb_clear(&b);
+    return 0;
+  }
+
+  fkl = ntohull(b.s);
+  fvl = ntohull(b.s + 8);
+
+  /* The lengths come from the file and can be anything: compare by
+     subtraction so that the test cannot overflow. */
+  avail = b.l - 16;
+  if (fkl > avail || fvl > avail - fkl) {
+    strb_clear(&b);
+    return 0;
+  }
+  kl = (size_t)fkl;
+  vl = (size_t)fvl;
+
+  /* b is turned into a window over the key, then the value; ts keeps
+     the start of the allocation for strb_clear(). */
+  ts = b.s;
+
+  b.s += 16;
+  b.l = kl;
+
+  k = c->kread(&b);
+  if (k == NULL || !c->c.keq(key, k))
+    goto error;
+
+  if (_v) {
+    b.s += kl;
+    b.l = vl;
+    *_v = c->vread(&b);
+    if (*_v == NULL)
+      goto error;
+  }
+  if (_k)
+    *_k = k;
+  else
+    c->c.kfree(k);
+  b.s = ts;
+  strb_clear(&b);
+  return 1;
+
+ error:
+  /* k is NULL when kread failed, don't hand that to kfree */
+  if (k != NULL)
+    c->c.kfree(k);
+  b.s = ts;
+  strb_clear(&b);
+  return 0;
+}
+
+/*
+ * "add" operation: write the pair to the directory, then add it to the
+ * in-memory cache.  The return value is the one of the in-memory add.
+ */
+static int disk_add(cache *_c, cache_key_t k, cache_value_t v) {
+  disk_cache *dc = (disk_cache *)_c;
+
+  /* Best effort, a failure to store the entry on disk is not reported */
+  (void)write_entry(dc, k, v);
+
+  return cache_add(dc->mem, k, v);
+}
+
+/*
+ * "del" operation: drop `key` from the in-memory cache and unlink its
+ * file.  Returns 1 if a file was unlinked and 0 otherwise.
+ */
+static int disk_del(cache *_c, const cache_key_t key) {
+  char path[HEXP_LEN] = {0};
+  disk_cache *dc = (disk_cache *)_c;
+  int rc;
+
+  cache_del(dc->mem, key);
+
+  /* The result is not checked: path starts out zeroed and whatever
+     key_path() left in it is handed to unlinkat(). */
+  key_path(dc, key, path);
+
+  rc = unlinkat(dc->dirfd, path, 0);
+  return rc == 0;
+}
+
+/*
+ * "get" operation: look in the in-memory cache first, then in the
+ * directory.  An entry found on disk is added to the in-memory cache
+ * before being returned; NULL is returned if that add fails or if
+ * the key is not found.
+ */
+static cache_value_t disk_get(cache *_c, const cache_key_t key) {
+  disk_cache *dc = (disk_cache *)_c;
+  cache_key_t dk;
+  cache_value_t val = cache_get(dc->mem, key);
+
+  if (val == NULL) {
+    if (!find_entry(dc, key, &dk, &val))
+      return NULL;
+    if (cache_add(dc->mem, dk, val))
+      return NULL;
+  }
+  return val;
+}
+
+/*
+ * "destroy" operation: destroy the in-memory cache and close the
+ * directory descriptor.  The disk_cache structure itself is not freed
+ * here.
+ * NOTE(review): presumably the caller of this callback releases it --
+ * confirm.
+ */
+static void disk_destroy(cache *_c) {
+  disk_cache *dc = (disk_cache *)_c;
+  cache *inner = dc->mem;
+  int dfd = dc->dirfd;
+
+  cache_destroy(inner);
+  close(dfd);
+}
+
+/*
+ * Create a cache that stores its entries as files under `dirpath`,
+ * with `mem` as the in-memory cache in front of it.
+ *
+ * The directory is created if needed.  `kwrite`/`vwrite` serialize
+ * keys and values to an strb and `kread`/`vread` rebuild them from the
+ * stored bytes.  The comparison, hash and free callbacks are copied
+ * from `mem`.
+ *
+ * `mem` is destroyed along with the returned cache.  On failure NULL
+ * is returned and `mem` is left untouched.
+ */
+cache *cache_disk(const char *dirpath, cache *mem,
+                  kwrite_fn kwrite, vwrite_fn vwrite,
+                  kread_fn kread, vread_fn vread) {
+  struct stat st;
+  disk_cache *res;
+
+  mkdir(dirpath, 0777); /* This may fail, but we don't care */
+  if (lstat(dirpath, &st) != 0)
+    return NULL;
+  /* S_ISDIR() compares the whole file type field, a plain bit test
+     against S_IFDIR also accepts other types. */
+  if (!S_ISDIR(st.st_mode))
+    return NULL;
+
+  res = calloc(1, sizeof(*res));
+  if (res == NULL) return NULL;
+
+  /* A directory can't be opened for writing (open() fails with
+     EISDIR); a read-only descriptor is all the *at() calls need. */
+  res->dirfd = open(dirpath, O_RDONLY|O_CLOEXEC);
+  if (res->dirfd == -1) {
+    free(res);
+    return NULL;
+  }
+
+  res->mem = mem;
+  res->kwrite = kwrite;
+  res->vwrite = vwrite;
+  res->kread = kread;
+  res->vread = vread;
+  res->c.add = disk_add;
+  res->c.del = disk_del;
+  res->c.get = disk_get;
+  res->c.destroy = disk_destroy;
+  res->c.keq = mem->keq;
+  res->c.khash = mem->khash;
+  res->c.kfree = mem->kfree;
+  res->c.vfree = mem->vfree;
+  return (cache *)res;
+}

From 02b97e7469d6dfd47765414c062ca507e9af6005 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Fri, 9 Dec 2016 22:07:18 +0100
Subject: [PATCH 09/41] Cleanup tool.

---
 bin/gpuarray-cache | 51 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 bin/gpuarray-cache

diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache
new file mode 100644
index 0000000000..528e3eb9a5
--- /dev/null
+++ b/bin/gpuarray-cache
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+import os
+
+def clean(max_size):
+    """Prune the cache directory down to at most `max_size` bytes.
+
+    The directory is taken from the GPUARRAY_CACHE environment variable
+    and defaults to ~/.gpuarray/cache/.  Files are kept starting from
+    the most recently accessed one until the size budget is used up;
+    everything past that point (the least recently accessed files) is
+    removed.
+    """
+    # NOTE(review): the CUDA backend reads GPUARRAY_CACHE_PATH, not
+    # GPUARRAY_CACHE -- confirm which name is intended.
+    cache_dir = os.path.expanduser(os.environ.get('GPUARRAY_CACHE',
+                                                  '~/.gpuarray/cache/'))
+    content = []
+    for root, dirs, files in os.walk(cache_dir):
+        for file in files:
+            fpath = os.path.join(root, file)
+            try:
+                st = os.stat(fpath)
+            except OSError:
+                # The file went away since the directory was listed
+                continue
+            content.append((st.st_atime, st.st_size, fpath))
+
+    # Newest first, so that what falls outside of the budget is the
+    # least recently accessed data.
+    content.sort(reverse=True)
+    cur_size = 0
+    for _, size, path in content:
+        cur_size += size
+        if cur_size > max_size:
+            os.remove(path)
+
+
+# Multiplier for each size suffix accepted by get_size() (powers of 1024).
+SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40,
+            'P': 1 << 50, 'E': 1 << 60, 'Z': 1 << 70, 'Y': 1 << 80}
+
+
+def get_size(s):
+    """Convert a size such as "4096", "1.5G" or "20 M" to a byte count.
+
+    The leading digits and dots are parsed as a float and multiplied by
+    the value of the suffix in SUFFIXES (case-insensitive); without a
+    suffix the number is a count of bytes.
+
+    Raises ValueError if the number or the suffix can't be interpreted.
+    """
+    i = 0
+    while i < len(s) and (s[i].isdigit() or s[i] == '.'):
+        i += 1
+    num = float(s[:i])
+    letter = s[i:].strip().upper()
+    if letter == "":
+        # No unit means bytes
+        mult = 1
+    elif letter in SUFFIXES:
+        mult = SUFFIXES[letter]
+    else:
+        raise ValueError("can't interpret %r" % s)
+    return int(num * mult)
+
+
+# Command line entry point: prune the cache to the size given by -s.
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility')
+    # Without a size there is nothing to do and get_size() can't parse
+    # None, so let argparse report the missing option.
+    parser.add_argument('-s', '--max_size', required=True,
+                        help='Set the maximum size for pruning')
+    args = parser.parse_args()
+
+    clean(get_size(args.max_size))
+

From d58675a195b4e752cb47a91b331cdaf13f4131dd Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Sat, 10 Dec 2016 09:06:20 +0100
Subject: [PATCH 10/41] Expose the definition of cache_disk().

---
 src/cache.h      | 10 ++++++++++
 src/cache/disk.c |  6 ------
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/cache.h b/src/cache.h
index f2059e73cc..f2e610f3dd 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -4,6 +4,7 @@
 #include <stdlib.h>
 #include <gpuarray/config.h>
 #include "private_config.h"
+#include "util/strb.h"
 
 typedef void *cache_key_t;
 typedef void *cache_value_t;
@@ -13,6 +14,11 @@ typedef uint32_t (*cache_hash_fn)(cache_key_t);
 typedef void (*cache_freek_fn)(cache_key_t);
 typedef void (*cache_freev_fn)(cache_value_t);
 
+typedef int (*kwrite_fn)(strb *res, cache_key_t key);
+typedef int (*vwrite_fn)(strb *res, cache_value_t val);
+typedef cache_key_t (*kread_fn)(const strb *b);
+typedef cache_value_t (*vread_fn)(const strb *b);
+
 typedef struct _cache cache;
 
 struct _cache {
@@ -78,6 +84,10 @@ cache *cache_twoq(size_t hot_size, size_t warm_size,
                   cache_eq_fn keq, cache_hash_fn khash,
                   cache_freek_fn kfree, cache_freev_fn vfree);
 
+cache *cache_disk(const char *dirpath, cache *mem,
+                  kwrite_fn kwrite, vwrite_fn vwrite,
+                  kread_fn kread, vread_fn vread);
+
 /* API functions */
 static inline int cache_add(cache *c, cache_key_t k, cache_value_t v) {
   return c->add(c, k, v);
diff --git a/src/cache/disk.c b/src/cache/disk.c
index 59e6f1ba42..2cf90d7c77 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -8,16 +8,10 @@
 
 #include "cache.h"
 #include "private_config.h"
-#include "util/strb.h"
 #include "util/skein.h"
 
 #define HEXP_LEN (128 + 2)
 
-typedef int (*kwrite_fn)(strb *res, cache_key_t key);
-typedef int (*vwrite_fn)(strb *res, cache_value_t val);
-typedef cache_key_t (*kread_fn)(const strb *b);
-typedef cache_value_t (*vread_fn)(const strb *b);
-
 typedef struct _disk_cache {
   cache c;
   cache * mem;

From 605d58c8d9eb740bba6c80653c09abbd842e2ba7 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 17 Jan 2017 19:27:17 -0500
Subject: [PATCH 11/41] Add the functions and setup for disk cache (nothing
 uses it yet).

---
 src/gpuarray_buffer_cuda.c | 99 +++++++++++++++++++++++++++++++++++---
 src/private_cuda.h         |  1 +
 src/util/strb.h            |  2 +-
 3 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 1c883deaa7..cf34d0288d 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -49,18 +49,72 @@ static int cuda_records(gpudata *, int, CUstream);
 static int detect_arch(const char *prefix, char *ret, CUresult *err);
 static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size);
 
-static int strb_eq(void *_k1, void *_k2) {
-  strb *k1 = (strb *)_k1;
-  strb *k2 = (strb *)_k2;
+typedef struct _kernel_key {
+  char bin_id[64];
+  strb *src;
+} kernel_key;
+
+static void key_free(cache_key_t _k) {
+  kernel_key *k = (kernel_key *)_k;
+  strb_free(k->src);
+  free(k);
+}
+
+static int strb_eq(strb *k1, strb *k2) {
   return (k1->l == k2->l &&
           memcmp(k1->s, k2->s, k1->l) == 0);
 }
 
-static uint32_t strb_hash(void *_k) {
-  strb *k = (strb *)_k;
+static uint32_t strb_hash(strb *k) {
   return XXH32(k->s, k->l, 42);
 }
 
+static int key_eq(kernel_key *k1, kernel_key *k2) {
+  return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 &&
+          strb_eq(k1->src, k2->src));
+}
+
+static int key_hash(kernel_key *k) {
+  XXH32_state_t state;
+  XXH32_reset(&state, 42);
+  XXH32_update(&state, k->bin_id, 64);
+  XXH32_update(&state, k->src->s, k->src->l);
+  return XXH32_digest(&state);
+}
+
+static int key_write(strb *res, kernel_key *k) {
+  strb_appendn(res, k->bin_id, 64);
+  strb_appendb(res, k->src);
+  return strb_error(res);
+}
+
+static kernel_key *key_read(const strb *b) {
+  kernel_key *k;
+  if (b->l < 64) return NULL;
+  k = malloc(sizeof(*k));
+  if (k == NULL) return NULL;
+  k->src = strb_alloc(b->l - 64);
+  if (k->src == NULL) {
+    free(k);
+    return NULL;
+  }
+  memcpy(k->bin_id, b->s, 64);
+  strb_appendn(k->src, b->s+64, b->l-64);
+  return k;
+}
+
+static int kernel_write(strb *res, strb *bin) {
+  strb_appendb(res, bin);
+  return strb_error(res);
+}
+
+static strb *kernel_read(const strb *b) {
+  strb *res = strb_alloc(b->l);
+  if (res != NULL)
+    strb_appendb(res, b);
+  return res;
+}
+
 static int setup_done = 0;
 static int major = -1;
 static int minor = -1;
@@ -114,6 +168,8 @@ static int cuda_get_device_count(unsigned int platform,
 
 cuda_context *cuda_make_ctx(CUcontext ctx, int flags) {
   cuda_context *res;
+  cache *mem_cache;
+  char *cache_path;
   void *p;
   int e;
 
@@ -152,11 +208,38 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) {
       goto fail_mem_stream;
     }
   }
-  res->kernel_cache = cache_twoq(64, 128, 64, 8, strb_eq, strb_hash,
+
+  res->kernel_cache = cache_twoq(64, 128, 64, 8,
+                                 (cache_eq_fn)strb_eq,
+                                 (cache_hash_fn)strb_hash,
                                  (cache_freek_fn)strb_free,
                                  (cache_freev_fn)cuda_freekernel);
   if (res->kernel_cache == NULL)
     goto fail_cache;
+
+  cache_path = getenv("GPUARRAY_CACHE_PATH");
+  if (cache_path != NULL) {
+    mem_cache = cache_lru(64, 8,
+                          (cache_eq_fn)key_eq,
+                          (cache_hash_fn)key_hash,
+                          (cache_freek_fn)key_free,
+                          (cache_freev_fn)strb_free);
+    if (mem_cache == NULL)
+      goto fail_disk_cache;
+    res->disk_cache = cache_disk(cache_path, mem_cache,
+                                 (kwrite_fn)key_write,
+                                 (vwrite_fn)kernel_write,
+                                 (kread_fn)key_read,
+                                 (vread_fn)kernel_read);
+    if (res->disk_cache == NULL) {
+      cache_destroy(mem_cache);
+      goto fail_disk_cache;
+    }
+  } else {
+  fail_disk_cache:
+    res->disk_cache = NULL;
+  }
+
   err = cuMemAllocHost(&p, 16);
   if (err != CUDA_SUCCESS) {
     goto fail_errbuf;
@@ -174,6 +257,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) {
  fail_end:
   cuMemFreeHost(p);
  fail_errbuf:
+  if (res->disk_cache)
+    cache_destroy(res->disk_cache);
   cache_destroy(res->kernel_cache);
  fail_cache:
   if (ISCLR(res->flags, GA_CTX_SINGLE_STREAM))
@@ -215,6 +300,8 @@ static void cuda_free_ctx(cuda_context *ctx) {
       deallocate(curr);
     }
     cache_destroy(ctx->kernel_cache);
+    if (ctx->disk_cache)
+      cache_destroy(ctx->disk_cache);
 
     if (!(ctx->flags & DONTFREE)) {
       cuCtxPushCurrent(ctx->ctx);
diff --git a/src/private_cuda.h b/src/private_cuda.h
index da6f60ad7a..ad9ff7f8ae 100644
--- a/src/private_cuda.h
+++ b/src/private_cuda.h
@@ -68,6 +68,7 @@ typedef struct _cuda_context {
   CUstream mem_s;
   gpudata *freeblocks;
   cache *kernel_cache;
+  cache *disk_cache;
   unsigned int enter;
   unsigned char major;
   unsigned char minor;
diff --git a/src/util/strb.h b/src/util/strb.h
index 267941417e..01ea7a2495 100644
--- a/src/util/strb.h
+++ b/src/util/strb.h
@@ -146,7 +146,7 @@ static inline void strb_appends(strb *sb, const char *s) {
 /*
  * Appends the content of another strb.
  */
-static inline void strb_appendb(strb *sb, strb *sb2) {
+static inline void strb_appendb(strb *sb, const strb *sb2) {
   strb_appendn(sb, sb2->s, sb2->l);
 }
 

From 6f1fd645a3c1197c3b5cfc3849ed0b5a695596b4 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 18 Jan 2017 19:49:00 -0500
Subject: [PATCH 12/41] Rework the compile logic and integrate the disk cache
 into the mix.

This removes the gpukernel_binary support also, because it became
burdensome and redundant.  If the kernels are already cached at the
library level, there is no need for applications to do the same.
---
 src/gpuarray/buffer.h        |  17 +-
 src/gpuarray/error.h         |   1 +
 src/gpuarray/kernel.h        |   1 +
 src/gpuarray_buffer.c        |   2 +-
 src/gpuarray_buffer_cuda.c   | 375 +++++++++++++++++------------------
 src/gpuarray_buffer_opencl.c |  29 ---
 src/gpuarray_error.c         |   1 +
 src/loaders/libcuda.fn       |   4 +
 src/loaders/libcuda.h        |  11 +
 src/private.h                |   1 -
 src/private_cuda.h           |   2 -
 11 files changed, 205 insertions(+), 239 deletions(-)

diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h
index 800756a072..34878d503a 100644
--- a/src/gpuarray/buffer.h
+++ b/src/gpuarray/buffer.h
@@ -494,22 +494,9 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
                                    size_t shared, void **args);
 
 /**
- * (Deprecated) Get the kernel binary.
+ * Get the kernel binary (REMOVED).
  *
- * This function is deprecated and will be removed in the next release.
- *
- * This can be use to cache kernel binaries after compilation of a
- * specific device.  The kernel can be recreated by calling
- * kernel_alloc with the binary and size and passing `GA_USE_BINARY`
- * as the use flags.
- *
- * The returned pointer is allocated and must be freed by the caller.
- *
- * \param k kernel
- * \param sz size of the returned binary
- * \param obj pointer to the binary for the kernel.
- *
- * \returns GA_NO_ERROR or an error code if an error occurred.
+ * Always returns GA_DEPRECATED_ERROR.
  */
 GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj);
 
diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h
index af963c1531..84c852a257 100644
--- a/src/gpuarray/error.h
+++ b/src/gpuarray/error.h
@@ -36,6 +36,7 @@ enum ga_error {
   GA_COMM_ERROR,
   GA_XLARGE_ERROR,
   GA_LOAD_ERROR,
+  GA_DEPRECATED_ERROR,
   /* Add more error types if needed, but at the end */
   /* Don't forget to sync with Gpu_error() */
 };
diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h
index f88d74ffc6..da779123b9 100644
--- a/src/gpuarray/kernel.h
+++ b/src/gpuarray/kernel.h
@@ -107,6 +107,7 @@ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
                                    const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
 
+/* Deprecated and to be removed */
 GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
                                     void **obj);
 
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c
index a4dfd3329b..d3226ea94e 100644
--- a/src/gpuarray_buffer.c
+++ b/src/gpuarray_buffer.c
@@ -187,7 +187,7 @@ int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
 }
 
 int gpukernel_binary(gpukernel *k, size_t *sz, void **obj) {
-  return ((partial_gpukernel *)k)->ctx->ops->kernel_binary(k, sz, obj);
+  return GA_DEPRECATED_ERROR;
 }
 
 int gpukernel_property(gpukernel *k, int prop_id, void *res) {
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index cf34d0288d..3820dff023 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -51,12 +51,12 @@ static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size);
 
 typedef struct _kernel_key {
   char bin_id[64];
-  strb *src;
+  strb src;
 } kernel_key;
 
 static void key_free(cache_key_t _k) {
   kernel_key *k = (kernel_key *)_k;
-  strb_free(k->src);
+  strb_clear(&k->src);
   free(k);
 }
 
@@ -71,35 +71,35 @@ static uint32_t strb_hash(strb *k) {
 
 static int key_eq(kernel_key *k1, kernel_key *k2) {
   return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 &&
-          strb_eq(k1->src, k2->src));
+          strb_eq(&k1->src, &k2->src));
 }
 
 static int key_hash(kernel_key *k) {
   XXH32_state_t state;
   XXH32_reset(&state, 42);
   XXH32_update(&state, k->bin_id, 64);
-  XXH32_update(&state, k->src->s, k->src->l);
+  XXH32_update(&state, k->src.s, k->src.l);
   return XXH32_digest(&state);
 }
 
 static int key_write(strb *res, kernel_key *k) {
   strb_appendn(res, k->bin_id, 64);
-  strb_appendb(res, k->src);
+  strb_appendb(res, &k->src);
   return strb_error(res);
 }
 
 static kernel_key *key_read(const strb *b) {
   kernel_key *k;
   if (b->l < 64) return NULL;
-  k = malloc(sizeof(*k));
+  k = calloc(1, sizeof(*k));
   if (k == NULL) return NULL;
-  k->src = strb_alloc(b->l - 64);
-  if (k->src == NULL) {
+  if (strb_ensure(&k->src, b->l - 64) != 0) {
+    strb_clear(&k->src);
     free(k);
     return NULL;
   }
   memcpy(k->bin_id, b->s, 64);
-  strb_appendn(k->src, b->s+64, b->l-64);
+  strb_appendn(&k->src, b->s+64, b->l-64);
   return k;
 }
 
@@ -1000,22 +1000,22 @@ static int detect_arch(const char *prefix, char *ret, CUresult *err) {
   return GA_NO_ERROR;
 }
 
-static void *call_compiler(const char *src, size_t len, const char *arch_arg,
-                           size_t *bin_len, char **log, size_t *log_len,
-                           int *ret) {
+static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   nvrtcProgram prog;
-  void *buf = NULL;
   size_t buflen;
   const char *opts[4] = {
     "-arch", ""
     , "-G", "-lineinfo"
   };
-  nvrtcResult err, err2;
+  nvrtcResult err;
 
-  opts[1] = arch_arg;
+  opts[1] = ctx->bin_id;
 
-  err = nvrtcCreateProgram(&prog, src, NULL, 0, NULL, NULL);
-  if (err != NVRTC_SUCCESS) FAIL(NULL, GA_SYS_ERROR);
+  strb_append0(src);
+  if (strb_error(src))
+    return GA_MEMORY_ERROR;
+  err = nvrtcCreateProgram(&prog, src->s, NULL, 0, NULL, NULL);
+  if (err != NVRTC_SUCCESS) return GA_SYS_ERROR;
 
   err = nvrtcCompileProgram(prog,
 #ifdef DEBUG
@@ -1024,41 +1024,115 @@ static void *call_compiler(const char *src, size_t len, const char *arch_arg,
                             2,
 #endif
                             opts);
-  if (log != NULL) {
-    err2 = nvrtcGetProgramLogSize(prog, &buflen);
-    if (err2 != NVRTC_SUCCESS) goto end2;
-    buf = malloc(buflen);
-    if (buf == NULL) goto end2;
-    err2 = nvrtcGetProgramLog(prog, (char *)buf);
-    if (err2 != NVRTC_SUCCESS) goto end2;
-    if (log_len != NULL) *log_len = buflen;
-    *log = (char *)buf;
-    buf = NULL;
+  if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) {
+    strb_appends(log, "NVRTC compile log::\n");
+    if (strb_ensure(log, buflen) == 0)
+      if (nvrtcGetProgramLog(prog, log->s+log->l) == NVRTC_SUCCESS)
+        log->l += buflen - 1;
+    strb_appendc(log, '\n');
   }
-end2:
-  if (err != NVRTC_SUCCESS) goto end;
 
   err = nvrtcGetPTXSize(prog, &buflen);
   if (err != NVRTC_SUCCESS) goto end;
 
-  buf = malloc(buflen);
-  if (buf == NULL) {
-    nvrtcDestroyProgram(&prog);
-    FAIL(NULL, GA_MEMORY_ERROR);
+  if (strb_ensure(ptx, buflen) == 0)
+    err = nvrtcGetPTX(prog, ptx->s+ptx->l);
+
+end:
+  nvrtcDestroyProgram(&prog);
+  if (err != NVRTC_SUCCESS)
+    return GA_SYS_ERROR;
+  return GA_NO_ERROR;
+}
+
+static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) {
+  char info_log[2048];
+  char error_log[2048];
+  void *out;
+  size_t out_size;
+  CUlinkState st;
+  CUjit_option cujit_opts[] = {
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+    CU_JIT_INFO_LOG_BUFFER,
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+    CU_JIT_ERROR_LOG_BUFFER,
+    CU_JIT_LOG_VERBOSE,
+    CU_JIT_GENERATE_DEBUG_INFO,
+    CU_JIT_GENERATE_LINE_INFO,
+  };
+  void *cujit_opt_vals[] = {
+    (void *)sizeof(info_log), info_log,
+    (void *)sizeof(error_log), error_log,
+#ifdef DEBUG
+    (void *)1, (void *)1, (void *)1
+#else
+    (void *)0, (void *)0, (void *)0
+#endif
+  };
+
+  ctx->err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]),
+                          cujit_opts, cujit_opt_vals, &st);
+  if (ctx->err != CUDA_SUCCESS)
+    return GA_IMPL_ERROR;
+  ctx->err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l,
+                           "kernel code", 0, NULL, NULL);
+  if (ctx->err != CUDA_SUCCESS) {
+    cuLinkDestroy(st);
+    return GA_IMPL_ERROR;
+  }
+  ctx->err = cuLinkComplete(st, &out, &out_size);
+  if (ctx->err != CUDA_SUCCESS) {
+    cuLinkDestroy(st);
+    return GA_IMPL_ERROR;
   }
+  strb_appendn(bin, out, out_size);
+  cuLinkDestroy(st);
+  strb_appends(log, "Link info log::\n");
+  strb_appends(log, info_log);
+  strb_appends(log, "\nLink error log::\n");
+  strb_appends(log, error_log);
+  strb_appendc(log, '\n');
+  return GA_NO_ERROR;
+}
 
-  err = nvrtcGetPTX(prog, (char *)buf);
-  if (err != NVRTC_SUCCESS) goto end;
+static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) {
+  strb ptx = STRB_STATIC_INIT;
+  strb *cbin;
+  kernel_key k;
+  kernel_key *pk;
+  int err;
 
-  *bin_len = buflen;
+  memcpy(k.bin_id, ctx->bin_id, 64);
+  memcpy(&k.src, src, sizeof(strb));
 
-end:
-  nvrtcDestroyProgram(&prog);
-  if (err != NVRTC_SUCCESS) {
-    free(buf);
-    FAIL(NULL, GA_SYS_ERROR);
+  // Look up the binary in the disk cache
+  cbin = cache_get(ctx->disk_cache, &k);
+  if (cbin != NULL) {
+    strb_appendb(bin, cbin);
+    return GA_NO_ERROR;
+  }
+
+  err = call_compiler(ctx, src, &ptx, log);
+  if (err != GA_NO_ERROR) return err;
+  err = make_bin(ctx, &ptx, bin, log);
+  if (err != GA_NO_ERROR) return err;
+  pk = memdup(&k, sizeof(k));
+  if (pk == NULL)
+    return err;
+  cbin = strb_alloc(bin->l);
+  if (cbin == NULL) {
+    free(pk);
+    return err;
   }
-  return buf;
+  strb_appendb(cbin, bin);
+  if (strb_error(cbin)) {
+    free(pk);
+    strb_free(cbin);
+    return err;
+  }
+  cache_add(ctx->disk_cache, pk, cbin);
+
+  return err;
 }
 
 static void _cuda_freekernel(gpukernel *k) {
@@ -1072,7 +1146,6 @@ static void _cuda_freekernel(gpukernel *k) {
     }
     CLEAR(k);
     free(k->args);
-    free(k->bin);
     free(k->types);
     free(k);
   }
@@ -1084,45 +1157,21 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
                                  const int *types, int flags, int *ret,
                                  char **err_str) {
     cuda_context *ctx = (cuda_context *)c;
-    strb sb = STRB_STATIC_INIT;
-    strb *psb;
-    char *bin, *log = NULL;
+    strb src = STRB_STATIC_INIT;
+    strb bin = STRB_STATIC_INIT;
+    strb log = STRB_STATIC_INIT;
+    strb *psrc;
     gpukernel *res;
-    size_t bin_len = 0, log_len = 0;
     CUdevice dev;
     unsigned int i;
     int major, minor;
-    strb debug_msg = STRB_STATIC_INIT;
-
-    // options for cuModuleLoadDataEx
-    const size_t cujit_log_size = 4096;
-    char *cujit_info_log = NULL;
-    unsigned int num_cujit_opts = 4;
-    CUjit_option cujit_opts[] = {
-        CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
-        CU_JIT_INFO_LOG_BUFFER,
-        CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
-        CU_JIT_ERROR_LOG_BUFFER
-    };
-    void *cujit_opt_vals[] = {
-        (void*)(size_t)cujit_log_size, NULL,
-        (void*)(size_t)cujit_log_size, NULL,
-    };
+    int err;
 
     if (count == 0) FAIL(NULL, GA_VALUE_ERROR);
 
     if (flags & GA_USE_OPENCL)
       FAIL(NULL, GA_DEVSUP_ERROR);
 
-    if (flags & GA_USE_BINARY) {
-      // GA_USE_BINARY is exclusive
-      if (flags & ~GA_USE_BINARY)
-        FAIL(NULL, GA_INVALID_ERROR);
-      // We need the length for binary data and there is only one blob.
-      if (count != 1 || lengths == NULL || lengths[0] == 0)
-        FAIL(NULL, GA_VALUE_ERROR);
-    }
-
     cuda_enter(ctx);
 
     ctx->err = cuCtxGetDevice(&dev);
@@ -1138,6 +1187,7 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
 
     // GA_USE_CLUDA is done later
     // GA_USE_SMALL will always work
+    // GA_USE_HALF should always work
     if (flags & GA_USE_DOUBLE) {
       if (major < 1 || (major == 1 && minor < 3)) {
         cuda_exit(ctx);
@@ -1149,90 +1199,73 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
       cuda_exit(ctx);
       FAIL(NULL, GA_DEVSUP_ERROR);
     }
-    // GA_USE_HALF should always work
 
-    if (flags & GA_USE_BINARY) {
-      bin = memdup(strings[0], lengths[0]);
-      bin_len = lengths[0];
-      if (bin == NULL) {
-        cuda_exit(ctx);
-        FAIL(NULL, GA_MEMORY_ERROR);
-      }
+    if (flags & GA_USE_CLUDA) {
+      strb_appends(&src, CUDA_PREAMBLE);
+    }
+
+    if (lengths == NULL) {
+      for (i = 0; i < count; i++)
+        strb_appends(&src, strings[i]);
     } else {
-      if (flags & GA_USE_CLUDA) {
-        strb_appends(&sb, CUDA_PREAMBLE);
+      for (i = 0; i < count; i++) {
+        if (lengths[i] == 0)
+          strb_appends(&src, strings[i]);
+        else
+          strb_appendn(&src, strings[i], lengths[i]);
       }
+    }
 
-      if (lengths == NULL) {
-        for (i = 0; i < count; i++)
-        strb_appends(&sb, strings[i]);
-      } else {
-        for (i = 0; i < count; i++) {
-          if (lengths[i] == 0)
-            strb_appends(&sb, strings[i]);
-          else
-            strb_appendn(&sb, strings[i], lengths[i]);
-        }
-      }
+    strb_append0(&src);
 
-      strb_append0(&sb);
+    if (strb_error(&src)) {
+      strb_clear(&src);
+      cuda_exit(ctx);
+      FAIL(NULL, GA_MEMORY_ERROR);
+    }
 
-      if (strb_error(&sb)) {
-        strb_clear(&sb);
-        cuda_exit(ctx);
-        FAIL(NULL, GA_MEMORY_ERROR);
-      }
+    res = (gpukernel *)cache_get(ctx->kernel_cache, &src);
+    if (res != NULL) {
+      res->refcnt++;
+      strb_clear(&src);
+      return res;
+    }
 
-      res = (gpukernel *)cache_get(ctx->kernel_cache, &sb);
-      if (res != NULL) {
-        res->refcnt++;
-        strb_clear(&sb);
-        return res;
-      }
-      bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len,
-                          &log, &log_len, ret);
-      if (bin == NULL) {
-        if (err_str != NULL) {
-
-          // We're substituting debug_msg for a string with this first line:
-          strb_appends(&debug_msg, "CUDA kernel compile failure ::\n");
-
-          /* Delete the final NUL */
-          sb.l--;
-          gpukernel_source_with_line_numbers(1, (const char **)&sb.s,
-                                             &sb.l, &debug_msg);
-
-          if (log != NULL) {
-            strb_appends(&debug_msg, "\nCompiler log:\n");
-            strb_appendn(&debug_msg, log, log_len);
-            free(log);
-          }
-          *err_str = strb_cstr(&debug_msg);
-          // *err_str will be free()d by the caller (see docs in kernel.h)
-        }
-        strb_clear(&sb);
-        cuda_exit(ctx);
-        FAIL(NULL, GA_IMPL_ERROR);
+    err = compile(ctx, &src, &bin, &log);
+    if (err != GA_NO_ERROR || strb_error(&bin)) {
+      if (err_str != NULL) {
+        strb debug_msg = STRB_STATIC_INIT;
+        strb_appends(&debug_msg, "CUDA kernel compile failure ::\n");
+        src.l--;
+        gpukernel_source_with_line_numbers(1, (const char **)&src.s,
+                                           &src.l, &debug_msg);
+        strb_appends(&debug_msg, "\nCompile log:\n");
+        strb_appendb(&debug_msg, &log);
+        *err_str = strb_cstr(&debug_msg);
       }
+      strb_clear(&src);
+      strb_clear(&bin);
+      strb_clear(&log);
+      cuda_exit(ctx);
+      FAIL(NULL, err);
     }
+    strb_clear(&log);
 
     res = calloc(1, sizeof(*res));
     if (res == NULL) {
-      free(bin);
-      strb_clear(&sb);
+      strb_clear(&src);
+      strb_clear(&bin);
       cuda_exit(ctx);
       FAIL(NULL, GA_SYS_ERROR);
     }
 
-    res->bin_sz = bin_len;
-    res->bin = bin;
-
     res->refcnt = 1;
     res->argcount = argcount;
     res->types = calloc(argcount, sizeof(int));
     if (res->types == NULL) {
       _cuda_freekernel(res);
-      strb_clear(&sb);
+      strb_clear(&src);
+      strb_clear(&bin);
       cuda_exit(ctx);
       FAIL(NULL, GA_MEMORY_ERROR);
     }
@@ -1240,55 +1273,26 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
     res->args = calloc(argcount, sizeof(void *));
     if (res->args == NULL) {
       _cuda_freekernel(res);
-      strb_clear(&sb);
+      strb_clear(&src);
+      strb_clear(&bin);
       cuda_exit(ctx);
       FAIL(NULL, GA_MEMORY_ERROR);
     }
 
-    // for both info/err log
-    cujit_info_log = (char*)malloc(2*cujit_log_size*sizeof(char));
-    if(cujit_info_log == NULL) {
-      _cuda_freekernel(res);
-      strb_clear(&sb);
-      cuda_exit(ctx);
-      FAIL(NULL, GA_MEMORY_ERROR);
-    }
-    cujit_info_log[0] = 0;
-    cujit_info_log[cujit_log_size] = 0;
-    cujit_opt_vals[1] = (void*)cujit_info_log;
-    cujit_opt_vals[3] = (void*)(cujit_info_log+cujit_log_size);
-
-    ctx->err = cuModuleLoadDataEx(
-            &res->m, bin,
-            num_cujit_opts, cujit_opts, (void**)cujit_opt_vals);
-
+    ctx->err = cuModuleLoadData(&res->m, bin.s);
     if (ctx->err != CUDA_SUCCESS) {
-      if (err_str != NULL) {
-        strb_appends(&debug_msg, "CUDA kernel link failure::\n");
-        if (cujit_info_log[0]) {
-          strb_appends(&debug_msg, "\nLinker msg:\n");
-          strb_appends(&debug_msg, cujit_info_log);
-        }
-        if (cujit_info_log[cujit_log_size]) {
-          strb_appends(&debug_msg, "\nLinker error log:\n");
-          strb_appends(&debug_msg, cujit_info_log+cujit_log_size);
-        }
-        strb_append0(&debug_msg);
-        *err_str = strb_cstr(&debug_msg);
-      }
-      free(cujit_info_log);
       _cuda_freekernel(res);
-      strb_clear(&sb);
+      strb_clear(&src);
+      strb_clear(&bin);
       cuda_exit(ctx);
       FAIL(NULL, GA_IMPL_ERROR);
     }
-
-    free(cujit_info_log);
+    strb_clear(&bin);
 
     ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
     if (ctx->err != CUDA_SUCCESS) {
       _cuda_freekernel(res);
-      strb_clear(&sb);
+      strb_clear(&src);
       cuda_exit(ctx);
       FAIL(NULL, GA_IMPL_ERROR);
     }
@@ -1297,16 +1301,16 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
     ctx->refcnt++;
     cuda_exit(ctx);
     TAG_KER(res);
-    psb = memdup(&sb, sizeof(strb));
-    if (psb == NULL) {
-      cuda_freekernel(res);
-      strb_clear(&sb);
-      FAIL(NULL, GA_MEMORY_ERROR);
+    psrc = memdup(&src, sizeof(strb));
+    if (psrc != NULL) {
+      /* One of the refs is for the cache */
+      res->refcnt++;
+      /* If this fails, it will free the key and remove a ref from the
+         kernel. */
+      cache_add(ctx->kernel_cache, psrc, res);
+    } else {
+      strb_clear(&src);
     }
-    /* One of the refs is for the cache */
-    res->refcnt++;
-    /* If this fails, it will free the key and remove a ref from the kernel. */
-    cache_add(ctx->kernel_cache, psb, res);
     return res;
 }
 
@@ -1381,16 +1385,6 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
     return GA_NO_ERROR;
 }
 
-static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) {
-  void *res = malloc(k->bin_sz);
-  if (res == NULL)
-    return GA_MEMORY_ERROR;
-  memcpy(res, k->bin, k->bin_sz);
-  *sz = k->bin_sz;
-  *obj = res;
-  return GA_NO_ERROR;
-}
-
 static int cuda_sync(gpudata *b) {
   cuda_context *ctx = (cuda_context *)b->ctx;
   int err = GA_NO_ERROR;
@@ -1793,7 +1787,6 @@ const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count,
                                       cuda_freekernel,
                                       cuda_kernelsetarg,
                                       cuda_callkernel,
-                                      cuda_kernelbin,
                                       cuda_sync,
                                       cuda_transfer,
                                       cuda_property,
diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c
index 84bcb6584d..89b56f80a7 100644
--- a/src/gpuarray_buffer_opencl.c
+++ b/src/gpuarray_buffer_opencl.c
@@ -1076,34 +1076,6 @@ static int cl_callkernel(gpukernel *k, unsigned int n,
   return GA_NO_ERROR;
 }
 
-static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) {
-  cl_ctx *ctx = k->ctx;
-  cl_program p;
-  size_t rsz;
-  void *res;
-
-  ASSERT_KER(k);
-  ASSERT_CTX(ctx);
-
-  ctx->err = clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL);
-  if (ctx->err != CL_SUCCESS)
-    return GA_IMPL_ERROR;
-  ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL);
-  if (ctx->err != CL_SUCCESS)
-    return GA_IMPL_ERROR;
-  res = malloc(rsz);
-  if (res == NULL)
-    return GA_MEMORY_ERROR;
-  ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL);
-  if (ctx->err != CL_SUCCESS) {
-    free(res);
-    return GA_IMPL_ERROR;
-  }
-  *sz = rsz;
-  *obj = res;
-  return GA_NO_ERROR;
-}
-
 static int cl_sync(gpudata *b) {
   cl_ctx *ctx = (cl_ctx *)b->ctx;
 
@@ -1465,7 +1437,6 @@ const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count,
                                         cl_releasekernel,
                                         cl_setkernelarg,
                                         cl_callkernel,
-                                        cl_kernelbin,
                                         cl_sync,
                                         cl_transfer,
                                         cl_property,
diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c
index b7d5011f5b..ddebd3e9dc 100644
--- a/src/gpuarray_error.c
+++ b/src/gpuarray_error.c
@@ -25,6 +25,7 @@ const char *gpuarray_error_str(int err) {
   case GA_COMM_ERROR:        return "Error in collectives call";
   case GA_XLARGE_ERROR:      return "Input size too large for operation";
   case GA_LOAD_ERROR:        return "Error loading library";
+  case GA_DEPRECATED_ERROR:  return "Deprecated (removed) functionality";
   default: return "Unknown GA error";
   }
 }
diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn
index 487706f4f3..5bfc890fba 100644
--- a/src/loaders/libcuda.fn
+++ b/src/loaders/libcuda.fn
@@ -17,6 +17,10 @@ DEF_PROC(cuCtxGetDevice, (CUdevice *device));
 DEF_PROC_V2(cuCtxPushCurrent, (CUcontext ctx));
 DEF_PROC_V2(cuCtxPopCurrent, (CUcontext *pctx));
 
+DEF_PROC(cuLinkCreate, (unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut));
+DEF_PROC(cuLinkAddData, (CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues));
+DEF_PROC(cuLinkComplete, (CUlinkState state, void **cubinOut, size_t *sizeOut));
+DEF_PROC(cuLinkDestroy, (CUlinkState state));
 DEF_PROC(cuModuleLoadData, (CUmodule *module, const void *image));
 DEF_PROC(cuModuleLoadDataEx, (CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues));
 DEF_PROC(cuModuleUnload, (CUmodule hmod));
diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h
index e62f8b85df..3a6bf35a8a 100644
--- a/src/loaders/libcuda.h
+++ b/src/loaders/libcuda.h
@@ -23,6 +23,7 @@ typedef struct CUmod_st *CUmodule;
 typedef struct CUfunc_st *CUfunction;
 typedef struct CUevent_st *CUevent;
 typedef struct CUstream_st *CUstream;
+typedef struct CUlinkState_st *CUlinkState;
 
 typedef enum CUdevice_attribute_enum CUdevice_attribute;
 typedef enum CUfunction_attribute_enum CUfunction_attribute;
@@ -30,6 +31,7 @@ typedef enum CUevent_flags_enum CUevent_flags;
 typedef enum CUctx_flags_enum CUctx_flags;
 typedef enum CUipcMem_flags_enum CUipcMem_flags;
 typedef enum CUjit_option_enum CUjit_option;
+typedef enum CUjitInputType_enum CUjitInputType;
 
 #define CU_IPC_HANDLE_SIZE 64
 
@@ -206,4 +208,13 @@ enum CUjit_option_enum {
     CU_JIT_NUM_OPTIONS
 };
 
+enum CUjitInputType_enum {
+    CU_JIT_INPUT_CUBIN = 0,
+    CU_JIT_INPUT_PTX,
+    CU_JIT_INPUT_FATBINARY,
+    CU_JIT_INPUT_OBJECT,
+    CU_JIT_INPUT_LIBRARY,
+    CU_JIT_NUM_INPUT_TYPES
+};
+
 #endif
diff --git a/src/private.h b/src/private.h
index 7405cf8995..abe9783de7 100644
--- a/src/private.h
+++ b/src/private.h
@@ -100,7 +100,6 @@ struct _gpuarray_buffer_ops {
                      const size_t *gs, const size_t *ls,
                      size_t shared, void **args);
 
-  int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);
   int (*buffer_sync)(gpudata *b);
   int (*buffer_transfer)(gpudata *dst, size_t dstoff,
                          gpudata *src, size_t srcoff, size_t sz);
diff --git a/src/private_cuda.h b/src/private_cuda.h
index ad9ff7f8ae..a0b4557977 100644
--- a/src/private_cuda.h
+++ b/src/private_cuda.h
@@ -137,8 +137,6 @@ struct _gpukernel {
   CUmodule m;
   CUfunction k;
   void **args;
-  size_t bin_sz;
-  void *bin;
   int *types;
   unsigned int argcount;
   unsigned int refcnt;

From 0cd407243475e4a13821c07ea216136fce270256 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 25 Jan 2017 15:54:24 -0500
Subject: [PATCH 13/41] Fix some problems with the disk cache.

---
 src/gpuarray_buffer_cuda.c | 61 +++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 3820dff023..7b1f540c26 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -1011,9 +1011,6 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
 
   opts[1] = ctx->bin_id;
 
-  strb_append0(src);
-  if (strb_error(src))
-    return GA_MEMORY_ERROR;
   err = nvrtcCreateProgram(&prog, src->s, NULL, 0, NULL, NULL);
   if (err != NVRTC_SUCCESS) return GA_SYS_ERROR;
 
@@ -1035,8 +1032,10 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   err = nvrtcGetPTXSize(prog, &buflen);
   if (err != NVRTC_SUCCESS) goto end;
 
-  if (strb_ensure(ptx, buflen) == 0)
+  if (strb_ensure(ptx, buflen) == 0) {
     err = nvrtcGetPTX(prog, ptx->s+ptx->l);
+    if (err == NVRTC_SUCCESS) ptx->l = buflen;
+  }
 
 end:
   nvrtcDestroyProgram(&prog);
@@ -1069,6 +1068,7 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) {
     (void *)0, (void *)0, (void *)0
 #endif
   };
+  int err = GA_NO_ERROR;
 
   ctx->err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]),
                           cujit_opts, cujit_opt_vals, &st);
@@ -1077,22 +1077,23 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) {
   ctx->err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l,
                            "kernel code", 0, NULL, NULL);
   if (ctx->err != CUDA_SUCCESS) {
-    cuLinkDestroy(st);
-    return GA_IMPL_ERROR;
+    err = GA_IMPL_ERROR;
+    goto out;
   }
   ctx->err = cuLinkComplete(st, &out, &out_size);
   if (ctx->err != CUDA_SUCCESS) {
-    cuLinkDestroy(st);
-    return GA_IMPL_ERROR;
+    err = GA_IMPL_ERROR;
+    goto out;
   }
   strb_appendn(bin, out, out_size);
+out:
   cuLinkDestroy(st);
   strb_appends(log, "Link info log::\n");
   strb_appends(log, info_log);
   strb_appends(log, "\nLink error log::\n");
   strb_appends(log, error_log);
   strb_appendc(log, '\n');
-  return GA_NO_ERROR;
+  return err;
 }
 
 static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) {
@@ -1106,33 +1107,37 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) {
   memcpy(&k.src, src, sizeof(strb));
 
   // Look up the binary in the disk cache
-  cbin = cache_get(ctx->disk_cache, &k);
-  if (cbin != NULL) {
-    strb_appendb(bin, cbin);
-    return GA_NO_ERROR;
+  if (ctx->disk_cache) {
+    cbin = cache_get(ctx->disk_cache, &k);
+    if (cbin != NULL) {
+      strb_appendb(bin, cbin);
+      return GA_NO_ERROR;
+    }
   }
 
   err = call_compiler(ctx, src, &ptx, log);
   if (err != GA_NO_ERROR) return err;
   err = make_bin(ctx, &ptx, bin, log);
   if (err != GA_NO_ERROR) return err;
-  pk = memdup(&k, sizeof(k));
-  if (pk == NULL)
-    return err;
-  cbin = strb_alloc(bin->l);
-  if (cbin == NULL) {
-    free(pk);
-    return err;
-  }
-  strb_appendb(cbin, bin);
-  if (strb_error(cbin)) {
-    free(pk);
-    strb_free(cbin);
-    return err;
+  if (ctx->disk_cache) {
+    pk = memdup(&k, sizeof(k));
+    if (pk == NULL)
+      return GA_NO_ERROR;
+    cbin = strb_alloc(bin->l);
+    if (cbin == NULL) {
+      free(pk);
+      return GA_NO_ERROR;
+    }
+    strb_appendb(cbin, bin);
+    if (strb_error(cbin)) {
+      free(pk);
+      strb_free(cbin);
+      return GA_NO_ERROR;
+    }
+    cache_add(ctx->disk_cache, pk, cbin);
   }
-  cache_add(ctx->disk_cache, pk, cbin);
 
-  return err;
+  return GA_NO_ERROR;
 }
 
 static void _cuda_freekernel(gpukernel *k) {

From f1e3c2701727221488be47bbce06e0569e8c1e1d Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 25 Jan 2017 16:49:25 -0500
Subject: [PATCH 14/41] Fix some directory creation problems in the disk_cache
 code.

---
 src/cache/disk.c | 51 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 8 deletions(-)

diff --git a/src/cache/disk.c b/src/cache/disk.c
index 2cf90d7c77..0a7a5ab18d 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -81,6 +81,27 @@ static int mkstempat(int dfd, char *template) {
   return -1;
 }
 
+/* Ensure that a path exists by creating all intermediate directories */
+static int ensureat(int dfd, char *path) {
+  char *curp;
+  char *pos;
+
+  curp = path;
+
+  while ((pos = strchr(curp, '/')) != NULL) {
+    *pos = '\0';
+    if (mkdirat(dfd, path, 0777)) {
+      if (errno != EEXIST) return -1;
+      /* For now we suppose that EEXIST means that the directory is
+       * already there.*/
+    }
+    curp = pos + 1;
+    *pos = '/';
+  }
+
+  return 0;
+}
+
 static int key_path(disk_cache *c, const cache_key_t key, char *out) {
   strb kb = STRB_STATIC_INIT;
   unsigned char hash[64];
@@ -88,8 +109,8 @@ static int key_path(disk_cache *c, const cache_key_t key, char *out) {
 
   if (c->kwrite(&kb, key)) return -1;
   if (Skein_512((unsigned char *)kb.s, kb.l, hash)) return -1;
-  if (snprintf(out, 6, "%02x%02x/%02x%02x",
-               hash[0], hash[1], hash[2], hash[3]) != 5)
+  if (snprintf(out, 10, "%02x%02x/%02x%02x",
+               hash[0], hash[1], hash[2], hash[3]) != 9)
     return -1;
   for (i = 4; i < 64; i += 4) {
     if (snprintf(out+(i * 2 + 1), 9, "%02x%02x%02x%02x",
@@ -109,7 +130,9 @@ static int write_entry(disk_cache *c, const cache_key_t k,
 
   if (key_path(c, k, hexp)) return -1;
 
-  if (!strb_ensure(&b, 16)) return -1;
+  if (ensureat(c->dirfd, hexp)) return -1;
+
+  if (strb_ensure(&b, 16)) return -1;
   b.l = 16;
   c->kwrite(&b, k);
   kl = b.l - 16;
@@ -135,7 +158,7 @@ static int write_entry(disk_cache *c, const cache_key_t k,
     unlinkat(c->dirfd, tmp_path, 0);
     return -1;
   }
-  
+
   if (renameat(c->dirfd, tmp_path, c->dirfd, hexp)) {
     unlinkat(c->dirfd, tmp_path, 0);
     return -1;
@@ -227,7 +250,7 @@ static int disk_add(cache *_c, cache_key_t k, cache_value_t v) {
 static int disk_del(cache *_c, const cache_key_t key) {
   disk_cache *c = (disk_cache *)_c;
   char hexp[HEXP_LEN] = {0};
-  
+
   cache_del(c->mem, key);
 
   key_path(c, key, hexp);
@@ -262,17 +285,29 @@ cache *cache_disk(const char *dirpath, cache *mem,
                   kread_fn kread, vread_fn vread) {
   struct stat st;
   disk_cache *res;
+  char *dirp = strdup(dirpath);
+
+  if (dirp == NULL) return NULL;
+
+  if (ensureat(AT_FDCWD, dirp) != 0) {
+    free(dirp);
+    return NULL;
+  }
+  free(dirp);
+
+  mkdir(dirpath, 0777); /* This may fail, but it's ok */
 
-  mkdir(dirpath, 0777); /* This may fail, but we don't care */
   if (lstat(dirpath, &st) != 0)
     return NULL;
+
   if (!(st.st_mode & S_IFDIR))
     return NULL;
 
   res = calloc(sizeof(*res), 1);
-  if (res == NULL) return NULL;
+  if (res == NULL)
+    return NULL;
 
-  res->dirfd = open(dirpath, O_RDWR|O_CLOEXEC);
+  res->dirfd = open(dirpath, O_RDONLY|O_CLOEXEC);
   if (res->dirfd == -1) {
     free(res);
     return NULL;

From 24dd80d621be81a6dbe2be976b38bf4f0b6f20eb Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Mon, 30 Jan 2017 13:38:24 -0500
Subject: [PATCH 15/41] Fix the cache cleanup script.

---
 bin/gpuarray-cache | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache
index 528e3eb9a5..04b7e8e68a 100644
--- a/bin/gpuarray-cache
+++ b/bin/gpuarray-cache
@@ -19,7 +19,7 @@ def clean(max_size):
             os.remove(path)
 
 
-SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 < 20, 'G': 1 << 30, 'T': 1 << 40,
+SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40,
             'P': 1 << 50, 'E': 1 << 60, 'Z': 1 << 70, 'Y': 1 << 80}
 
 
@@ -44,7 +44,7 @@ if __name__ == '__main__':
     import argparse
 
     parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility')
-    parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning')
+    parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning (in bytes with suffixes: K, M, G, ...)')
     args = parser.parse_args()
 
     clean(get_size(args.max_size))

From 5f020fc1f8aa16bd9e1b084c8a397e7013847296 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Fri, 10 Feb 2017 12:46:24 -0500
Subject: [PATCH 16/41] Fix a type punning issue in the Skein code.

---
 src/util/skein.c | 16 ++++++++--------
 src/util/skein.h |  5 ++++-
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/util/skein.c b/src/util/skein.c
index 51362e5efb..38912e8320 100644
--- a/src/util/skein.c
+++ b/src/util/skein.c
@@ -247,13 +247,13 @@ int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg,
       n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;  /* # bytes free in buffer b[] */
       if (n) {
         Skein_assert(n < msgByteCnt);         /* check on our logic here */
-        memcpy(&ctx->b[ctx->h.bCnt],msg,n);
+        memcpy(&ctx->bb.b[ctx->h.bCnt],msg,n);
         msgByteCnt  -= n;
         msg         += n;
         ctx->h.bCnt += n;
       }
       Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
-      Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES);
+      Skein_512_Process_Block(ctx,ctx->bb.b,1,SKEIN_512_BLOCK_BYTES);
       ctx->h.bCnt = 0;
     }
     /* now process any remaining full blocks, directly from input message data */
@@ -269,7 +269,7 @@ int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg,
   /* copy any remaining source message data bytes into b[] */
   if (msgByteCnt) {
     Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
-    memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt);
+    memcpy(&ctx->bb.b[ctx->h.bCnt],msg,msgByteCnt);
     ctx->h.bCnt += msgByteCnt;
   }
 
@@ -285,20 +285,20 @@ int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) {
 
   ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;                 /* tag as the final block */
   if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)            /* zero pad b[] if necessary */
-    memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+    memset(&ctx->bb.b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
 
-  Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt);  /* process the final block */
+  Skein_512_Process_Block(ctx,ctx->bb.b,1,ctx->h.bCnt);  /* process the final block */
 
   /* now output the result */
   byteCnt = (ctx->h.hashBitLen + 7) >> 3;             /* total number of output bytes */
 
   /* run Threefish in "counter mode" to generate output */
-  memset(ctx->b,0,sizeof(ctx->b));  /* zero out b[], so it can hold the counter */
+  memset(ctx->bb.b,0,sizeof(ctx->bb.b));  /* zero out b[], so it can hold the counter */
   memcpy(X,ctx->X,sizeof(X));       /* keep a local copy of counter mode "key" */
   for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) {
-    ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */
+    ctx->bb.l[0] = Skein_Swap64((u64b_t) i); /* build the counter block */
     Skein_Start_New_Type(ctx,OUT_FINAL);
-    Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */
+    Skein_512_Process_Block(ctx,ctx->bb.b,1,sizeof(u64b_t)); /* run "counter mode" */
     n = byteCnt - i*SKEIN_512_BLOCK_BYTES;   /* number of output bytes left to go */
     if (n >= SKEIN_512_BLOCK_BYTES)
       n  = SKEIN_512_BLOCK_BYTES;
diff --git a/src/util/skein.h b/src/util/skein.h
index 89d7ebf209..b505a51801 100644
--- a/src/util/skein.h
+++ b/src/util/skein.h
@@ -56,7 +56,10 @@ typedef struct {
 typedef struct {                     /* 512-bit Skein hash context structure */
   Skein_Ctxt_Hdr_t h;                     /* common header context variables */
   u64b_t  X[SKEIN_512_STATE_WORDS];                    /* chaining variables */
-  u08b_t  b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
+  union Skein_512_Ctxt_b_u {
+    u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */
+    u64b_t l[SKEIN_512_BLOCK_BYTES/8];
+  } bb;
 } Skein_512_Ctxt_t;
 
 /*   Skein APIs for (incremental) "straight hashing" */

From 4d014a8b255cc85324211bb8354fc44436db1aa0 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 1 Mar 2017 15:24:06 -0500
Subject: [PATCH 17/41] Switch away from ...at() functions since those don't
 exist on Windows.

Also try to make the code work on Windows.
---
 CMakeLists.txt   |   2 +-
 make.bat         |   6 ++
 src/cache/disk.c | 223 ++++++++++++++++++++++++++++++++---------------
 3 files changed, 160 insertions(+), 71 deletions(-)
 create mode 100755 make.bat

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 382c064e9c..09f7f1fd4c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/")
 # -Wall is unbelieveably noisy with Visual Studio:
 # http://stackoverflow.com/q/4001736/3257826
 if(MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4 -D_CRT_SECURE_NO_WARNINGS")
 else()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
 endif()
diff --git a/make.bat b/make.bat
new file mode 100755
index 0000000000..1ea7aa55cd
--- /dev/null
+++ b/make.bat
@@ -0,0 +1,6 @@
+del bld
+mkdir bld
+cd bld
+cmake .. -G "NMake Makefiles"
+cmake --build . --config Release
+cd ..
diff --git a/src/cache/disk.c b/src/cache/disk.c
index 0a7a5ab18d..2f4fc77efe 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -1,13 +1,57 @@
-#include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
-#include <unistd.h>
 #include <stdlib.h>
+
+#include "private_config.h"
+
+#ifdef _WIN32
+#define PATH_MAX 255
+
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+
+#include <process.h>
+#include <io.h>
+
+struct timezone;
+
+struct timeval {
+  long tv_sec;
+  long tv_usec;
+} timeval;
+
+static int gettimeofday(struct timeval *tp, struct timezone *tzp) {
+  /*
+   * Note: some broken versions only have 8 trailing zero's, the
+   * correct epoch has 9 trailing zero's This magic number is the
+   * number of 100 nanosecond intervals since January 1, 1601 (UTC)
+   * until 00:00:00 January 1, 1970
+   */
+  static const uint64_t EPOCH = ((uint64_t)116444736000000000ULL);
+
+  SYSTEMTIME system_time;
+  FILETIME file_time;
+  uint64_t time;
+
+  GetSystemTime(&system_time);
+  SystemTimeToFileTime(&system_time, &file_time);
+  time = ((uint64_t)file_time.dwLowDateTime);
+  time += ((uint64_t)file_time.dwHighDateTime) << 32;
+
+  tp->tv_sec = (long)((time - EPOCH) / 10000000L);
+  tp->tv_usec = (long)(system_time.wMilliseconds * 1000);
+  return 0;
+}
+
+#else
+#define PATH_MAX 1024
+#include <unistd.h>
 #include <sys/time.h>
+#endif
+
 #include <sys/stat.h>
 
 #include "cache.h"
-#include "private_config.h"
 #include "util/skein.h"
 
 #define HEXP_LEN (128 + 2)
@@ -19,7 +63,7 @@ typedef struct _disk_cache {
   vwrite_fn vwrite;
   kread_fn kread;
   vread_fn vread;
-  int dirfd;
+  const char *dirp;
 } disk_cache;
 
 
@@ -31,72 +75,112 @@ static unsigned long long ntohull(const char *in) {
 }
 
 static void htonull(unsigned long long in, char *out) {
-  out[0] = in >> 56;
-  out[1] = in >> 48;
-  out[2] = in >> 40;
-  out[3] = in >> 32;
-  out[4] = in >> 24;
-  out[5] = in >> 16;
-  out[6] = in >> 8;
-  out[7] = in;
+  out[0] = (char)(in >> 56);
+  out[1] = (char)(in >> 48);
+  out[2] = (char)(in >> 40);
+  out[3] = (char)(in >> 32);
+  out[4] = (char)(in >> 24);
+  out[5] = (char)(in >> 16);
+  out[6] = (char)(in >> 8);
+  out[7] = (char)(in);
 }
 
-static int mkstempat(int dfd, char *template) {
-  static const char letters[] =
-    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
-  size_t length;
-  char *XXXXXX;
-  struct timeval tv;
-  unsigned long long  randnum, working;
-  int i, tries, fd;
-
-  length = strlen(template);
-  if (length < 6) {
-    errno = EINVAL;
+static int catp(char *path, const char *dirp, const char *rpath) {
+  if (strlcpy(path, dirp, PATH_MAX) >= PATH_MAX) {
+    errno = ENAMETOOLONG;
     return -1;
   }
-  XXXXXX = template + length - 6;
-  if (strcmp(XXXXXX, "XXXXXX") != 0) {
-    errno = EINVAL;
+  if (strlcat(path, rpath, PATH_MAX) >= PATH_MAX) {
+    errno = ENAMETOOLONG;
     return -1;
   }
+  return 0;
+}
 
-  /* This is kind of crappy, but the point is to not step on each
-     other's feet */
-  gettimeofday(&tv, NULL);
-  randnum = ((unsigned long long) tv.tv_usec << 16) ^ tv.tv_sec ^ getpid();
+static int openp(const char *dirp, const char *rpath, int flags, int mode) {
+  char path[PATH_MAX];
 
-  for (tries = 0; tries < TMP_MAX; tries++) {
-    for (working = randnum, i = 0; i < 6; i++) {
-      XXXXXX[i] = letters[working % 62];
-      working /= 62;
-    }
-    fd = openat(dfd, template, O_RDWR | O_CREAT | O_EXCL, 0600);
-    if (fd >= 0 || (errno != EEXIST && errno != EISDIR))
-      return fd;
+  if (catp(path, dirp, rpath))
+    return -1;
 
-    randnum += (tv.tv_usec >> 10) & 0xfff;
-  }
-  errno = EEXIST;
-  return -1;
+  return open(path, flags, mode);
 }
 
-/* Ensure that a path exists by creating all intermediate directories */
-static int ensureat(int dfd, char *path) {
-  char *curp;
-  char *pos;
+static int mkstempp(const char *dirp, char *template) {
+  char path[PATH_MAX];
+  int res;
+
+  if (catp(path, dirp, template))
+    return -1;
+
+  res = mkstemp(path);
 
-  curp = path;
+  /* We need to copy the result path back */
+  if (res == 0)
+    memcpy(template, &path[strlen(dirp)], strlen(template));
+
+  return res;
+}
+
+static int unlinkp(const char *dirp, const char *rpath) {
+  char path[PATH_MAX];
+
+  if (catp(path, dirp, rpath))
+    return -1;
+
+  return unlink(path);
+}
+
+static int renamep(const char *dirp, const char *ropath, const char *rnpath) {
+  char opath[PATH_MAX];
+  char npath[PATH_MAX];
+
+  if (catp(opath, dirp, ropath))
+    return -1;
+  if (catp(npath, dirp, rnpath))
+    return -1;
 
-  while ((pos = strchr(curp, '/')) != NULL) {
-    *pos = '\0';
-    if (mkdirat(dfd, path, 0777)) {
+  return rename(opath, npath);
+}
+
+/* Ensure that a path exists by creating all intermediate directories */
+int ensurep(const char *dirp, const char *rpath) {
+  char path[PATH_MAX];
+  char *pp;
+  char sep;
+
+  if (dirp == NULL) {
+    if (strlcpy(path, rpath, PATH_MAX) >= PATH_MAX) {
+      errno = ENAMETOOLONG;
+      return -1;
+    }
+#ifdef _WIN32
+    /* Skip root dir (windows) */
+    pp = strchr(path, '\\');
+    if (pp)
+      while (*pp == '\\') pp++;
+    else
+      pp = path;
+#else
+    pp = path;
+    /* Skip root dir (unix) */
+    while (*pp == '/') pp++;
+#endif
+  } else {
+    if (catp(path, dirp, rpath))
+      return -1;
+
+    pp = path + strlen(dirp);
+  }
+  while ((pp = strpbrk(pp + 1, "\\/")) != NULL) {
+    sep = *pp;
+    *pp = '\0';
+    if (mkdir(path, 0777)) {
       if (errno != EEXIST) return -1;
       /* For now we suppose that EEXIST means that the directory is
-       * already there.*/
+       * already there. */
     }
-    curp = pos + 1;
-    *pos = '/';
+    *pp = sep;
   }
 
   return 0;
@@ -130,7 +214,7 @@ static int write_entry(disk_cache *c, const cache_key_t k,
 
   if (key_path(c, k, hexp)) return -1;
 
-  if (ensureat(c->dirfd, hexp)) return -1;
+  if (ensurep(c->dirp, hexp)) return -1;
 
   if (strb_ensure(&b, 16)) return -1;
   b.l = 16;
@@ -145,7 +229,7 @@ static int write_entry(disk_cache *c, const cache_key_t k,
     return -1;
   }
 
-  fd = mkstempat(c->dirfd, tmp_path);
+  fd = mkstempp(c->dirp, tmp_path);
   if (fd == -1) {
     strb_clear(&b);
     return -1;
@@ -155,13 +239,18 @@ static int write_entry(disk_cache *c, const cache_key_t k,
   strb_clear(&b);
   close(fd);
   if (err) {
-    unlinkat(c->dirfd, tmp_path, 0);
+    unlinkp(c->dirp, tmp_path);
     return -1;
   }
 
-  if (renameat(c->dirfd, tmp_path, c->dirfd, hexp)) {
-    unlinkat(c->dirfd, tmp_path, 0);
+  if (renamep(c->dirp, tmp_path, hexp)) {
+    unlinkp(c->dirp, tmp_path);
+#ifdef _WIN32
+    /* On windows we can't rename over an existing file */
+    return (errno != EACCES) ? -1 : 0;
+#else
     return -1;
+#endif
   }
 
   return 0;
@@ -179,7 +268,7 @@ static int find_entry(disk_cache *c, const cache_key_t key,
 
   if (key_path(c, key, hexp)) return 0;
 
-  fd = openat(c->dirfd, hexp, O_RDONLY);
+  fd = openp(c->dirp, hexp, O_RDONLY, 0);
 
   if (fd == -1) return 0;
 
@@ -255,7 +344,7 @@ static int disk_del(cache *_c, const cache_key_t key) {
 
   key_path(c, key, hexp);
 
-  return (unlinkat(c->dirfd, hexp, 0) == 0);
+  return (unlinkp(c->dirp, hexp) == 0);
 }
 
 static cache_value_t disk_get(cache *_c, const cache_key_t key) {
@@ -277,7 +366,7 @@ static cache_value_t disk_get(cache *_c, const cache_key_t key) {
 static void disk_destroy(cache *_c) {
   disk_cache *c = (disk_cache *)_c;
   cache_destroy(c->mem);
-  close(c->dirfd);
+  free((void *)c->dirp);
 }
 
 cache *cache_disk(const char *dirpath, cache *mem,
@@ -289,11 +378,10 @@ cache *cache_disk(const char *dirpath, cache *mem,
 
   if (dirp == NULL) return NULL;
 
-  if (ensureat(AT_FDCWD, dirp) != 0) {
+  if (ensurep(NULL, dirp) != 0) {
     free(dirp);
     return NULL;
   }
-  free(dirp);
 
   mkdir(dirpath, 0777); /* This may fail, but it's ok */
 
@@ -307,12 +395,7 @@ cache *cache_disk(const char *dirpath, cache *mem,
   if (res == NULL)
     return NULL;
 
-  res->dirfd = open(dirpath, O_RDONLY|O_CLOEXEC);
-  if (res->dirfd == -1) {
-    free(res);
-    return NULL;
-  }
-
+  res->dirp = dirp;
   res->mem = mem;
   res->kwrite = kwrite;
   res->vwrite = vwrite;

From a64059bd69e4b705d94b55ff3d8ed164319d2bde Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 1 Mar 2017 17:53:19 -0500
Subject: [PATCH 18/41] Fix bug in opencl gemmBatch bindings.

---
 src/gpuarray_blas_opencl_clblas.c  | 4 ++--
 src/gpuarray_blas_opencl_clblast.c | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c
index 8ee019afb7..f6e51429b1 100644
--- a/src/gpuarray_blas_opencl_clblas.c
+++ b/src/gpuarray_blas_opencl_clblas.c
@@ -100,7 +100,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
     ARRAY_INIT(C[i]);
     err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K,
                       alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
-                      beta, C[i]->buf, offB[i], ldc, 1, &ctx->q,
+                      beta, C[i]->buf, offC[i], ldc, 1, &ctx->q,
                       num_ev, num_ev == 0 ? NULL : evl, &ev);
     if (err != clblasSuccess)
       return GA_BLAS_ERROR;
@@ -132,7 +132,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
     ARRAY_INIT(C[i]);
     err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K,
                       alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
-                      beta, C[i]->buf, offB[i], ldc, 1, &ctx->q,
+                      beta, C[i]->buf, offC[i], ldc, 1, &ctx->q,
                       num_ev, num_ev == 0 ? NULL : evl, &ev);
     if (err != clblasSuccess)
       return GA_BLAS_ERROR;
diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c
index 4a5369e56e..c6fd010a3b 100644
--- a/src/gpuarray_blas_opencl_clblast.c
+++ b/src/gpuarray_blas_opencl_clblast.c
@@ -68,7 +68,7 @@ static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
     ARRAY_INIT(C[i]);
     err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K,
                        float_to_half(alpha), A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
-                       float_to_half(beta), C[i]->buf, offB[i], ldc, &ctx->q, &ev);
+                       float_to_half(beta), C[i]->buf, offC[i], ldc, &ctx->q, &ev);
     if (err != kSuccess)
       return GA_BLAS_ERROR;
     ARRAY_FINI(A[i]);
@@ -97,7 +97,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
     ARRAY_INIT(C[i]);
     err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K,
                       alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
-                      beta, C[i]->buf, offB[i], ldc, &ctx->q, &ev);
+                      beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev);
     if (err != kSuccess)
       return GA_BLAS_ERROR;
     ARRAY_FINI(A[i]);
@@ -126,7 +126,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
     ARRAY_INIT(C[i]);
     err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K,
                       alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
-                      beta, C[i]->buf, offB[i], ldc, &ctx->q, &ev);
+                      beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev);
     if (err != kSuccess)
       return GA_BLAS_ERROR;
     ARRAY_FINI(A[i]);

From e280a3ddf33caa61846de2c4b6ada6c9487ab548 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 1 Mar 2017 17:58:02 -0500
Subject: [PATCH 19/41] Make MSVC slightly less verbose in its warnings.

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 09f7f1fd4c..5d0761bc73 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/")
 # -Wall is unbelieveably noisy with Visual Studio:
 # http://stackoverflow.com/q/4001736/3257826
 if(MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4 -D_CRT_SECURE_NO_WARNINGS")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3 -D_CRT_SECURE_NO_WARNINGS")
 else()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
 endif()

From 4f25bc73ed46115769374cac859ba6cc6820a80d Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 1 Mar 2017 18:03:35 -0500
Subject: [PATCH 20/41] Windows changes.

---
 CMakeLists.txt             |  2 +-
 src/cache/disk.c           | 15 ++++++++++++++-
 src/gpuarray_array.c       |  6 +++---
 src/gpuarray_buffer.c      |  4 ++--
 src/gpuarray_buffer_blas.c |  2 +-
 src/gpuarray_elemwise.c    |  2 +-
 src/gpuarray_reduction.c   |  2 +-
 src/private.h              |  6 +++---
 src/private_config.h.in    |  4 +---
 src/util/strb.c            |  7 +++++++
 10 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d0761bc73..ddfefab53c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/")
 # -Wall is unbelieveably noisy with Visual Studio:
 # http://stackoverflow.com/q/4001736/3257826
 if(MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3 -D_CRT_SECURE_NO_WARNINGS")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3")
 else()
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
 endif()
diff --git a/src/cache/disk.c b/src/cache/disk.c
index 2f4fc77efe..0b827aaefa 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -1,3 +1,4 @@
+#define _CRT_SECURE_NO_WARNINGS
 #include <errno.h>
 #include <fcntl.h>
 #include <stdlib.h>
@@ -11,7 +12,10 @@
 #include <Windows.h>
 
 #include <process.h>
+#include <direct.h>
 #include <io.h>
+#include <sys/types.h>
+#include <sys/stat.h>
 
 struct timezone;
 
@@ -43,13 +47,22 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) {
   return 0;
 }
 
+#define open _open
+#define unlink _unlink
+#define mkdir(p, f) _mkdir(p)
+#define close _close
+#define strdup _strdup
+#define lstat _stat64
+#define fstat _fstat64
+#define stat __stat64
+
 #else
 #define PATH_MAX 1024
 #include <unistd.h>
 #include <sys/time.h>
+#include <sys/stat.h>
 #endif
 
-#include <sys/stat.h>
 
 #include "cache.h"
 #include "util/skein.h"
diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c
index 91c043a1d6..267eb5badc 100644
--- a/src/gpuarray_array.c
+++ b/src/gpuarray_array.c
@@ -83,7 +83,7 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) {
 }
 
 /* Value below which a size_t multiplication will never overflow. */
-#define MUL_NO_OVERFLOW (1UL << (sizeof(size_t) * 4))
+#define MUL_NO_OVERFLOW (1ULL << (sizeof(size_t) * 4))
 
 void GpuArray_fix_flags(GpuArray *a) {
   /* Only keep the writable flag */
@@ -330,9 +330,9 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                             const GpuArray *ind, int addr32) {
   strb sb = STRB_STATIC_INIT;
   int *atypes;
-  size_t nargs, apos;
   char *sz, *ssz;
   unsigned int i, i2;
+  unsigned int nargs, apos;
   int flags = GA_USE_CLUDA;
   int res;
 
@@ -432,9 +432,9 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
 #if DEBUG
   char *errstr = NULL;
 #endif
-  size_t argp;
   GpuKernel k;
   unsigned int j;
+  unsigned int argp;
   int err, kerr = 0;
   int addr32 = 0;
 
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c
index d3226ea94e..a1e840c939 100644
--- a/src/gpuarray_buffer.c
+++ b/src/gpuarray_buffer.c
@@ -45,10 +45,10 @@ gpucontext *gpucontext_init(const char *name, int dev, int flags, int *ret) {
   if (res == NULL)
     return NULL;
   res->ops = ops;
-  if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, &res->blas_ops) != GA_NO_ERROR)
+  if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, (void *)&res->blas_ops) != GA_NO_ERROR)
     res->blas_ops = NULL;
   res->blas_handle = NULL;
-  if (gpucontext_property(res, GA_CTX_PROP_COMM_OPS, &res->comm_ops) != GA_NO_ERROR)
+  if (gpucontext_property(res, GA_CTX_PROP_COMM_OPS, (void *)&res->comm_ops) != GA_NO_ERROR)
     res->comm_ops = NULL;
   res->extcopy_cache = NULL;
   return res;
diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c
index c73f3c2f19..3fdc525e78 100644
--- a/src/gpuarray_buffer_blas.c
+++ b/src/gpuarray_buffer_blas.c
@@ -10,7 +10,7 @@ int gpublas_setup(gpucontext *ctx) {
 
 void gpublas_teardown(gpucontext *ctx) {
   if (ctx->blas_ops != NULL)
-    return ctx->blas_ops->teardown(ctx);
+    ctx->blas_ops->teardown(ctx);
 }
 
 const char *gpublas_error(gpucontext *ctx) {
diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c
index f3ce7ee261..1d93e5a155 100644
--- a/src/gpuarray_elemwise.c
+++ b/src/gpuarray_elemwise.c
@@ -131,8 +131,8 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx,
   strb sb = STRB_STATIC_INIT;
   unsigned int i, _i, j;
   int *ktypes;
-  size_t p;
   char *size = "ga_size", *ssize = "ga_ssize";
+  unsigned int p;
   int flags = GA_USE_CLUDA;
   int res;
 
diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c
index 12eedb24a9..b1a185e3b7 100644
--- a/src/gpuarray_reduction.c
+++ b/src/gpuarray_reduction.c
@@ -644,7 +644,7 @@ static int   maxandargmaxCompile                (maxandargmax_ctx*  ctx){
 		GA_SIZE,   /* dstArgmaxOff */
 		GA_BUFFER  /* dstArgmaxSteps */
 	};
-	const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES);
+	const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES);
 	const char*  SRCS[1];
 
 	SRCS[0] = ctx->sourceCode;
diff --git a/src/private.h b/src/private.h
index abe9783de7..2de8742674 100644
--- a/src/private.h
+++ b/src/private.h
@@ -26,9 +26,9 @@ extern "C" {
 }
 #endif
 
-#define ADDR32_MAX   4294967295
-#define SADDR32_MIN -2147483648
-#define SADDR32_MAX  2147483647
+#define ADDR32_MAX   4294967295L
+#define SADDR32_MIN -2147483648L
+#define SADDR32_MAX  2147483647L
 
 struct _gpuarray_buffer_ops;
 typedef struct _gpuarray_buffer_ops gpuarray_buffer_ops;
diff --git a/src/private_config.h.in b/src/private_config.h.in
index c3cd3a0195..f58a03edae 100644
--- a/src/private_config.h.in
+++ b/src/private_config.h.in
@@ -22,9 +22,7 @@ extern "C" {
 #ifdef _MSC_VER
 /* God damn Microsoft ... */
 #define snprintf _snprintf
-#endif
-
-#ifdef _MSC_VER
+#define strdup _strdup
 /* MS VC++ 2008 does not support inline */
 #define inline __inline
 #define alloca _alloca
diff --git a/src/util/strb.c b/src/util/strb.c
index 15cd496c4f..22da8bf637 100644
--- a/src/util/strb.c
+++ b/src/util/strb.c
@@ -1,6 +1,13 @@
+#define _CRT_SECURE_NO_WARNINGS
 #include <errno.h>
 #include <stdarg.h>
+#ifdef _MSC_VER
+#include <io.h>
+#define read _read
+#define write _write
+#else
 #include <unistd.h>
+#endif
 
 #include "util/strb.h"
 

From 0678d76e02a8bc21f06a586bd4e719d633db49b9 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 14 Mar 2017 15:47:21 -0400
Subject: [PATCH 21/41] Initialize the blas_ops pointer so that compilers stop
 freaking out.

---
 src/gpuarray_buffer_opencl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c
index 89b56f80a7..e1e8fdd82a 100644
--- a/src/gpuarray_buffer_opencl.c
+++ b/src/gpuarray_buffer_opencl.c
@@ -212,7 +212,7 @@ cl_command_queue cl_get_stream(gpucontext *ctx) {
 }
 
 static void cl_free_ctx(cl_ctx *ctx) {
-  gpuarray_blas_ops *blas_ops;
+  gpuarray_blas_ops *blas_ops = NULL;
 
   ASSERT_CTX(ctx);
   assert(ctx->refcnt != 0);

From f761cfaeb0705d4505caa382ce24acc0ccbd86a6 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 14 Mar 2017 15:47:45 -0400
Subject: [PATCH 22/41] Add support for floats in GpuArray_dump().

---
 src/gpuarray_array.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c
index 267eb5badc..45a2a1186e 100644
--- a/src/gpuarray_array.c
+++ b/src/gpuarray_array.c
@@ -1096,6 +1096,9 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) {
     case GA_LONG:
       fprintf(fd, "%lld", (long long)*(int64_t *)p);
       break;
+    case GA_FLOAT:
+      fprintf(fd, "%f", *(float *)p);
+      break;
     case GA_SSIZE:
       fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p);
       break;

From eca4ffd3cf1e3056f70b4e6263db54c7427934fe Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 14 Mar 2017 16:20:39 -0400
Subject: [PATCH 23/41] Fix mkstempp to actually return the filename.

---
 src/cache/disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cache/disk.c b/src/cache/disk.c
index 0b827aaefa..3ded869829 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -129,7 +129,7 @@ static int mkstempp(const char *dirp, char *template) {
   res = mkstemp(path);
 
   /* We need to copy the result path back */
-  if (res == 0)
+  if (res != -1)
     memcpy(template, &path[strlen(dirp)], strlen(template));
 
   return res;

From b78e9d42805174bdf2e26f0da1076492eb71c14e Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 14 Mar 2017 18:14:35 -0400
Subject: [PATCH 24/41] Fix windows annoyances.

---
 make.bat                  | 2 +-
 setup.py                  | 2 +-
 src/gpuarray/blas.h       | 4 ++--
 src/gpuarray_array_blas.c | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/make.bat b/make.bat
index 1ea7aa55cd..16bc79f441 100755
--- a/make.bat
+++ b/make.bat
@@ -1,6 +1,6 @@
 del bld
 mkdir bld
 cd bld
-cmake .. -G "NMake Makefiles"
+cmake .. -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release
 cmake --build . --config Release
 cd ..
diff --git a/setup.py b/setup.py
index 0084e04b4a..10f5996b54 100755
--- a/setup.py
+++ b/setup.py
@@ -82,7 +82,7 @@ def __init__(self, *args, **kwargs):
     current_dir = os.path.abspath(os.path.dirname(__file__))
     include_dirs += [os.path.join(current_dir, 'src')]
 
-    default_bin_dir = os.path.join(current_dir, 'lib', 'Release')
+    default_bin_dir = os.path.join(current_dir, 'lib')
     if not os.path.isdir(default_bin_dir):
         raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode'.format(default_bin_dir))
     library_dirs += [default_bin_dir]
diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h
index a8dd8096bc..a59d3bb885 100644
--- a/src/gpuarray/blas.h
+++ b/src/gpuarray/blas.h
@@ -9,8 +9,8 @@ extern "C" {
 #endif
 
 // only for vector-vector dot
-GPUARRAY_PUBLIC int GpuArray_rdot( GpuArray *X, GpuArray *Y,
-                                   GpuArray *Z, int nocopy);
+GPUARRAY_PUBLIC int GpuArray_rdot(GpuArray *X, GpuArray *Y,
+                                  GpuArray *Z, int nocopy);
 #define GpuArray_hdot GpuArray_rdot
 #define GpuArray_sdot GpuArray_rdot
 #define GpuArray_ddot GpuArray_rdot
diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c
index 8f9fb5919b..2e9a398e2d 100644
--- a/src/gpuarray_array_blas.c
+++ b/src/gpuarray_array_blas.c
@@ -5,8 +5,8 @@
 #include "gpuarray/util.h"
 #include "gpuarray/error.h"
 
-int GpuArray_rdot( GpuArray *X, GpuArray *Y,
-        GpuArray *Z, int nocopy) {
+int GpuArray_rdot(GpuArray *X, GpuArray *Y,
+                  GpuArray *Z, int nocopy) {
     GpuArray *Xp = X;
     GpuArray copyX;
     GpuArray *Yp = Y;

From 88601038668cc607c954ceee9af1a4983ec213de Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 14 Mar 2017 19:07:29 -0400
Subject: [PATCH 25/41] Make sure to properly terminate the cache path with a
 separator and work around lstat choking on terminating separators on Windows.

---
 src/cache/disk.c | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/src/cache/disk.c b/src/cache/disk.c
index 3ded869829..6d260ac559 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -387,20 +387,45 @@ cache *cache_disk(const char *dirpath, cache *mem,
                   kread_fn kread, vread_fn vread) {
   struct stat st;
   disk_cache *res;
-  char *dirp = strdup(dirpath);
+  char *dirp;
+  size_t dirl = strlen(dirpath);
+  char sep = '/';
+
+  /* This trickery is to make sure the path ends with a separator */
+#ifdef _WIN32
+  if (dirpath[dirl - 1] == '\\')
+    sep = '\\';
+#endif
+
+  if (dirpath[dirl - 1] != sep) dirl++;
+
+  dirp = malloc(dirl + 1);  /* With the NUL */
 
   if (dirp == NULL) return NULL;
 
+  strlcpy(dirp, dirpath, dirl + 1);
+
+  if (dirp[dirl - 1] != sep) {
+    dirp[dirl - 1] = sep;
+    dirp[dirl] = '\0';
+  }
+
   if (ensurep(NULL, dirp) != 0) {
     free(dirp);
     return NULL;
   }
 
-  mkdir(dirpath, 0777); /* This may fail, but it's ok */
+  /* For Windows mkdir and lstat which can't handle trailing separator */
+  dirp[dirl -  1] = '\0';
 
-  if (lstat(dirpath, &st) != 0)
+  mkdir(dirp, 0777); /* This may fail, but it's ok */
+
+  if (lstat(dirp, &st) != 0)
     return NULL;
 
+  /* Restore the good path at the end */
+  dirp[dirl - 1] = sep;
+
   if (!(st.st_mode & S_IFDIR))
     return NULL;
 

From 97d855339c78cfc30fc4566d9828d1dbfa95657f Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 14 Mar 2017 19:46:41 -0400
Subject: [PATCH 26/41] Make sure to open cache files in binary mode on
 Windows.

---
 src/cache/disk.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/cache/disk.c b/src/cache/disk.c
index 6d260ac559..db0917b1f4 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -61,6 +61,9 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) {
 #include <unistd.h>
 #include <sys/time.h>
 #include <sys/stat.h>
+
+#define O_BINARY 0
+
 #endif
 
 
@@ -281,7 +284,7 @@ static int find_entry(disk_cache *c, const cache_key_t key,
 
   if (key_path(c, key, hexp)) return 0;
 
-  fd = openp(c->dirp, hexp, O_RDONLY, 0);
+  fd = openp(c->dirp, hexp, O_RDONLY|O_BINARY, 0);
 
   if (fd == -1) return 0;
 

From 46c2f08fca5bcd5e9bc54351b68a4ef4b886abe5 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 16 Mar 2017 18:32:06 -0400
Subject: [PATCH 27/41] Don't crash on key read failure.

---
 src/cache/disk.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cache/disk.c b/src/cache/disk.c
index db0917b1f4..f77128906f 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -337,7 +337,8 @@ static int find_entry(disk_cache *c, const cache_key_t key,
     return 1;
   }
  error:
-  c->c.kfree(k);
+  if (k)
+    c->c.kfree(k);
   b.s = ts;
   strb_clear(&b);
   return 0;

From 01e5fa22706d48b8fde4e3aa39fa809c00d3f59d Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 16 Mar 2017 18:33:26 -0400
Subject: [PATCH 28/41] Error out if we reach EOF before the passed-in length.

---
 src/util/strb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/util/strb.c b/src/util/strb.c
index 22da8bf637..c8ae4da25d 100644
--- a/src/util/strb.c
+++ b/src/util/strb.c
@@ -73,8 +73,8 @@ void strb_read(strb *sb, int fd, size_t sz) {
   sb->l += sz;
   while (sz) {
     res = read(fd, b, sz);
-    if (res == -1) {
-      if (errno == EAGAIN || errno == EINTR)
+    if (res == -1 || res == 0) {
+      if (res == -1 && errno == EAGAIN || errno == EINTR)
         continue;
       strb_seterror(sb);
       return;

From f03754961edddc597c3c44e78ec5c453159bd84c Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 16 Mar 2017 19:14:47 -0400
Subject: [PATCH 29/41] Make sure to open files in binary mode.

---
 src/gpuarray_mkstemp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/gpuarray_mkstemp.c b/src/gpuarray_mkstemp.c
index ac5ea10940..5e2e8ca520 100644
--- a/src/gpuarray_mkstemp.c
+++ b/src/gpuarray_mkstemp.c
@@ -8,6 +8,8 @@
 #include <io.h>
 #define open _open
 #define mktemp _mktemp
+#else
+#define O_BINARY 0
 #endif
 
 int mkstemp(char *path) {
@@ -18,7 +20,7 @@ int mkstemp(char *path) {
     do {
         tmp = mktemp(path);
         if (tmp == NULL) return -1;
-        res = open(path, O_CREAT|O_EXCL|O_RDWR, S_IREAD|S_IWRITE);
+        res = open(path, O_CREAT|O_EXCL|O_RDWR|O_BINARY, S_IREAD|S_IWRITE);
         if (res != -1 || errno != EEXIST)
             return res;
     } while (--tries);

From a9ebffa0d0056b3a4d6e8fac12dc27f8e8a7fbfd Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 16 Mar 2017 19:17:28 -0400
Subject: [PATCH 30/41] Make sure to open files in binary mode for writing too.

---
 src/cache/disk.c | 7 +++++--
 src/util/strb.c  | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/cache/disk.c b/src/cache/disk.c
index f77128906f..a7f5dbbcd7 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -63,6 +63,7 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) {
 #include <sys/stat.h>
 
 #define O_BINARY 0
+#define setmode(a, b)
 
 #endif
 
@@ -131,9 +132,11 @@ static int mkstempp(const char *dirp, char *template) {
 
   res = mkstemp(path);
 
-  /* We need to copy the result path back */
-  if (res != -1)
+  /* We need to copy the result path back and set binary mode (for windows) */
+  if (res != -1) {
+    setmode(res, O_BINARY);
     memcpy(template, &path[strlen(dirp)], strlen(template));
+  }
 
   return res;
 }
diff --git a/src/util/strb.c b/src/util/strb.c
index c8ae4da25d..dda9dcdfc2 100644
--- a/src/util/strb.c
+++ b/src/util/strb.c
@@ -74,7 +74,7 @@ void strb_read(strb *sb, int fd, size_t sz) {
   while (sz) {
     res = read(fd, b, sz);
     if (res == -1 || res == 0) {
-      if (res == -1 && errno == EAGAIN || errno == EINTR)
+      if (res == -1 && (errno == EAGAIN || errno == EINTR))
         continue;
       strb_seterror(sb);
       return;

From 3c6fb5dc645c0c8364f4586634b9b5e2e48deeb0 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 16 Mar 2017 20:15:10 -0400
Subject: [PATCH 31/41] Fix ntohull for platforms that have signed chars.

---
 src/cache/disk.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/cache/disk.c b/src/cache/disk.c
index a7f5dbbcd7..6a5e086a10 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -63,7 +63,7 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) {
 #include <sys/stat.h>
 
 #define O_BINARY 0
-#define setmode(a, b)
+#define _setmode(a, b)
 
 #endif
 
@@ -84,7 +84,8 @@ typedef struct _disk_cache {
 } disk_cache;
 
 
-static unsigned long long ntohull(const char *in) {
+static unsigned long long ntohull(const char *_in) {
+  const unsigned char *in = (const unsigned char *)_in;
   return ((unsigned long long)in[0] << 56 | (unsigned long long)in[1] << 48 |
           (unsigned long long)in[2] << 40 | (unsigned long long)in[3] << 32 |
           (unsigned long long)in[4] << 24 | (unsigned long long)in[5] << 16 |
@@ -92,14 +93,14 @@ static unsigned long long ntohull(const char *in) {
 }
 
 static void htonull(unsigned long long in, char *out) {
-  out[0] = (char)(in >> 56);
-  out[1] = (char)(in >> 48);
-  out[2] = (char)(in >> 40);
-  out[3] = (char)(in >> 32);
-  out[4] = (char)(in >> 24);
-  out[5] = (char)(in >> 16);
-  out[6] = (char)(in >> 8);
-  out[7] = (char)(in);
+  out[0] = (unsigned char)(in >> 56);
+  out[1] = (unsigned char)(in >> 48);
+  out[2] = (unsigned char)(in >> 40);
+  out[3] = (unsigned char)(in >> 32);
+  out[4] = (unsigned char)(in >> 24);
+  out[5] = (unsigned char)(in >> 16);
+  out[6] = (unsigned char)(in >> 8);
+  out[7] = (unsigned char)(in);
 }
 
 static int catp(char *path, const char *dirp, const char *rpath) {
@@ -134,7 +135,7 @@ static int mkstempp(const char *dirp, char *template) {
 
   /* We need to copy the result path back and set binary mode (for windows) */
   if (res != -1) {
-    setmode(res, O_BINARY);
+    _setmode(res, O_BINARY);
     memcpy(template, &path[strlen(dirp)], strlen(template));
   }
 

From 26b85ee432bcbc261f5760ac55cdc3be2b15eeb6 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Mon, 20 Mar 2017 17:33:42 -0400
Subject: [PATCH 32/41] Make GpuKernel_binary work again.

---
 src/gpuarray/buffer.h        | 17 +++++++++++++++--
 src/gpuarray/error.h         |  1 -
 src/gpuarray/kernel.h        |  1 -
 src/gpuarray_buffer.c        |  2 +-
 src/gpuarray_buffer_cuda.c   | 19 +++++++++++++++----
 src/gpuarray_buffer_opencl.c | 29 +++++++++++++++++++++++++++++
 src/gpuarray_error.c         |  1 -
 src/private.h                |  1 +
 src/private_cuda.h           |  2 ++
 9 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h
index 34878d503a..800756a072 100644
--- a/src/gpuarray/buffer.h
+++ b/src/gpuarray/buffer.h
@@ -494,9 +494,22 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
                                    size_t shared, void **args);
 
 /**
- * Get the kernel binary (REMOVED).
+ * (Deprecated) Get the kernel binary.
  *
- * Always returns GA_DEPRECATED_ERROR.
+ * This function is deprecated and will be removed in the next release.
+ *
+ * This can be used to cache kernel binaries after compilation for a
+ * specific device.  The kernel can be recreated by calling
+ * kernel_alloc with the binary and size and passing `GA_USE_BINARY`
+ * as the use flags.
+ *
+ * The returned pointer is allocated and must be freed by the caller.
+ *
+ * \param k kernel
+ * \param sz size of the returned binary
+ * \param obj pointer to the binary for the kernel.
+ *
+ * \returns GA_NO_ERROR or an error code if an error occurred.
  */
 GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj);
 
diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h
index 84c852a257..af963c1531 100644
--- a/src/gpuarray/error.h
+++ b/src/gpuarray/error.h
@@ -36,7 +36,6 @@ enum ga_error {
   GA_COMM_ERROR,
   GA_XLARGE_ERROR,
   GA_LOAD_ERROR,
-  GA_DEPRECATED_ERROR,
   /* Add more error types if needed, but at the end */
   /* Don't forget to sync with Gpu_error() */
 };
diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h
index da779123b9..f88d74ffc6 100644
--- a/src/gpuarray/kernel.h
+++ b/src/gpuarray/kernel.h
@@ -107,7 +107,6 @@ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
                                    const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
 
-/* Deprecated and to be removed */
 GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
                                     void **obj);
 
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c
index a1e840c939..dee65f130c 100644
--- a/src/gpuarray_buffer.c
+++ b/src/gpuarray_buffer.c
@@ -187,7 +187,7 @@ int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
 }
 
 int gpukernel_binary(gpukernel *k, size_t *sz, void **obj) {
-  return GA_DEPRECATED_ERROR;
+  return ((partial_gpukernel *)k)->ctx->ops->kernel_binary(k, sz, obj);
 }
 
 int gpukernel_property(gpukernel *k, int prop_id, void *res) {
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 7b1f540c26..fe0c100508 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -1151,6 +1151,7 @@ static void _cuda_freekernel(gpukernel *k) {
     }
     CLEAR(k);
     free(k->args);
+    free(k->bin);
     free(k->types);
     free(k);
   }
@@ -1264,13 +1265,15 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
       FAIL(NULL, GA_SYS_ERROR);
     }
 
+    /* Don't clear bin after this */
+    res->bin_sz = bin.l;
+    res->bin = bin.s;
     res->refcnt = 1;
     res->argcount = argcount;
     res->types = calloc(argcount, sizeof(int));
     if (res->types == NULL) {
       _cuda_freekernel(res);
       strb_clear(&src);
-      strb_clear(&bin);
       cuda_exit(ctx);
       FAIL(NULL, GA_MEMORY_ERROR);
     }
@@ -1279,7 +1282,6 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
     if (res->args == NULL) {
       _cuda_freekernel(res);
       strb_clear(&src);
-      strb_clear(&bin);
       cuda_exit(ctx);
       FAIL(NULL, GA_MEMORY_ERROR);
     }
@@ -1288,11 +1290,9 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count,
     if (ctx->err != CUDA_SUCCESS) {
       _cuda_freekernel(res);
       strb_clear(&src);
-      strb_clear(&bin);
       cuda_exit(ctx);
       FAIL(NULL, GA_IMPL_ERROR);
     }
-    strb_clear(&bin);
 
     ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
     if (ctx->err != CUDA_SUCCESS) {
@@ -1390,6 +1390,16 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
     return GA_NO_ERROR;
 }
 
+static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) { /* Return a heap copy of the binary stored on kernel k */
+  void *res = malloc(k->bin_sz); /* separate buffer; k->bin itself is not handed out */
+  if (res == NULL)
+    return GA_MEMORY_ERROR; /* *sz and *obj are left untouched on failure */
+  memcpy(res, k->bin, k->bin_sz);
+  *sz = k->bin_sz; /* size of the copy, in bytes */
+  *obj = res; /* malloc'ed above and not freed in this function */
+  return GA_NO_ERROR;
+}
+
 static int cuda_sync(gpudata *b) {
   cuda_context *ctx = (cuda_context *)b->ctx;
   int err = GA_NO_ERROR;
@@ -1792,6 +1802,7 @@ const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count,
                                       cuda_freekernel,
                                       cuda_kernelsetarg,
                                       cuda_callkernel,
+                                      cuda_kernelbin,
                                       cuda_sync,
                                       cuda_transfer,
                                       cuda_property,
diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c
index e1e8fdd82a..3e87f23c41 100644
--- a/src/gpuarray_buffer_opencl.c
+++ b/src/gpuarray_buffer_opencl.c
@@ -1076,6 +1076,34 @@ static int cl_callkernel(gpukernel *k, unsigned int n,
   return GA_NO_ERROR;
 }
 
+static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) { /* Fetch the binary of the OpenCL program that kernel k belongs to */
+  cl_ctx *ctx = k->ctx;
+  cl_program p;
+  size_t rsz; /* receives the binary size reported by OpenCL */
+  void *res; /* malloc'ed buffer that receives the binary */
+
+  ASSERT_KER(k);
+  ASSERT_CTX(ctx);
+
+  ctx->err = clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL); /* look up the program that owns the kernel */
+  if (ctx->err != CL_SUCCESS)
+    return GA_IMPL_ERROR;
+  ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL); /* NOTE(review): room for a single size only; presumably the program targets one device -- confirm */
+  if (ctx->err != CL_SUCCESS)
+    return GA_IMPL_ERROR;
+  res = malloc(rsz);
+  if (res == NULL)
+    return GA_MEMORY_ERROR;
+  ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); /* &res is passed as a one-entry array of destination pointers */
+  if (ctx->err != CL_SUCCESS) {
+    free(res); /* do not leak the buffer when the query fails */
+    return GA_IMPL_ERROR;
+  }
+  *sz = rsz;
+  *obj = res; /* not freed here; the buffer is handed to the caller */
+  return GA_NO_ERROR;
+}
+
 static int cl_sync(gpudata *b) {
   cl_ctx *ctx = (cl_ctx *)b->ctx;
 
@@ -1437,6 +1465,7 @@ const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count,
                                         cl_releasekernel,
                                         cl_setkernelarg,
                                         cl_callkernel,
+                                        cl_kernelbin,
                                         cl_sync,
                                         cl_transfer,
                                         cl_property,
diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c
index ddebd3e9dc..b7d5011f5b 100644
--- a/src/gpuarray_error.c
+++ b/src/gpuarray_error.c
@@ -25,7 +25,6 @@ const char *gpuarray_error_str(int err) {
   case GA_COMM_ERROR:        return "Error in collectives call";
   case GA_XLARGE_ERROR:      return "Input size too large for operation";
   case GA_LOAD_ERROR:        return "Error loading library";
-  case GA_DEPRECATED_ERROR:  return "Deprecated (removed) functionality";
   default: return "Unknown GA error";
   }
 }
diff --git a/src/private.h b/src/private.h
index 2de8742674..820ebb6287 100644
--- a/src/private.h
+++ b/src/private.h
@@ -100,6 +100,7 @@ struct _gpuarray_buffer_ops {
                      const size_t *gs, const size_t *ls,
                      size_t shared, void **args);
 
+  int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);
   int (*buffer_sync)(gpudata *b);
   int (*buffer_transfer)(gpudata *dst, size_t dstoff,
                          gpudata *src, size_t srcoff, size_t sz);
diff --git a/src/private_cuda.h b/src/private_cuda.h
index a0b4557977..ad9ff7f8ae 100644
--- a/src/private_cuda.h
+++ b/src/private_cuda.h
@@ -137,6 +137,8 @@ struct _gpukernel {
   CUmodule m;
   CUfunction k;
   void **args;
+  size_t bin_sz;
+  void *bin;
   int *types;
   unsigned int argcount;
   unsigned int refcnt;

From 330703150aaaa7614faa8a524319c9e8977e8ec1 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 30 Mar 2017 11:54:09 -0400
Subject: [PATCH 33/41] Add a description of the purpose of make.bat.

---
 make.bat | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/make.bat b/make.bat
index 16bc79f441..3402206e00 100755
--- a/make.bat
+++ b/make.bat
@@ -1,3 +1,7 @@
+REM This helps repetitive builds on windows
+REM It needs the compiler you want to use to be available in the shell
+REM and it will build a release version
+
 del bld
 mkdir bld
 cd bld

From e62616b4d6a04c2ba3fdcba4f3d000df3ff7d0a6 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 30 Mar 2017 11:54:34 -0400
Subject: [PATCH 34/41] Clean up the cache management script.

---
 bin/gpuarray-cache | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache
index 04b7e8e68a..3cfb429a76 100644
--- a/bin/gpuarray-cache
+++ b/bin/gpuarray-cache
@@ -1,11 +1,11 @@
 #!/usr/bin/env python
 
 import os
+import sys
 
-def clean(max_size):
+def clean(max_size, path):
     content = []
-    for root, dirs, files in os.walk(os.environ.get('GPUARRAY_CACHE',
-                                                    '~/.gpuarray/cache/')):
+    for root, dirs, files in os.walk(path):
         for file in files:
             fpath = os.path.join(root, file)
             st = os.stat(fpath)
@@ -25,18 +25,18 @@ SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40,
 
 def get_size(s):
     i = 0
-    while i < len(s) and (s[i].isdigit() or s[i] == '.'):
-        i += 1
-    num = s[:i]
-    suf = s[i:]
+    s = s.strip()
+    if s[-1].upper() in SUFFIXES:
+        num = s[:-1]
+        suf = s[-1].upper()
+    else:
+        num = s
+        suf = ""
     num = float(num)
     if suf != "":
-        letter = suf.strip().upper()
-        if letter not in SUFFIXES:
-            raise ValueError("can't interpret %r" % init)
-        mult = SUFFIXES[letter]
+        mult = SUFFIXES[suf]
     else:
-        mult = 0
+        mult = 1
     return int(num * mult)
 
 
@@ -46,6 +46,10 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility')
     parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning (in bytes with suffixes: K, M, G, ...)')
     args = parser.parse_args()
+    path = os.environ.get('GPUARRAY_CACHE_PATH', None)
+    if path is None:
+        print("You need to set GPUARRAY_CACHE_PATH so that this programs knows which path to clean.")
+        sys.exit(1)
 
-    clean(get_size(args.max_size))
+    clean(get_size(args.max_size), path)
 

From 1a1cab6a1307dd0f5fcfa2dc3bed73cc532573a3 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 30 Mar 2017 11:56:17 -0400
Subject: [PATCH 35/41] Add the appropriate bumps.

---
 setup.py           | 4 ++--
 src/CMakeLists.txt | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 10f5996b54..7ec8d3af93 100755
--- a/setup.py
+++ b/setup.py
@@ -5,8 +5,8 @@
 
 MAJOR = 0
 MINOR = 6
-PATCH = 2
-SUFFIX = ''
+PATCH = 3
+SUFFIX = '.dev0' # include the '.'
 FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX)
 
 try:
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0ed776533d..a45db024ff 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -89,7 +89,7 @@ set_target_properties(gpuarray PROPERTIES
   INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
   MACOSX_RPATH OFF
   # This is the shared library version
-  VERSION 2.0
+  VERSION 2.1
   )
 
 add_library(gpuarray-static STATIC ${GPUARRAY_SRC})

From 878566be656aa4eb116c2f5aea70c6818b7e37f2 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 30 Mar 2017 12:13:42 -0400
Subject: [PATCH 36/41] Add comments and fix style.

---
 src/gpuarray_buffer_cuda.c |  2 +-
 src/private_cuda.h         |  2 +-
 src/util/strb.h            | 11 +++++++----
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index fe0c100508..4612573f5f 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -1034,7 +1034,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
 
   if (strb_ensure(ptx, buflen) == 0) {
     err = nvrtcGetPTX(prog, ptx->s+ptx->l);
-    if (err == NVRTC_SUCCESS) ptx->l = buflen;
+    if (err == NVRTC_SUCCESS) ptx->l += buflen;
   }
 
 end:
diff --git a/src/private_cuda.h b/src/private_cuda.h
index ad9ff7f8ae..dc81ceba52 100644
--- a/src/private_cuda.h
+++ b/src/private_cuda.h
@@ -68,7 +68,7 @@ typedef struct _cuda_context {
   CUstream mem_s;
   gpudata *freeblocks;
   cache *kernel_cache;
-  cache *disk_cache;
+  cache *disk_cache; // This is per-context to avoid lock contention
   unsigned int enter;
   unsigned char major;
   unsigned char minor;
diff --git a/src/util/strb.h b/src/util/strb.h
index 01ea7a2495..3289de5796 100644
--- a/src/util/strb.h
+++ b/src/util/strb.h
@@ -46,7 +46,7 @@ strb *strb_alloc(size_t s);
  *
  * Don't call this for stack of global declarations, see strb_clear() instead.
  */
-void strb_free(strb *);
+void strb_free(strb *sb);
 
 /*
  * Return a pointer to a dynamically allocated strb with a default
@@ -96,7 +96,7 @@ static inline void strb_clear(strb *sb) {
  * This should almost never be called directly.  Use strb_ensure()
  * instead.
  */
-int strb_grow(strb *, size_t s);
+int strb_grow(strb *sb, size_t s);
 
 /*
  * Make sure there is space to store at least `s` bytes of data after
@@ -159,14 +159,17 @@ static inline void strb_appendb(strb *sb, const strb *sb2) {
  *
  * A format error will place the strb in error mode.
  */
-void strb_appendf(strb *, const char *f, ...);
+void strb_appendf(strb *sb, const char *f, ...);
 
 /*
  * Reads from the file specified by the given file descriptor.
  *
+ * This will read `sz` bytes from the file descriptor.  Insufficient
+ * data is handled as a read error.
+ *
  * A read error will place the strb in error mode.
  */
-void strb_read(strb *, int fd, size_t sz);
+void strb_read(strb *sb, int fd, size_t sz);
 
 /*
  * Write the content of an strb to the specified file descriptor.

From 6934341bc62d7d286e7ed849c1af99d4ec2caec3 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 30 Mar 2017 12:36:06 -0400
Subject: [PATCH 37/41] Add some explanatory comments.

---
 src/cache/disk.c           | 5 +++++
 src/gpuarray_buffer_cuda.c | 2 +-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/cache/disk.c b/src/cache/disk.c
index 6a5e086a10..601690a421 100644
--- a/src/cache/disk.c
+++ b/src/cache/disk.c
@@ -84,6 +84,7 @@ typedef struct _disk_cache {
 } disk_cache;
 
 
+/* Convert unsigned long long from network to host order */
 static unsigned long long ntohull(const char *_in) {
   const unsigned char *in = (const unsigned char *)_in;
   return ((unsigned long long)in[0] << 56 | (unsigned long long)in[1] << 48 |
@@ -92,6 +93,7 @@ static unsigned long long ntohull(const char *_in) {
           (unsigned long long)in[6] << 8 | (unsigned long long)in[7]);
 }
 
+/* Convert unsigned long long from host to network order */
 static void htonull(unsigned long long in, char *out) {
   out[0] = (unsigned char)(in >> 56);
   out[1] = (unsigned char)(in >> 48);
@@ -103,6 +105,8 @@ static void htonull(unsigned long long in, char *out) {
   out[7] = (unsigned char)(in);
 }
 
+/* Concatenate prefix and suffix into a single path string while
+   checking for overflow */
 static int catp(char *path, const char *dirp, const char *rpath) {
   if (strlcpy(path, dirp, PATH_MAX) >= PATH_MAX) {
     errno = ENAMETOOLONG;
@@ -115,6 +119,7 @@ static int catp(char *path, const char *dirp, const char *rpath) {
   return 0;
 }
 
+/* open() for a path specified by the concatenation of dirp and rpath */
 static int openp(const char *dirp, const char *rpath, int flags, int mode) {
   char path[PATH_MAX];
 
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 4612573f5f..8614eced61 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -1025,7 +1025,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
     strb_appends(log, "NVRTC compile log::\n");
     if (strb_ensure(log, buflen) == 0)
       if (nvrtcGetProgramLog(prog, log->s+log->l) == NVRTC_SUCCESS)
-        log->l += buflen - 1;
+        log->l += buflen - 1; // Remove the final NUL
     strb_appendc(log, '\n');
   }
 

From 137395c005a430f5a951be8397d5eea949b8f73d Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Thu, 30 Mar 2017 18:56:48 -0400
Subject: [PATCH 38/41] Fix aliasing of src strb between the caches.

---
 src/gpuarray_buffer_cuda.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 8614eced61..a406a85852 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -1120,17 +1120,23 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) {
   err = make_bin(ctx, &ptx, bin, log);
   if (err != GA_NO_ERROR) return err;
   if (ctx->disk_cache) {
-    pk = memdup(&k, sizeof(k));
+    pk = calloc(sizeof(kernel_key), 1);
     if (pk == NULL)
       return GA_NO_ERROR;
+    memcpy(pk->bin_id, k.bin_id, 64);
+    strb_appendb(&pk->src, src);
+    if (strb_error(&pk->src)) {
+      key_free((cache_key_t)pk);
+      return GA_NO_ERROR;
+    }
     cbin = strb_alloc(bin->l);
     if (cbin == NULL) {
-      free(pk);
+      key_free((cache_key_t)pk);
       return GA_NO_ERROR;
     }
     strb_appendb(cbin, bin);
     if (strb_error(cbin)) {
-      free(pk);
+      key_free((cache_key_t)pk);
       strb_free(cbin);
       return GA_NO_ERROR;
     }

From a708ed715ada7976d69ec44cf2ad4507b5e6f36b Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 4 Apr 2017 17:22:53 -0400
Subject: [PATCH 39/41] Add shitty error messages when the disk cache fails.

They will be upgraded to better messages with the PR about better error messages.
---
 src/gpuarray_buffer_cuda.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index a406a85852..bbef0ab2a0 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -224,14 +224,19 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) {
                           (cache_hash_fn)key_hash,
                           (cache_freek_fn)key_free,
                           (cache_freev_fn)strb_free);
-    if (mem_cache == NULL)
+    if (mem_cache == NULL) {
+      // TODO use better error messages when they are available.
+      fprintf(stderr, "Error initializing disk cache, disabling\n");
       goto fail_disk_cache;
+    }
     res->disk_cache = cache_disk(cache_path, mem_cache,
                                  (kwrite_fn)key_write,
                                  (vwrite_fn)kernel_write,
                                  (kread_fn)key_read,
                                  (vread_fn)kernel_read);
     if (res->disk_cache == NULL) {
+      // TODO use better error messages when they are available.
+      fprintf(stderr, "Error initializing disk cache, disabling\n");
       cache_destroy(mem_cache);
       goto fail_disk_cache;
     }
@@ -1121,26 +1126,38 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) {
   if (err != GA_NO_ERROR) return err;
   if (ctx->disk_cache) {
     pk = calloc(sizeof(kernel_key), 1);
-    if (pk == NULL)
+    if (pk == NULL) {
+      // TODO use better error messages
+      fprintf(stderr, "Error adding kernel to disk cache\n");
       return GA_NO_ERROR;
+    }
     memcpy(pk->bin_id, k.bin_id, 64);
     strb_appendb(&pk->src, src);
     if (strb_error(&pk->src)) {
+      // TODO use better error messages
+      fprintf(stderr, "Error adding kernel to disk cache\n");
       key_free((cache_key_t)pk);
       return GA_NO_ERROR;
     }
     cbin = strb_alloc(bin->l);
     if (cbin == NULL) {
+      // TODO use better error messages
+      fprintf(stderr, "Error adding kernel to disk cache\n");
       key_free((cache_key_t)pk);
       return GA_NO_ERROR;
     }
     strb_appendb(cbin, bin);
     if (strb_error(cbin)) {
+      // TODO use better error messages
+      fprintf(stderr, "Error adding kernel to disk cache\n");
       key_free((cache_key_t)pk);
       strb_free(cbin);
       return GA_NO_ERROR;
     }
-    cache_add(ctx->disk_cache, pk, cbin);
+    if (cache_add(ctx->disk_cache, pk, cbin)) {
+      // TODO use better error messages
+      fprintf(stderr, "Error adding kernel to disk cache\n");
+    }
   }
 
   return GA_NO_ERROR;

From e1bd3c803c5ab9bd2bfcb1721bf6f741686e07f7 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Tue, 4 Apr 2017 18:20:51 -0400
Subject: [PATCH 40/41] Add additional info to the kernel cache key.

---
 src/gpuarray_buffer_cuda.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index bbef0ab2a0..894de1a8f3 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -50,10 +50,17 @@ static int detect_arch(const char *prefix, char *ret, CUresult *err);
 static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size);
 
 typedef struct _kernel_key {
+  uint8_t version;
+  uint8_t debug;
+  uint8_t major;
+  uint8_t minor;
+  uint32_t reserved;
   char bin_id[64];
   strb src;
 } kernel_key;
 
+#define KERNEL_KEY_MM (sizeof(kernel_key) - sizeof(strb))
+
 static void key_free(cache_key_t _k) {
   kernel_key *k = (kernel_key *)_k;
   strb_clear(&k->src);
@@ -70,36 +77,36 @@ static uint32_t strb_hash(strb *k) {
 }
 
 static int key_eq(kernel_key *k1, kernel_key *k2) {
-  return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 &&
+  return (memcmp(k1, k2, KERNEL_KEY_MM) == 0 &&
           strb_eq(&k1->src, &k2->src));
 }
 
 static int key_hash(kernel_key *k) {
   XXH32_state_t state;
   XXH32_reset(&state, 42);
-  XXH32_update(&state, k->bin_id, 64);
+  XXH32_update(&state, k, KERNEL_KEY_MM);
   XXH32_update(&state, k->src.s, k->src.l);
   return XXH32_digest(&state);
 }
 
 static int key_write(strb *res, kernel_key *k) {
-  strb_appendn(res, k->bin_id, 64);
+  strb_appendn(res, (const char *)k, KERNEL_KEY_MM);
   strb_appendb(res, &k->src);
   return strb_error(res);
 }
 
 static kernel_key *key_read(const strb *b) {
   kernel_key *k;
-  if (b->l < 64) return NULL;
+  if (b->l < KERNEL_KEY_MM) return NULL;
   k = calloc(1, sizeof(*k));
   if (k == NULL) return NULL;
-  if (strb_ensure(&k->src, b->l - 64) != 0) {
+  if (strb_ensure(&k->src, b->l - KERNEL_KEY_MM) != 0) {
     strb_clear(&k->src);
     free(k);
     return NULL;
   }
-  memcpy(k->bin_id, b->s, 64);
-  strb_appendn(&k->src, b->s+64, b->l-64);
+  memcpy(k->bin_id, b->s, KERNEL_KEY_MM);
+  strb_appendn(&k->src, b->s + KERNEL_KEY_MM, b->l - KERNEL_KEY_MM);
   return k;
 }
 
@@ -1108,6 +1115,13 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) {
   kernel_key *pk;
   int err;
 
+  memset(&k, 0, sizeof(k));
+  k.version = 0;
+#ifdef DEBUG
+  k.debug = 1;
+#endif
+  k.major = ctx->major;
+  k.minor = ctx->minor;
   memcpy(k.bin_id, ctx->bin_id, 64);
   memcpy(&k.src, src, sizeof(strb));
 
@@ -1131,7 +1145,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) {
       fprintf(stderr, "Error adding kernel to disk cache\n");
       return GA_NO_ERROR;
     }
-    memcpy(pk->bin_id, k.bin_id, 64);
+    memcpy(pk, &k, KERNEL_KEY_MM);
     strb_appendb(&pk->src, src);
     if (strb_error(&pk->src)) {
       // TODO use better error messages

From 9306b9692904fa0481aa757e0cc177b09dd89701 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 5 Apr 2017 13:58:30 -0400
Subject: [PATCH 41/41] Fix bug in key_read and explain KERNEL_KEY_MM.

---
 src/gpuarray_buffer_cuda.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
index 894de1a8f3..47bc7ac526 100644
--- a/src/gpuarray_buffer_cuda.c
+++ b/src/gpuarray_buffer_cuda.c
@@ -59,6 +59,7 @@ typedef struct _kernel_key {
   strb src;
 } kernel_key;
 
+/* Size of the kernel_key that we can memcopy to duplicate */
 #define KERNEL_KEY_MM (sizeof(kernel_key) - sizeof(strb))
 
 static void key_free(cache_key_t _k) {
@@ -100,12 +101,16 @@ static kernel_key *key_read(const strb *b) {
   if (b->l < KERNEL_KEY_MM) return NULL;
   k = calloc(1, sizeof(*k));
   if (k == NULL) return NULL;
+  memcpy(k, b->s, KERNEL_KEY_MM);
+  if (k->version != 0) {
+    free(k);
+    return NULL;
+  }
   if (strb_ensure(&k->src, b->l - KERNEL_KEY_MM) != 0) {
     strb_clear(&k->src);
     free(k);
     return NULL;
   }
-  memcpy(k->bin_id, b->s, KERNEL_KEY_MM);
   strb_appendn(&k->src, b->s + KERNEL_KEY_MM, b->l - KERNEL_KEY_MM);
   return k;
 }