From 9e80f85dcac94bedaea6857772a26864aab207e3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 6 Dec 2016 07:54:14 +0100 Subject: [PATCH 01/41] Add a strb method to read from a file. --- src/util/strb.c | 18 +++++++++++++++++- src/util/strb.h | 7 +++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/util/strb.c b/src/util/strb.c index b202b5065c..273aa8fa6e 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -1,5 +1,7 @@ - +#include <errno.h> #include <stdarg.h> +#include <unistd.h> + #include "util/strb.h" strb *strb_alloc(size_t i) { @@ -55,3 +57,17 @@ void strb_appendf(strb *sb, const char *f, ...) { va_end(ap); sb->l += s; } + +void strb_read(strb *sb, int fd, size_t sz) { + ssize_t res; + char *b; + if (strb_ensure(sb, sz)) return; + b = sb->s + sb->l; + sb->l += sz; + while (sz) { + res = read(fd, b, sz); + if (res == -1 && !(errno == EAGAIN || errno == EINTR)) { strb_seterror(sb); return; } + sz -= (size_t)res; + b += (size_t)res; + } +} diff --git a/src/util/strb.h b/src/util/strb.h index b2f18449d7..490031969d 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -161,6 +161,13 @@ static inline void strb_appendb(strb *sb, strb *sb2) { */ GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...); +/* + * Reads from the file specified by the given file descriptor. + * + * A read error will place the strb in error mode. + */ +GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz); + /* * Returns a C string from the content of the strb. * From ea6dfb6b82f8c5e6a274658331f94567f76ce91f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Dec 2016 06:49:00 +0100 Subject: [PATCH 02/41] Import skein, stripping away everything but skein512-512 and the portable goo. 
--- src/util/CMakeLists.txt | 1 + src/util/skein.c | 309 ++++++++++++++++++++++++++++++++++++++++ src/util/skein.h | 145 +++++++++++++++++++ 3 files changed, 455 insertions(+) create mode 100644 src/util/skein.c create mode 100644 src/util/skein.h diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt index 61a603b44a..5c21cc3e90 100644 --- a/src/util/CMakeLists.txt +++ b/src/util/CMakeLists.txt @@ -2,4 +2,5 @@ set_rel(UTIL_SRC strb.c xxhash.c integerfactoring.c +skein.c ) diff --git a/src/util/skein.c b/src/util/skein.c new file mode 100644 index 0000000000..50285ea2c9 --- /dev/null +++ b/src/util/skein.c @@ -0,0 +1,309 @@ +/*********************************************************************** +** +** Implementation of the Skein hash function. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +************************************************************************/ + +#include <string.h> /* get the memcpy/memset functions */ +#include "skein.h" /* get the Skein API definitions */ + +#define MK_64 SKEIN_MK_64 + +/* blkSize = 512 bits. 
hashSize = 512 bits */ +static const u64b_t SKEIN_512_IV_512[] = + { + MK_64(0x4903ADFF,0x749C51CE), + MK_64(0x0D95DE39,0x9746DF03), + MK_64(0x8FD19341,0x27C79BCE), + MK_64(0x9A255629,0xFF352CB1), + MK_64(0x5DB62599,0xDF6CA7B0), + MK_64(0xEABE394C,0xA9D5C3F4), + MK_64(0x991112C7,0x1A75B523), + MK_64(0xAE18A40B,0x660FCC33) + }; + +static void Skein_Put64_LSB_First(u08b_t *dst,const u64b_t *src,size_t bCnt) { + size_t n; + + for (n = 0; n < bCnt; n++) + dst[n] = (u08b_t)(src[n>>3] >> (8*(n&7))); +} + +static void Skein_Get64_LSB_First(u64b_t *dst, const u08b_t *src, + size_t wCnt) { + size_t n; + + for (n=0; n<8*wCnt; n+=8) + dst[n/8] = (((u64b_t) src[n ])) + + (((u64b_t) src[n+1]) << 8) + + (((u64b_t) src[n+2]) << 16) + + (((u64b_t) src[n+3]) << 24) + + (((u64b_t) src[n+4]) << 32) + + (((u64b_t) src[n+5]) << 40) + + (((u64b_t) src[n+6]) << 48) + + (((u64b_t) src[n+7]) << 56) ; +} + +static u64b_t Skein_Swap64(u64b_t in) { + u64b_t o; + u08b_t *out = (u08b_t *)&o; + out[7] = in >> 56; + out[6] = in >> 48; + out[5] = in >> 40; + out[4] = in >> 32; + out[3] = in >> 24; + out[2] = in >> 16; + out[1] = in >> 8; + out[0] = in; + return o; +} + +/*****************************************************************/ +/* Function to process blkCnt (nonzero) full block(s) of data. 
*/ +#define BLK_BITS (WCNT*64) /* some useful definitions for \ + code here */ +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +#define RotL_64(x,N) (((x) << (N)) | ((x) >> (64-(N)))) + +static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const u08b_t *blkPtr, + size_t blkCnt, size_t byteCntAdd) { + enum { + WCNT = SKEIN_512_STATE_WORDS + }; +#define RCNT (SKEIN_512_ROUNDS_TOTAL/8) + + u64b_t kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ + u64b_t X0,X1,X2,X3,X4,X5,X6,X7; /* local copy of vars, for speed */ + u64b_t w [WCNT]; /* local copy of input block */ + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* this implementation only supports 2**64 input bytes (no carry out here) */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */ + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + /* run the rounds */ +#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \ + X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \ + X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \ + +#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) /* 
unrolled */ \ + Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum) + +#define I512(R) \ + X0 += ks[((R)+1) % 9]; /* inject the key schedule value */ \ + X1 += ks[((R)+2) % 9]; \ + X2 += ks[((R)+3) % 9]; \ + X3 += ks[((R)+4) % 9]; \ + X4 += ks[((R)+5) % 9]; \ + X5 += ks[((R)+6) % 9] + ts[((R)+1) % 3]; \ + X6 += ks[((R)+7) % 9] + ts[((R)+2) % 3]; \ + X7 += ks[((R)+8) % 9] + (R)+1; + + { + +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1); \ + R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2); \ + R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3); \ + R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4); \ + I512(2*(R)); \ + R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5); \ + R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6); \ + R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7); \ + R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8); \ + I512(2*(R)+1); /* and key injection */ + + R512_8_rounds( 0); + +#define R512_Unroll_R(NN) (SKEIN_512_ROUNDS_TOTAL/8 > (NN)) + + #if R512_Unroll_R( 1) + R512_8_rounds( 1); + #endif + #if R512_Unroll_R( 2) + R512_8_rounds( 2); + #endif + #if R512_Unroll_R( 3) + R512_8_rounds( 3); + #endif + #if R512_Unroll_R( 4) + R512_8_rounds( 4); + #endif + #if R512_Unroll_R( 5) + R512_8_rounds( 5); + #endif + #if R512_Unroll_R( 6) + R512_8_rounds( 6); + #endif + #if R512_Unroll_R( 7) + R512_8_rounds( 7); + #endif + #if R512_Unroll_R( 8) + R512_8_rounds( 8); + #endif + #if R512_Unroll_R( 9) + R512_8_rounds( 9); + #endif + #if R512_Unroll_R(10) + R512_8_rounds(10); + #endif + #if R512_Unroll_R(11) + R512_8_rounds(11); + #endif + #if R512_Unroll_R(12) + R512_8_rounds(12); + #endif + #if R512_Unroll_R(13) + R512_8_rounds(13); + #endif + #if R512_Unroll_R(14) + R512_8_rounds(14); + #endif + } + + /* do the final "feedforward" xor, update context chaining vars */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + 
} + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +/*****************************************************************/ +/* 512-bit Skein */ +/*****************************************************************/ + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* init the context for a straight hashing operation */ +int Skein_512_Init(Skein_512_Ctxt_t *ctx) { + ctx->h.hashBitLen = 512; /* output hash bit count */ + memcpy(ctx->X,SKEIN_512_IV_512,sizeof(ctx->X)); + + /* Set up to process the data message portion of the hash (default) */ + Skein_Start_New_Type(ctx,MSG); /* T0=0, T1= MSG type */ + + return SKEIN_SUCCESS; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* process the input bytes */ +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, + size_t msgByteCnt) { + size_t n; + + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) { + if (ctx->h.bCnt) { /* finish up any buffered message data */ + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ + if (n) { + Skein_assert(n < msgByteCnt); /* check on our logic here */ + memcpy(&ctx->b[ctx->h.bCnt],msg,n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* now process any remaining full blocks, directly from input message data */ + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) { + n = (msgByteCnt-1) / SKEIN_512_BLOCK_BYTES; /* number of full blocks to process */ + Skein_512_Process_Block(ctx,msg,n,SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) { + 
Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); + memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return SKEIN_SUCCESS; +} + +/*++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ +/* finalize the hash computation and output the result */ +int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) { + size_t i,n,byteCnt; + u64b_t X[SKEIN_512_STATE_WORDS]; + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES,SKEIN_FAIL); /* catch uninitialized context */ + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ + memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + + Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + + /* now output the result */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ + + /* run Threefish in "counter mode" to generate output */ + memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the counter */ + memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ + for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) { + ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + Skein_Start_New_Type(ctx,OUT_FINAL); + Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal+i*SKEIN_512_BLOCK_BYTES,ctx->X,n); /* "output" the ctr mode bytes */ + memcpy(ctx->X,X,sizeof(X)); /* restore the counter mode key for next time */ + } + return SKEIN_SUCCESS; +} diff --git a/src/util/skein.h b/src/util/skein.h new file mode 100644 index 0000000000..f21e64409a --- /dev/null +++ b/src/util/skein.h @@ -0,0 +1,145 @@ +#ifndef _SKEIN_H_ +#define _SKEIN_H_ 1 
+/************************************************************************** +** +** Interface declarations and internal definitions for Skein hashing. +** +** Source code author: Doug Whiting, 2008. +** +** This algorithm and source code is released to the public domain. +** +*************************************************************************** +** +** The following compile-time switches may be defined to control some +** tradeoffs between speed, code size, error checking, and security. +** +** The "default" note explains what happens when the switch is not defined. +** +** SKEIN_ERR_CHECK -- how error checking is handled inside Skein +** code. If not defined, most error checking +** is disabled (for performance). Otherwise, +** the switch value is interpreted as: +** 0: use assert() to flag errors +** 1: return SKEIN_FAIL to flag errors +** +***************************************************************************/ +#ifdef __cplusplus +extern "C" +{ +#endif + +#include <stddef.h> /* get size_t definition */ +#include <stdint.h> +typedef unsigned int uint_t; +typedef uint8_t u08b_t; +typedef uint64_t u64b_t; + +enum { + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1 +}; + +#define SKEIN_MODIFIER_WORDS ( 2) /* number of modifier (tweak) words */ + +#define SKEIN_512_STATE_WORDS ( 8) + +#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS) + +typedef struct { + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + u64b_t T[SKEIN_MODIFIER_WORDS]; /* tweak words: T[0]=byte cnt, T[1]=flags */ +} Skein_Ctxt_Hdr_t; + +typedef struct { /* 512-bit Skein hash context structure */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ +} 
Skein_512_Ctxt_t; + +/* Skein APIs for (incremental) "straight hashing" */ +int Skein_512_Init (Skein_512_Ctxt_t *ctx); +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); +int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); + +/***************************************************************** +** "Internal" Skein definitions +** -- not needed for sequential hashing API, but will be +** helpful for other uses of Skein (e.g., tree hash mode). +** -- included here so that they can be shared between +** reference and optimized code. +******************************************************************/ + +/* tweak word T[1]: bit field starting positions */ +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) /* offset 64 because it's the second word */ + +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) /* bits 120..125: type field */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) /* bits 126 : first block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) /* bit 127 : final block flag */ + +/* tweak word T[1]: flag bit definition(s) */ +#define SKEIN_T1_FLAG_FIRST (((u64b_t) 1 ) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((u64b_t) 1 ) << SKEIN_T1_POS_FINAL) + +/* tweak word T[1]: block type field */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ + +#define SKEIN_T1_BLK_TYPE(T) (((u64b_t) (SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) /* message processing */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) /* output stage */ + +#define SKEIN_T1_BLK_TYPE_OUT_FINAL (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((u64b_t) (hi32)) << 32)) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22) + +/* +** Skein macros for setting tweak words, etc. 
+**/ +#define Skein_Set_Tweak(ctxPtr,TWK_NUM,tVal) {(ctxPtr)->h.T[TWK_NUM] = (tVal);} + +#define Skein_Set_T0(ctxPtr,T0) Skein_Set_Tweak(ctxPtr,0,T0) +#define Skein_Set_T1(ctxPtr,T1) Skein_Set_Tweak(ctxPtr,1,T1) + +/* set both tweak words at once */ +#define Skein_Set_T0_T1(ctxPtr,T0,T1) \ + { \ + Skein_Set_T0(ctxPtr,(T0)); \ + Skein_Set_T1(ctxPtr,(T1)); \ + } + +/* set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; */ +#define Skein_Start_New_Type(ctxPtr,BLK_TYPE) \ + { Skein_Set_T0_T1(ctxPtr,0,SKEIN_T1_FLAG_FIRST | SKEIN_T1_BLK_TYPE_##BLK_TYPE); (ctxPtr)->h.bCnt=0; } + +/************************************************** +** "Internal" Skein definitions for error checking +***************************************************/ + +#include <assert.h> +#define Skein_Assert(x,retCode) { if (!(x)) return retCode; } /* caller error */ +#define Skein_assert(x) assert(x) /* internal error */ + +/***************************************************************** +** Skein block function constants (shared across Ref and Opt code) +******************************************************************/ +enum { + /* Skein_512 round rotation constants */ + R_512_0_0=46, R_512_0_1=36, R_512_0_2=19, R_512_0_3=37, + R_512_1_0=33, R_512_1_1=27, R_512_1_2=14, R_512_1_3=42, + R_512_2_0=17, R_512_2_1=49, R_512_2_2=36, R_512_2_3=39, + R_512_3_0=44, R_512_3_1= 9, R_512_3_2=54, R_512_3_3=56, + R_512_4_0=39, R_512_4_1=30, R_512_4_2=34, R_512_4_3=24, + R_512_5_0=13, R_512_5_1=50, R_512_5_2=10, R_512_5_3=17, + R_512_6_0=25, R_512_6_1=29, R_512_6_2=39, R_512_6_3=43, + R_512_7_0= 8, R_512_7_1=35, R_512_7_2=56, R_512_7_3=22, +}; + +#ifdef __cplusplus +} +#endif + +#endif /* ifndef _SKEIN_H_ */ From 0cb043f2fcf72396c85088dfb36897553353d777 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 7 Dec 2016 12:21:55 +0100 Subject: [PATCH 03/41] Add a function to do all the steps. 
--- src/util/skein.c | 8 ++++++++ src/util/skein.h | 1 + 2 files changed, 9 insertions(+) diff --git a/src/util/skein.c b/src/util/skein.c index 50285ea2c9..51362e5efb 100644 --- a/src/util/skein.c +++ b/src/util/skein.c @@ -307,3 +307,11 @@ int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) { } return SKEIN_SUCCESS; } + +int Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal) { + Skein_512_Ctxt_t ctx; + if (Skein_512_Init(&ctx)) return SKEIN_FAIL; + if (Skein_512_Update(&ctx, msg, msgByteCnt)) return SKEIN_FAIL; + if (Skein_512_Final(&ctx, hashVal)) return SKEIN_FAIL; + return SKEIN_SUCCESS; +} diff --git a/src/util/skein.h b/src/util/skein.h index f21e64409a..89d7ebf209 100644 --- a/src/util/skein.h +++ b/src/util/skein.h @@ -63,6 +63,7 @@ typedef struct { /* 512-bit Skein hash context structure */ int Skein_512_Init (Skein_512_Ctxt_t *ctx); int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, size_t msgByteCnt); int Skein_512_Final (Skein_512_Ctxt_t *ctx, u08b_t * hashVal); +int Skein_512(const u08b_t *msg, size_t msgByteCnt, u08b_t *hashVal); /***************************************************************** ** "Internal" Skein definitions From dd06f091bbc518883a5e117a931ef9b9008aaecc Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 21:06:31 +0100 Subject: [PATCH 04/41] Add strb_write(). 
--- src/util/strb.c | 24 +++++++++++++++++++++++- src/util/strb.h | 7 +++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/util/strb.c b/src/util/strb.c index 273aa8fa6e..15cd496c4f 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -66,8 +66,30 @@ void strb_read(strb *sb, int fd, size_t sz) { sb->l += sz; while (sz) { res = read(fd, b, sz); - if (res == -1 && !(errno == EAGAIN || errno == EINTR)) { strb_seterror(sb); return; } + if (res == -1) { + if (errno == EAGAIN || errno == EINTR) + continue; + strb_seterror(sb); + return; + } sz -= (size_t)res; b += (size_t)res; } } + +int strb_write(int fd, strb *sb) { + ssize_t res; + size_t l = sb->l; + char *b = sb->s; + while (l) { + res = write(fd, b, l); + if (res == -1) { + if (errno == EAGAIN || errno == EINTR) + continue; + return -1; + } + l -= (size_t)res; + b += (size_t)res; + } + return 0; +} diff --git a/src/util/strb.h b/src/util/strb.h index 490031969d..3fc1071ea4 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -168,6 +168,13 @@ GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...); */ GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz); +/* + * Write the content of an strb to the specified file descriptor. + * + * Write errors will be signaled by a nonzero return value. + */ +GPUARRAY_LOCAL int strb_write(int fd, strb *sb); + /* * Returns a C string from the content of the strb. * From c59ddae81487b2d71196f0627d205cbb7dc6714b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 21:55:58 +0100 Subject: [PATCH 05/41] Fix wrong export type. --- src/gpuarray/buffer.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index d6d3dd8a09..800756a072 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -328,9 +328,9 @@ GPUARRAY_PUBLIC int gpudata_move(gpudata *dst, size_t dstoff, * \returns the new buffer in dst_ctx or NULL if no efficient way to * transfer could be found. 
*/ -GPUARRAY_LOCAL int gpudata_transfer(gpudata *dst, size_t dstoff, - gpudata *src, size_t srcoff, - size_t sz); +GPUARRAY_PUBLIC int gpudata_transfer(gpudata *dst, size_t dstoff, + gpudata *src, size_t srcoff, + size_t sz); /** * Transfer data from a buffer to memory. From b12013bf410ea1d7fb6b1a4665a6dc353b530b4f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 22:02:39 +0100 Subject: [PATCH 06/41] Remove GPUARRAY_LOCAL, it is not needed since we default to visibility=hidden --- src/gpuarray/config.h | 6 +----- src/gpuarray_blas_cuda_cublas.c | 2 +- src/gpuarray_blas_opencl_clblas.c | 2 +- src/gpuarray_blas_opencl_clblast.c | 2 +- src/gpuarray_buffer_cuda.c | 3 +-- src/gpuarray_buffer_opencl.c | 3 +-- src/gpuarray_collectives_cuda_nccl.c | 2 +- src/private.h | 28 ++++++++++++++-------------- src/private_config.h.in | 6 +++--- src/private_cuda.h | 17 ++++++++--------- src/private_opencl.h | 8 ++++---- src/util/strb.h | 12 ++++++------ src/util/xxhash.h | 8 ++++---- 13 files changed, 46 insertions(+), 53 deletions(-) diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h index f8fc86a01d..571f81cfe6 100644 --- a/src/gpuarray/config.h +++ b/src/gpuarray/config.h @@ -12,19 +12,15 @@ #else #define GPUARRAY_PUBLIC __declspec(dllimport) #endif - #define GPUARRAY_LOCAL #else #if __GNUC__ >= 4 #define GPUARRAY_PUBLIC __attribute__((visibility ("default"))) - #define GPUARRAY_LOCAL __attribute__((visibility ("hidden"))) #else - #define GPUARRAY_PUBLIC - #define GPUARRAY_LOCAL + #error "Don't know how to export symbols on this platform" #endif #endif #else #define GPUARRAY_PUBLIC - #define GPUARRAY_LOCAL #endif #ifdef _MSC_VER diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c index 6d4648e232..a7b91ed87e 100644 --- a/src/gpuarray_blas_cuda_cublas.c +++ b/src/gpuarray_blas_cuda_cublas.c @@ -1640,7 +1640,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, return GA_NO_ERROR; } -GPUARRAY_LOCAL 
gpuarray_blas_ops cublas_ops = { +gpuarray_blas_ops cublas_ops = { setup, teardown, error, diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 2041710735..8ee019afb7 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -491,7 +491,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, return GA_NO_ERROR; } -GPUARRAY_LOCAL gpuarray_blas_ops clblas_ops = { +gpuarray_blas_ops clblas_ops = { setup, teardown, error, diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 78cca10f20..4a5369e56e 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -525,7 +525,7 @@ static int dger(cb_order order, size_t M, size_t N, double alpha, return GA_NO_ERROR; } -GPUARRAY_LOCAL gpuarray_blas_ops clblast_ops = { +gpuarray_blas_ops clblast_ops = { setup, teardown, error, diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 447400f277..1c883deaa7 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -39,7 +39,7 @@ STATIC_ASSERT(sizeof(GpuArrayIpcMemHandle) == sizeof(CUipcMemHandle), cuda_ipcme static CUresult err; -GPUARRAY_LOCAL const gpuarray_buffer_ops cuda_ops; +const gpuarray_buffer_ops cuda_ops; static void cuda_freekernel(gpukernel *); static int cuda_property(gpucontext *, gpudata *, gpukernel *, int, void *); @@ -1689,7 +1689,6 @@ static const char *cuda_error(gpucontext *c) { return errstr; } -GPUARRAY_LOCAL const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count, cuda_get_device_count, cuda_init, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index adf34a3825..84bcb6584d 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -28,7 +28,7 @@ static cl_int err; #define CHKFAIL(v) if (err != CL_SUCCESS) FAIL(v, GA_IMPL_ERROR) -GPUARRAY_LOCAL const gpuarray_buffer_ops opencl_ops; +const gpuarray_buffer_ops opencl_ops; 
static int cl_property(gpucontext *c, gpudata *b, gpukernel *k, int p, void *r); static gpudata *cl_alloc(gpucontext *c, size_t size, void *data, int flags, @@ -1448,7 +1448,6 @@ static const char *cl_error(gpucontext *c) { } } -GPUARRAY_LOCAL const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count, cl_get_device_count, cl_init, diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c index a0f6d12060..e382cfa066 100644 --- a/src/gpuarray_collectives_cuda_nccl.c +++ b/src/gpuarray_collectives_cuda_nccl.c @@ -455,6 +455,6 @@ static int all_gather(gpudata* src, size_t offsrc, gpudata* dest, * linked in \ref gpuarray_buffer_cuda.c, in order to fill a /ref gpucontext's * comm_ops. */ -GPUARRAY_LOCAL gpuarray_comm_ops nccl_ops = { +gpuarray_comm_ops nccl_ops = { comm_new, comm_free, generate_clique_id, get_count, get_rank, reduce, all_reduce, reduce_scatter, broadcast, all_gather}; diff --git a/src/private.h b/src/private.h index 0513df8605..7405cf8995 100644 --- a/src/private.h +++ b/src/private.h @@ -256,26 +256,26 @@ static inline void *memdup(const void *p, size_t s) { return res; } -GPUARRAY_LOCAL int GpuArray_is_c_contiguous(const GpuArray *a); -GPUARRAY_LOCAL int GpuArray_is_f_contiguous(const GpuArray *a); -GPUARRAY_LOCAL int GpuArray_is_aligned(const GpuArray *a); +int GpuArray_is_c_contiguous(const GpuArray *a); +int GpuArray_is_f_contiguous(const GpuArray *a); +int GpuArray_is_aligned(const GpuArray *a); -GPUARRAY_LOCAL extern const gpuarray_type scalar_types[]; -GPUARRAY_LOCAL extern const gpuarray_type vector_types[]; +extern const gpuarray_type scalar_types[]; +extern const gpuarray_type vector_types[]; /* * This function generates the kernel code to perform indexing on var id * from planar index 'i' using the dimensions and strides provided. 
*/ -GPUARRAY_LOCAL void gpuarray_elem_perdim(strb *sb, unsigned int nd, - const size_t *dims, - const ssize_t *str, - const char *id); - -GPUARRAY_LOCAL void gpukernel_source_with_line_numbers(unsigned int count, - const char **news, - size_t *newl, - strb *src); +void gpuarray_elem_perdim(strb *sb, unsigned int nd, + const size_t *dims, + const ssize_t *str, + const char *id); + +void gpukernel_source_with_line_numbers(unsigned int count, + const char **news, + size_t *newl, + strb *src); static inline uint16_t float_to_half(float value) { #define ga__shift 13 diff --git a/src/private_config.h.in b/src/private_config.h.in index 23db862c4f..c3cd3a0195 100644 --- a/src/private_config.h.in +++ b/src/private_config.h.in @@ -39,12 +39,12 @@ extern "C" { #define nelems(a) (sizeof(a)/sizeof(a[0])) #ifndef HAVE_MKSTEMP -GPUARRAY_LOCAL int mkstemp(char *path); +int mkstemp(char *path); #endif #ifndef HAVE_STRL -GPUARRAY_LOCAL size_t strlcpy(char *dst, const char *src, size_t size); -GPUARRAY_LOCAL size_t strlcat(char *dst, const char *src, size_t size); +size_t strlcpy(char *dst, const char *src, size_t size); +size_t strlcat(char *dst, const char *src, size_t size); #endif #ifdef __cplusplus diff --git a/src/private_cuda.h b/src/private_cuda.h index 6fab1597ac..da6f60ad7a 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -93,10 +93,10 @@ STATIC_ASSERT(sizeof(cuda_context) <= sizeof(gpucontext), #define ARCH_PREFIX "compute_" -GPUARRAY_LOCAL cuda_context *cuda_make_ctx(CUcontext ctx, int flags); -GPUARRAY_LOCAL CUstream cuda_get_stream(cuda_context *ctx); -GPUARRAY_LOCAL void cuda_enter(cuda_context *ctx); -GPUARRAY_LOCAL void cuda_exit(cuda_context *ctx); +cuda_context *cuda_make_ctx(CUcontext ctx, int flags); +CUstream cuda_get_stream(cuda_context *ctx); +void cuda_enter(cuda_context *ctx); +void cuda_exit(cuda_context *ctx); struct _gpudata { CUdeviceptr ptr; @@ -115,11 +115,10 @@ struct _gpudata { #endif }; -GPUARRAY_LOCAL gpudata *cuda_make_buf(cuda_context 
*c, CUdeviceptr p, - size_t sz); -GPUARRAY_LOCAL size_t cuda_get_sz(gpudata *g); -GPUARRAY_LOCAL int cuda_wait(gpudata *, int); -GPUARRAY_LOCAL int cuda_record(gpudata *, int); +gpudata *cuda_make_buf(cuda_context *c, CUdeviceptr p, size_t sz); +size_t cuda_get_sz(gpudata *g); +int cuda_wait(gpudata *, int); +int cuda_record(gpudata *, int); /* private flags are in the upper 16 bits */ #define CUDA_WAIT_READ 0x10000 diff --git a/src/private_opencl.h b/src/private_opencl.h index e40242d57e..2a523f5bda 100644 --- a/src/private_opencl.h +++ b/src/private_opencl.h @@ -67,9 +67,9 @@ struct _gpukernel { #endif }; -GPUARRAY_LOCAL cl_ctx *cl_make_ctx(cl_context ctx, int flags); -GPUARRAY_LOCAL cl_command_queue cl_get_stream(gpucontext *ctx); -GPUARRAY_LOCAL gpudata *cl_make_buf(gpucontext *c, cl_mem buf); -GPUARRAY_LOCAL cl_mem cl_get_buf(gpudata *g); +cl_ctx *cl_make_ctx(cl_context ctx, int flags); +cl_command_queue cl_get_stream(gpucontext *ctx); +gpudata *cl_make_buf(gpucontext *c, cl_mem buf); +cl_mem cl_get_buf(gpudata *g); #endif diff --git a/src/util/strb.h b/src/util/strb.h index 3fc1071ea4..267941417e 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -39,14 +39,14 @@ typedef struct _strb { * * Returns NULL on error. */ -GPUARRAY_LOCAL strb *strb_alloc(size_t s); +strb *strb_alloc(size_t s); /* * Frees an strb that was dynamically allocated. * * Don't call this for stack of global declarations, see strb_clear() instead. */ -GPUARRAY_LOCAL void strb_free(strb *); +void strb_free(strb *); /* * Return a pointer to a dynamically allocated strb with a default @@ -96,7 +96,7 @@ static inline void strb_clear(strb *sb) { * This should almost never be called directly. Use strb_ensure() * instead. 
*/ -GPUARRAY_LOCAL int strb_grow(strb *, size_t s); +int strb_grow(strb *, size_t s); /* * Make sure there is space to store at least `s` bytes of data after @@ -159,21 +159,21 @@ static inline void strb_appendb(strb *sb, strb *sb2) { * * A format error will place the strb in error mode. */ -GPUARRAY_LOCAL void strb_appendf(strb *, const char *f, ...); +void strb_appendf(strb *, const char *f, ...); /* * Reads from the file specified by the given file descriptor. * * A read error will place the strb in error mode. */ -GPUARRAY_LOCAL void strb_read(strb *, int fd, size_t sz); +void strb_read(strb *, int fd, size_t sz); /* * Write the content of an strb to the specified file descriptor. * * Write errors will be signaled by a nonzero return value. */ -GPUARRAY_LOCAL int strb_write(int fd, strb *sb); +int strb_write(int fd, strb *sb); /* * Returns a C string from the content of the strb. diff --git a/src/util/xxhash.h b/src/util/xxhash.h index c33938234d..6403b9e6f9 100644 --- a/src/util/xxhash.h +++ b/src/util/xxhash.h @@ -106,7 +106,7 @@ They will be automatically translated by this header. * Simple Hash Functions *****************************/ -GPUARRAY_LOCAL unsigned int XXH32 (const void* input, size_t length, unsigned seed); +unsigned int XXH32 (const void* input, size_t length, unsigned seed); /* XXH32() : @@ -129,9 +129,9 @@ These structures allow static allocation of XXH states. States must then be initialized using XXH32_reset() before first use. 
*/ -GPUARRAY_LOCAL XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed); -GPUARRAY_LOCAL XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); -GPUARRAY_LOCAL unsigned int XXH32_digest (const XXH32_state_t* statePtr); +XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned seed); +XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +unsigned int XXH32_digest (const XXH32_state_t* statePtr); /* These functions calculate the xxHash of an input provided in multiple smaller packets, From 339ac160824a3c916614f8133b1526f39a5aead6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 22:04:47 +0100 Subject: [PATCH 07/41] Remove some useless stuff in xxhash. --- src/util/xxhash.c | 64 +---------------------------------------------- src/util/xxhash.h | 28 --------------------- 2 files changed, 1 insertion(+), 91 deletions(-) diff --git a/src/util/xxhash.c b/src/util/xxhash.c index 58101b0902..bd2447ca16 100644 --- a/src/util/xxhash.c +++ b/src/util/xxhash.c @@ -31,39 +31,6 @@ You can contact the author at : - xxHash source repository : https://github.com/Cyan4973/xxHash */ - -/************************************** -* Tuning parameters -**************************************/ -/* XXH_FORCE_MEMORY_ACCESS - * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. - * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. - * The below switch allow to select different access method for improved performance. - * Method 0 (default) : use `memcpy()`. Safe and portable. - * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). - * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. - * Method 2 : direct access. This method is portable but violate C standard. 
- * It can generate buggy code on targets which generate assembly depending on alignment. - * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) - * See http://stackoverflow.com/a/32095106/646947 for details. - * Prefer these methods in priority order (0 > 1 > 2) - */ -#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ -# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) -# define XXH_FORCE_MEMORY_ACCESS 2 -# elif defined(__INTEL_COMPILER) || \ - (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) -# define XXH_FORCE_MEMORY_ACCESS 1 -# endif -#endif - -/* XXH_ACCEPT_NULL_INPUT_POINTER : - * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. - * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. - * By default, this option is disabled. To enable it, uncomment below define : - */ -/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */ - /* XXH_FORCE_NATIVE_FORMAT : * By default, xxHash library provides endian-independant Hash values, based on little-endian convention. * Results are therefore identical for little-endian and big-endian CPU. @@ -72,7 +39,7 @@ You can contact the author at : * to improve speed for Big-endian CPU. * This option has no impact on Little_Endian CPU. */ -#define XXH_FORCE_NATIVE_FORMAT 0 +#define XXH_FORCE_NATIVE_FORMAT 1 /* XXH_USELESS_ALIGN_BRANCH : * This is a minor performance trick, only useful with lots of very small keys. 
@@ -132,25 +99,6 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp #endif -#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2)) - -/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */ -static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; } - -#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1)) - -/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ -/* currently only defined for gcc and icc */ -typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign; - -static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } - -#else - -/* portable and safe solution. Generally efficient. - * see : http://stackoverflow.com/a/32095106/646947 - */ - static U32 XXH_read32(const void* memPtr) { U32 val; @@ -158,8 +106,6 @@ static U32 XXH_read32(const void* memPtr) return val; } -#endif // XXH_FORCE_DIRECT_MEMORY_ACCESS - /****************************************** * Compiler-specific Functions and Macros @@ -243,14 +189,6 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH U32 h32; #define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) -#ifdef XXH_ACCEPT_NULL_INPUT_POINTER - if (p==NULL) - { - len=0; - bEnd=p=(const BYTE*)(size_t)16; - } -#endif - if (len>=16) { const BYTE* const limit = bEnd - 16; diff --git a/src/util/xxhash.h b/src/util/xxhash.h index 6403b9e6f9..1d11a095fb 100644 --- a/src/util/xxhash.h +++ b/src/util/xxhash.h @@ -74,34 +74,6 @@ extern "C" { typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; -/***************************** -* Namespace Emulation -*****************************/ -/* Motivations : - -If you need to include xxHash into your library, -but wish to avoid xxHash symbols to be present on your library interface -in an effort to avoid potential name collision if another library also includes 
xxHash, - -you can use XXH_NAMESPACE, which will automatically prefix any symbol from xxHash -with the value of XXH_NAMESPACE (so avoid to keep it NULL, and avoid numeric values). - -Note that no change is required within the calling program : -it can still call xxHash functions using their regular name. -They will be automatically translated by this header. -*/ -#ifdef XXH_NAMESPACE -# define XXH_CAT(A,B) A##B -# define XXH_NAME2(A,B) XXH_CAT(A,B) -# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) -# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) -# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) -# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) -# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) -# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) -#endif - - /***************************** * Simple Hash Functions *****************************/ From 7ca34ed68908a348d3c40d80205556d369f321e0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 8 Dec 2016 22:46:22 +0100 Subject: [PATCH 08/41] Disk cache implementation. 
--- src/CMakeLists.txt | 1 + src/cache/disk.c | 301 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 302 insertions(+) create mode 100644 src/cache/disk.c diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 02e32eccd4..0ed776533d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -35,6 +35,7 @@ endmacro() set(_GPUARRAY_SRC cache/lru.c cache/twoq.c +cache/disk.c gpuarray_types.c gpuarray_error.c gpuarray_util.c diff --git a/src/cache/disk.c b/src/cache/disk.c new file mode 100644 index 0000000000..59e6f1ba42 --- /dev/null +++ b/src/cache/disk.c @@ -0,0 +1,301 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cache.h" +#include "private_config.h" +#include "util/strb.h" +#include "util/skein.h" + +#define HEXP_LEN (128 + 2) + +typedef int (*kwrite_fn)(strb *res, cache_key_t key); +typedef int (*vwrite_fn)(strb *res, cache_value_t val); +typedef cache_key_t (*kread_fn)(const strb *b); +typedef cache_value_t (*vread_fn)(const strb *b); + +typedef struct _disk_cache { + cache c; + cache * mem; + kwrite_fn kwrite; + vwrite_fn vwrite; + kread_fn kread; + vread_fn vread; + int dirfd; +} disk_cache; + +/* Decode the big-endian 64-bit integer stored in the 8 bytes at in. */ +static unsigned long long ntohull(const char *in) { + return ((unsigned long long)(unsigned char)in[0] << 56 | (unsigned long long)(unsigned char)in[1] << 48 | + (unsigned long long)(unsigned char)in[2] << 40 | (unsigned long long)(unsigned char)in[3] << 32 | + (unsigned long long)(unsigned char)in[4] << 24 | (unsigned long long)(unsigned char)in[5] << 16 | + (unsigned long long)(unsigned char)in[6] << 8 | (unsigned long long)(unsigned char)in[7]); /* via unsigned char: plain char may be signed and would sign-extend bytes >= 0x80 */ +} + +static void htonull(unsigned long long in, char *out) { + out[0] = in >> 56; + out[1] = in >> 48; + out[2] = in >> 40; + out[3] = in >> 32; + out[4] = in >> 24; + out[5] = in >> 16; + out[6] = in >> 8; + out[7] = in; +} + +static int mkstempat(int dfd, char *template) { + static const char letters[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + size_t length; + char *XXXXXX; + struct timeval tv; + unsigned long long randnum, working; + int i, tries, 
fd; + + length = strlen(template); + if (length < 6) { + errno = EINVAL; + return -1; + } + XXXXXX = template + length - 6; + if (strcmp(XXXXXX, "XXXXXX") != 0) { + errno = EINVAL; + return -1; + } + + /* This is kind of crappy, but the point is to not step on each + other's feet */ + gettimeofday(&tv, NULL); + randnum = ((unsigned long long) tv.tv_usec << 16) ^ tv.tv_sec ^ getpid(); + + for (tries = 0; tries < TMP_MAX; tries++) { + for (working = randnum, i = 0; i < 6; i++) { + XXXXXX[i] = letters[working % 62]; + working /= 62; + } + fd = openat(dfd, template, O_RDWR | O_CREAT | O_EXCL, 0600); + if (fd >= 0 || (errno != EEXIST && errno != EISDIR)) + return fd; + + randnum += (tv.tv_usec >> 10) & 0xfff; + } + errno = EEXIST; + return -1; +} + +static int key_path(disk_cache *c, const cache_key_t key, char *out) { + strb kb = STRB_STATIC_INIT; /* NOTE(review): kb is never cleared on any return path; confirm whether this leaks */ + unsigned char hash[64]; + int i; + + if (c->kwrite(&kb, key)) return -1; + if (Skein_512((unsigned char *)kb.s, kb.l, hash)) return -1; + if (snprintf(out, 10, "%02x%02x/%02x%02x", + hash[0], hash[1], hash[2], hash[3]) != 9) /* the format yields 9 chars; the loop below continues at offset 9 */ + return -1; + for (i = 4; i < 64; i += 4) { + if (snprintf(out+(i * 2 + 1), 9, "%02x%02x%02x%02x", + hash[i], hash[i+1], hash[i+2], hash[i+3]) != 8) + return -1; + } + return 0; +} + +static int write_entry(disk_cache *c, const cache_key_t k, + const cache_value_t v) { + char hexp[HEXP_LEN]; + char tmp_path[] = "tmp.XXXXXXXX"; + strb b = STRB_STATIC_INIT; + size_t kl, vl; + int fd, err; + + if (key_path(c, k, hexp)) return -1; + + if (strb_ensure(&b, 16)) return -1; /* strb_ensure() returns nonzero on failure */ + b.l = 16; + c->kwrite(&b, k); + kl = b.l - 16; + c->vwrite(&b, v); + vl = b.l - kl - 16; + htonull(kl, b.s); + htonull(vl, b.s + 8); + if (strb_error(&b)) { + strb_clear(&b); + return -1; + } + + fd = mkstempat(c->dirfd, tmp_path); + if (fd == -1) { + strb_clear(&b); + return -1; + } + + err = strb_write(fd, &b); + strb_clear(&b); + close(fd); + if (err) { + unlinkat(c->dirfd, tmp_path, 0); + return -1; + } + + if (renameat(c->dirfd, 
tmp_path, c->dirfd, hexp)) { + unlinkat(c->dirfd, tmp_path, 0); + return -1; + } + + return 0; +} + +static int find_entry(disk_cache *c, const cache_key_t key, + cache_key_t *_k, cache_value_t *_v) { + struct stat st; + strb b = STRB_STATIC_INIT; + char *ts; + size_t kl, vl; + cache_key_t k; + char hexp[HEXP_LEN]; + int fd; + + if (key_path(c, key, hexp)) return 0; + + fd = openat(c->dirfd, hexp, O_RDONLY); + + if (fd == -1) return 0; + + if (fstat(fd, &st)) { + close(fd); + return 0; + } + + if (!S_ISREG(st.st_mode)) { + close(fd); + return 0; + } + + strb_read(&b, fd, st.st_size); + close(fd); + + if (strb_error(&b) || b.l < 16) { + strb_clear(&b); + return 0; + } + + kl = ntohull(b.s); + vl = ntohull(b.s + 8); + + if (kl > b.l - 16 || vl > b.l - 16 - kl) { /* same bound as b.l < 16 + kl + vl, but cannot wrap around */ + strb_clear(&b); + return 0; + } + + ts = b.s; + + b.s += 16; + b.l = kl; + + k = c->kread(&b); + if (k && c->c.keq(key, k)) { + if (_v) { + b.s += kl; + b.l = vl; + *_v = c->vread(&b); + if (*_v == NULL) + goto error; + } + if (_k) + *_k = k; + else + c->c.kfree(k); + b.s = ts; + strb_clear(&b); + return 1; + } + error: + if (k) c->c.kfree(k); /* kread() may have returned NULL */ + b.s = ts; + strb_clear(&b); + return 0; +} + +static int disk_add(cache *_c, cache_key_t k, cache_value_t v) { + disk_cache *c = (disk_cache *)_c; + + /* Ignore write errors */ + write_entry(c, k, v); + + return cache_add(c->mem, k, v); +} + +static int disk_del(cache *_c, const cache_key_t key) { + disk_cache *c = (disk_cache *)_c; + char hexp[HEXP_LEN] = {0}; + + cache_del(c->mem, key); + + key_path(c, key, hexp); + + return (unlinkat(c->dirfd, hexp, 0) == 0); +} + +static cache_value_t disk_get(cache *_c, const cache_key_t key) { + disk_cache *c = (disk_cache *)_c; + cache_key_t k; + cache_value_t v; + + v = cache_get(c->mem, key); + if (v != NULL) + return v; + + if (find_entry(c, key, &k, &v)) { + if (cache_add(c->mem, k, v)) return NULL; + return v; + } + return NULL; +} + +static void disk_destroy(cache *_c) { + disk_cache *c = (disk_cache *)_c; + cache_destroy(c->mem); + 
close(c->dirfd); +} + +cache *cache_disk(const char *dirpath, cache *mem, + kwrite_fn kwrite, vwrite_fn vwrite, + kread_fn kread, vread_fn vread) { + struct stat st; + disk_cache *res; + + mkdir(dirpath, 0777); /* This may fail, but we don't care */ + if (lstat(dirpath, &st) != 0) + return NULL; + if (!S_ISDIR(st.st_mode)) + return NULL; + + res = calloc(1, sizeof(*res)); + if (res == NULL) return NULL; + + res->dirfd = open(dirpath, O_RDONLY|O_CLOEXEC); /* a directory cannot be opened O_RDWR (EISDIR) */ + if (res->dirfd == -1) { + free(res); + return NULL; + } + + res->mem = mem; + res->kwrite = kwrite; + res->vwrite = vwrite; + res->kread = kread; + res->vread = vread; + res->c.add = disk_add; + res->c.del = disk_del; + res->c.get = disk_get; + res->c.destroy = disk_destroy; + res->c.keq = mem->keq; + res->c.khash = mem->khash; + res->c.kfree = mem->kfree; + res->c.vfree = mem->vfree; + return (cache *)res; +} From 02b97e7469d6dfd47765414c062ca507e9af6005 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 9 Dec 2016 22:07:18 +0100 Subject: [PATCH 09/41] Cleanup tool. 
--- bin/gpuarray-cache | 51 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 bin/gpuarray-cache diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache new file mode 100644 index 0000000000..528e3eb9a5 --- /dev/null +++ b/bin/gpuarray-cache @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import os + +def clean(max_size): + content = [] + for root, dirs, files in os.walk(os.path.expanduser(os.environ.get('GPUARRAY_CACHE', + '~/.gpuarray/cache/'))): + for file in files: + fpath = os.path.join(root, file) + st = os.stat(fpath) + content.append((st.st_atime, st.st_size, fpath)) + + content.sort(reverse=True) + cur_size = 0 + for _, size, path in content: + cur_size += size + if cur_size > max_size: + os.remove(path) + + +SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40, + 'P': 1 << 50, 'E': 1 << 60, 'Z': 1 << 70, 'Y': 1 << 80} + + +def get_size(s): + i = 0 + while i < len(s) and (s[i].isdigit() or s[i] == '.'): + i += 1 + num = s[:i] + suf = s[i:] + num = float(num) + if suf != "": + letter = suf.strip().upper() + if letter not in SUFFIXES: + raise ValueError("can't interpret %r" % s) + mult = SUFFIXES[letter] + else: + mult = 1 + return int(num * mult) + + +if __name__ == '__main__': + import argparse + + parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility') + parser.add_argument('-s', '--max_size', required=True, help='Set the maximum size for pruning') + args = parser.parse_args() + + clean(get_size(args.max_size)) + From d58675a195b4e752cb47a91b331cdaf13f4131dd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Sat, 10 Dec 2016 09:06:20 +0100 Subject: [PATCH 10/41] Expose the definition of cache_disk(). 
--- src/cache.h | 10 ++++++++++ src/cache/disk.c | 6 ------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/cache.h b/src/cache.h index f2059e73cc..f2e610f3dd 100644 --- a/src/cache.h +++ b/src/cache.h @@ -4,6 +4,7 @@ #include #include #include "private_config.h" +#include "util/strb.h" typedef void *cache_key_t; typedef void *cache_value_t; @@ -13,6 +14,11 @@ typedef uint32_t (*cache_hash_fn)(cache_key_t); typedef void (*cache_freek_fn)(cache_key_t); typedef void (*cache_freev_fn)(cache_value_t); +typedef int (*kwrite_fn)(strb *res, cache_key_t key); +typedef int (*vwrite_fn)(strb *res, cache_value_t val); +typedef cache_key_t (*kread_fn)(const strb *b); +typedef cache_value_t (*vread_fn)(const strb *b); + typedef struct _cache cache; struct _cache { @@ -78,6 +84,10 @@ cache *cache_twoq(size_t hot_size, size_t warm_size, cache_eq_fn keq, cache_hash_fn khash, cache_freek_fn kfree, cache_freev_fn vfree); +cache *cache_disk(const char *dirpath, cache *mem, + kwrite_fn kwrite, vwrite_fn vwrite, + kread_fn kread, vread_fn vread); + /* API functions */ static inline int cache_add(cache *c, cache_key_t k, cache_value_t v) { return c->add(c, k, v); diff --git a/src/cache/disk.c b/src/cache/disk.c index 59e6f1ba42..2cf90d7c77 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -8,16 +8,10 @@ #include "cache.h" #include "private_config.h" -#include "util/strb.h" #include "util/skein.h" #define HEXP_LEN (128 + 2) -typedef int (*kwrite_fn)(strb *res, cache_key_t key); -typedef int (*vwrite_fn)(strb *res, cache_value_t val); -typedef cache_key_t (*kread_fn)(const strb *b); -typedef cache_value_t (*vread_fn)(const strb *b); - typedef struct _disk_cache { cache c; cache * mem; From 605d58c8d9eb740bba6c80653c09abbd842e2ba7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 17 Jan 2017 19:27:17 -0500 Subject: [PATCH 11/41] Add the functions and setup for disk cache (nothing uses it yet). 
--- src/gpuarray_buffer_cuda.c | 99 +++++++++++++++++++++++++++++++++++--- src/private_cuda.h | 1 + src/util/strb.h | 2 +- 3 files changed, 95 insertions(+), 7 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 1c883deaa7..cf34d0288d 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -49,18 +49,72 @@ static int cuda_records(gpudata *, int, CUstream); static int detect_arch(const char *prefix, char *ret, CUresult *err); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); -static int strb_eq(void *_k1, void *_k2) { - strb *k1 = (strb *)_k1; - strb *k2 = (strb *)_k2; +typedef struct _kernel_key { + char bin_id[64]; + strb *src; +} kernel_key; + +static void key_free(cache_key_t _k) { + kernel_key *k = (kernel_key *)_k; + strb_free(k->src); + free(k); +} + +static int strb_eq(strb *k1, strb *k2) { return (k1->l == k2->l && memcmp(k1->s, k2->s, k1->l) == 0); } -static uint32_t strb_hash(void *_k) { - strb *k = (strb *)_k; +static uint32_t strb_hash(strb *k) { return XXH32(k->s, k->l, 42); } +static int key_eq(kernel_key *k1, kernel_key *k2) { + return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 && + strb_eq(k1->src, k2->src)); +} + +static int key_hash(kernel_key *k) { + XXH32_state_t state; + XXH32_reset(&state, 42); + XXH32_update(&state, k->bin_id, 64); + XXH32_update(&state, k->src->s, k->src->l); + return XXH32_digest(&state); +} + +static int key_write(strb *res, kernel_key *k) { + strb_appendn(res, k->bin_id, 64); + strb_appendb(res, k->src); + return strb_error(res); +} + +static kernel_key *key_read(const strb *b) { + kernel_key *k; + if (b->l < 64) return NULL; + k = malloc(sizeof(*k)); + if (k == NULL) return NULL; + k->src = strb_alloc(b->l - 64); + if (k->src == NULL) { + free(k); + return NULL; + } + memcpy(k->bin_id, b->s, 64); + strb_appendn(k->src, b->s+64, b->l-64); + return k; +} + +static int kernel_write(strb *res, strb *bin) { + strb_appendb(res, bin); + return 
strb_error(res); +} + +static strb *kernel_read(const strb *b) { + strb *res = strb_alloc(b->l); + if (res != NULL) + strb_appendb(res, b); + return res; +} + static int setup_done = 0; static int major = -1; static int minor = -1; @@ -114,6 +168,8 @@ static int cuda_get_device_count(unsigned int platform, cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { cuda_context *res; + cache *mem_cache; + char *cache_path; void *p; int e; @@ -152,11 +208,38 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { goto fail_mem_stream; } } - res->kernel_cache = cache_twoq(64, 128, 64, 8, strb_eq, strb_hash, + + res->kernel_cache = cache_twoq(64, 128, 64, 8, + (cache_eq_fn)strb_eq, + (cache_hash_fn)strb_hash, (cache_freek_fn)strb_free, (cache_freev_fn)cuda_freekernel); if (res->kernel_cache == NULL) goto fail_cache; + + cache_path = getenv("GPUARRAY_CACHE_PATH"); + if (cache_path != NULL) { + mem_cache = cache_lru(64, 8, + (cache_eq_fn)key_eq, + (cache_hash_fn)key_hash, + (cache_freek_fn)key_free, + (cache_freev_fn)strb_free); + if (mem_cache == NULL) + goto fail_disk_cache; + res->disk_cache = cache_disk(cache_path, mem_cache, + (kwrite_fn)key_write, + (vwrite_fn)kernel_write, + (kread_fn)key_read, + (vread_fn)kernel_read); + if (res->disk_cache == NULL) { + cache_destroy(mem_cache); + goto fail_disk_cache; + } + } else { + fail_disk_cache: + res->disk_cache = NULL; + } + err = cuMemAllocHost(&p, 16); if (err != CUDA_SUCCESS) { goto fail_errbuf; @@ -174,6 +257,8 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { fail_end: cuMemFreeHost(p); fail_errbuf: + if (res->disk_cache) + cache_destroy(res->disk_cache); cache_destroy(res->kernel_cache); fail_cache: if (ISCLR(res->flags, GA_CTX_SINGLE_STREAM)) @@ -215,6 +300,8 @@ static void cuda_free_ctx(cuda_context *ctx) { deallocate(curr); } cache_destroy(ctx->kernel_cache); + if (ctx->disk_cache) + cache_destroy(ctx->disk_cache); if (!(ctx->flags & DONTFREE)) { cuCtxPushCurrent(ctx->ctx); diff --git 
a/src/private_cuda.h b/src/private_cuda.h index da6f60ad7a..ad9ff7f8ae 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -68,6 +68,7 @@ typedef struct _cuda_context { CUstream mem_s; gpudata *freeblocks; cache *kernel_cache; + cache *disk_cache; unsigned int enter; unsigned char major; unsigned char minor; diff --git a/src/util/strb.h b/src/util/strb.h index 267941417e..01ea7a2495 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -146,7 +146,7 @@ static inline void strb_appends(strb *sb, const char *s) { /* * Appends the content of another strb. */ -static inline void strb_appendb(strb *sb, strb *sb2) { +static inline void strb_appendb(strb *sb, const strb *sb2) { strb_appendn(sb, sb2->s, sb2->l); } From 6f1fd645a3c1197c3b5cfc3849ed0b5a695596b4 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 18 Jan 2017 19:49:00 -0500 Subject: [PATCH 12/41] Rework the compile logic and integrate the disk cache into the mix. This removes the gpukernel_binary support also, because it became burdensome and redundant. If the kernels are already cached at the library level, there is no need for applications to do the same. --- src/gpuarray/buffer.h | 17 +- src/gpuarray/error.h | 1 + src/gpuarray/kernel.h | 1 + src/gpuarray_buffer.c | 2 +- src/gpuarray_buffer_cuda.c | 375 +++++++++++++++++------------------ src/gpuarray_buffer_opencl.c | 29 --- src/gpuarray_error.c | 1 + src/loaders/libcuda.fn | 4 + src/loaders/libcuda.h | 11 + src/private.h | 1 - src/private_cuda.h | 2 - 11 files changed, 205 insertions(+), 239 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 800756a072..34878d503a 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -494,22 +494,9 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, size_t shared, void **args); /** - * (Deprecated) Get the kernel binary. + * Get the kernel binary (REMOVED). * - * This function is deprecated and will be removed in the next release. 
- * - * This can be use to cache kernel binaries after compilation of a - * specific device. The kernel can be recreated by calling - * kernel_alloc with the binary and size and passing `GA_USE_BINARY` - * as the use flags. - * - * The returned pointer is allocated and must be freed by the caller. - * - * \param k kernel - * \param sz size of the returned binary - * \param obj pointer to the binary for the kernel. - * - * \returns GA_NO_ERROR or an error code if an error occurred. + * Always returns GA_DEPRECATED_ERROR. */ GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj); diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h index af963c1531..84c852a257 100644 --- a/src/gpuarray/error.h +++ b/src/gpuarray/error.h @@ -36,6 +36,7 @@ enum ga_error { GA_COMM_ERROR, GA_XLARGE_ERROR, GA_LOAD_ERROR, + GA_DEPRECATED_ERROR, /* Add more error types if needed, but at the end */ /* Don't forget to sync with Gpu_error() */ }; diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h index f88d74ffc6..da779123b9 100644 --- a/src/gpuarray/kernel.h +++ b/src/gpuarray/kernel.h @@ -107,6 +107,7 @@ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); +/* Deprecated and to be removed */ GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **obj); diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index a4dfd3329b..d3226ea94e 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -187,7 +187,7 @@ int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs, } int gpukernel_binary(gpukernel *k, size_t *sz, void **obj) { - return ((partial_gpukernel *)k)->ctx->ops->kernel_binary(k, sz, obj); + return GA_DEPRECATED_ERROR; } int gpukernel_property(gpukernel *k, int prop_id, void *res) { diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index cf34d0288d..3820dff023 100644 --- a/src/gpuarray_buffer_cuda.c +++ 
b/src/gpuarray_buffer_cuda.c @@ -51,12 +51,12 @@ static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); typedef struct _kernel_key { char bin_id[64]; - strb *src; + strb src; } kernel_key; static void key_free(cache_key_t _k) { kernel_key *k = (kernel_key *)_k; - strb_free(k->src); + strb_clear(&k->src); free(k); } @@ -71,35 +71,35 @@ static uint32_t strb_hash(strb *k) { static int key_eq(kernel_key *k1, kernel_key *k2) { return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 && - strb_eq(k1->src, k2->src)); + strb_eq(&k1->src, &k2->src)); } static int key_hash(kernel_key *k) { XXH32_state_t state; XXH32_reset(&state, 42); XXH32_update(&state, k->bin_id, 64); - XXH32_update(&state, k->src->s, k->src->l); + XXH32_update(&state, k->src.s, k->src.l); return XXH32_digest(&state); } static int key_write(strb *res, kernel_key *k) { strb_appendn(res, k->bin_id, 64); - strb_appendb(res, k->src); + strb_appendb(res, &k->src); return strb_error(res); } static kernel_key *key_read(const strb *b) { kernel_key *k; if (b->l < 64) return NULL; - k = malloc(sizeof(*k)); + k = calloc(1, sizeof(*k)); if (k == NULL) return NULL; - k->src = strb_alloc(b->l - 64); - if (k->src == NULL) { + if (strb_ensure(&k->src, b->l - 64) != 0) { + strb_clear(&k->src); free(k); return NULL; } memcpy(k->bin_id, b->s, 64); - strb_appendn(k->src, b->s+64, b->l-64); + strb_appendn(&k->src, b->s+64, b->l-64); return k; } @@ -1000,22 +1000,22 @@ static int detect_arch(const char *prefix, char *ret, CUresult *err) { return GA_NO_ERROR; } -static void *call_compiler(const char *src, size_t len, const char *arch_arg, - size_t *bin_len, char **log, size_t *log_len, - int *ret) { +static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { nvrtcProgram prog; - void *buf = NULL; size_t buflen; const char *opts[4] = { "-arch", "" , "-G", "-lineinfo" }; - nvrtcResult err, err2; + nvrtcResult err; - opts[1] = arch_arg; + opts[1] = ctx->bin_id; - err = nvrtcCreateProgram(&prog, 
src, NULL, 0, NULL, NULL); - if (err != NVRTC_SUCCESS) FAIL(NULL, GA_SYS_ERROR); + strb_append0(src); + if (strb_error(src)) + return GA_MEMORY_ERROR; + err = nvrtcCreateProgram(&prog, src->s, NULL, 0, NULL, NULL); + if (err != NVRTC_SUCCESS) return GA_SYS_ERROR; err = nvrtcCompileProgram(prog, #ifdef DEBUG @@ -1024,41 +1024,115 @@ static void *call_compiler(const char *src, size_t len, const char *arch_arg, 2, #endif opts); - if (log != NULL) { - err2 = nvrtcGetProgramLogSize(prog, &buflen); - if (err2 != NVRTC_SUCCESS) goto end2; - buf = malloc(buflen); - if (buf == NULL) goto end2; - err2 = nvrtcGetProgramLog(prog, (char *)buf); - if (err2 != NVRTC_SUCCESS) goto end2; - if (log_len != NULL) *log_len = buflen; - *log = (char *)buf; - buf = NULL; + if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) { + strb_appends(log, "NVRTC compile log::\n"); + if (strb_ensure(log, buflen) == 0) + if (nvrtcGetProgramLog(prog, log->s+log->l) == NVRTC_SUCCESS) + log->l += buflen - 1; + strb_appendc(log, '\n'); } -end2: - if (err != NVRTC_SUCCESS) goto end; err = nvrtcGetPTXSize(prog, &buflen); if (err != NVRTC_SUCCESS) goto end; - buf = malloc(buflen); - if (buf == NULL) { - nvrtcDestroyProgram(&prog); - FAIL(NULL, GA_MEMORY_ERROR); + if (strb_ensure(ptx, buflen) == 0) + err = nvrtcGetPTX(prog, ptx->s+ptx->l); + +end: + nvrtcDestroyProgram(&prog); + if (err != NVRTC_SUCCESS) + return GA_SYS_ERROR; + return GA_NO_ERROR; +} + +static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { + char info_log[2048]; + char error_log[2048]; + void *out; + size_t out_size; + CUlinkState st; + CUjit_option cujit_opts[] = { + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_LOG_VERBOSE, + CU_JIT_GENERATE_DEBUG_INFO, + CU_JIT_GENERATE_LINE_INFO, + }; + void *cujit_opt_vals[] = { + (void *)sizeof(info_log), info_log, + (void *)sizeof(error_log), error_log, +#ifdef DEBUG + (void *)1, 
(void *)1, (void *)1 +#else + (void *)0, (void *)0, (void *)0 +#endif + }; + + ctx->err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]), + cujit_opts, cujit_opt_vals, &st); + if (ctx->err != CUDA_SUCCESS) + return GA_IMPL_ERROR; + ctx->err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, + "kernel code", 0, NULL, NULL); + if (ctx->err != CUDA_SUCCESS) { + cuLinkDestroy(st); + return GA_IMPL_ERROR; + } + ctx->err = cuLinkComplete(st, &out, &out_size); + if (ctx->err != CUDA_SUCCESS) { + cuLinkDestroy(st); + return GA_IMPL_ERROR; } + strb_appendn(bin, out, out_size); + cuLinkDestroy(st); + strb_appends(log, "Link info log::\n"); + strb_appends(log, info_log); + strb_appends(log, "\nLink error log::\n"); + strb_appends(log, error_log); + strb_appendc(log, '\n'); + return GA_NO_ERROR; +} - err = nvrtcGetPTX(prog, (char *)buf); - if (err != NVRTC_SUCCESS) goto end; +static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { + strb ptx = STRB_STATIC_INIT; + strb *cbin; + kernel_key k; + kernel_key *pk; + int err; - *bin_len = buflen; + memcpy(k.bin_id, ctx->bin_id, 64); + memcpy(&k.src, src, sizeof(strb)); -end: - nvrtcDestroyProgram(&prog); - if (err != NVRTC_SUCCESS) { - free(buf); - FAIL(NULL, GA_SYS_ERROR); + // Look up the binary in the disk cache + cbin = cache_get(ctx->disk_cache, &k); + if (cbin != NULL) { + strb_appendb(bin, cbin); + return GA_NO_ERROR; + } + + err = call_compiler(ctx, src, &ptx, log); + if (err != GA_NO_ERROR) return err; + err = make_bin(ctx, &ptx, bin, log); + if (err != GA_NO_ERROR) return err; + pk = memdup(&k, sizeof(k)); + if (pk == NULL) + return err; + cbin = strb_alloc(bin->l); + if (cbin == NULL) { + free(pk); + return err; } - return buf; + strb_appendb(cbin, bin); + if (strb_error(cbin)) { + free(pk); + strb_free(cbin); + return err; + } + cache_add(ctx->disk_cache, pk, cbin); + + return err; } static void _cuda_freekernel(gpukernel *k) { @@ -1072,7 +1146,6 @@ static void _cuda_freekernel(gpukernel *k) { } 
CLEAR(k); free(k->args); - free(k->bin); free(k->types); free(k); } @@ -1084,45 +1157,21 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, const int *types, int flags, int *ret, char **err_str) { cuda_context *ctx = (cuda_context *)c; - strb sb = STRB_STATIC_INIT; - strb *psb; - char *bin, *log = NULL; + strb src = STRB_STATIC_INIT; + strb bin = STRB_STATIC_INIT; + strb log = STRB_STATIC_INIT; + strb *psrc; gpukernel *res; - size_t bin_len = 0, log_len = 0; CUdevice dev; unsigned int i; int major, minor; - strb debug_msg = STRB_STATIC_INIT; - - // options for cuModuleLoadDataEx - const size_t cujit_log_size = 4096; - char *cujit_info_log = NULL; - unsigned int num_cujit_opts = 4; - CUjit_option cujit_opts[] = { - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, - CU_JIT_INFO_LOG_BUFFER, - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, - CU_JIT_ERROR_LOG_BUFFER - }; - void *cujit_opt_vals[] = { - (void*)(size_t)cujit_log_size, NULL, - (void*)(size_t)cujit_log_size, NULL, - }; + int err; if (count == 0) FAIL(NULL, GA_VALUE_ERROR); if (flags & GA_USE_OPENCL) FAIL(NULL, GA_DEVSUP_ERROR); - if (flags & GA_USE_BINARY) { - // GA_USE_BINARY is exclusive - if (flags & ~GA_USE_BINARY) - FAIL(NULL, GA_INVALID_ERROR); - // We need the length for binary data and there is only one blob. 
- if (count != 1 || lengths == NULL || lengths[0] == 0) - FAIL(NULL, GA_VALUE_ERROR); - } - cuda_enter(ctx); ctx->err = cuCtxGetDevice(&dev); @@ -1138,6 +1187,7 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, // GA_USE_CLUDA is done later // GA_USE_SMALL will always work + // GA_USE_HALF should always work if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); @@ -1149,90 +1199,73 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } - // GA_USE_HALF should always work - if (flags & GA_USE_BINARY) { - bin = memdup(strings[0], lengths[0]); - bin_len = lengths[0]; - if (bin == NULL) { - cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); - } + if (flags & GA_USE_CLUDA) { + strb_appends(&src, CUDA_PREAMBLE); + } + + if (lengths == NULL) { + for (i = 0; i < count; i++) + strb_appends(&src, strings[i]); } else { - if (flags & GA_USE_CLUDA) { - strb_appends(&sb, CUDA_PREAMBLE); + for (i = 0; i < count; i++) { + if (lengths[i] == 0) + strb_appends(&src, strings[i]); + else + strb_appendn(&src, strings[i], lengths[i]); } + } - if (lengths == NULL) { - for (i = 0; i < count; i++) - strb_appends(&sb, strings[i]); - } else { - for (i = 0; i < count; i++) { - if (lengths[i] == 0) - strb_appends(&sb, strings[i]); - else - strb_appendn(&sb, strings[i], lengths[i]); - } - } + strb_append0(&src); - strb_append0(&sb); + if (strb_error(&src)) { + strb_clear(&src); + cuda_exit(ctx); + FAIL(NULL, GA_MEMORY_ERROR); + } - if (strb_error(&sb)) { - strb_clear(&sb); - cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); - } + res = (gpukernel *)cache_get(ctx->kernel_cache, &src); + if (res != NULL) { + res->refcnt++; + strb_clear(&src); + return res; + } - res = (gpukernel *)cache_get(ctx->kernel_cache, &sb); - if (res != NULL) { - res->refcnt++; - strb_clear(&sb); - return res; - } - bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len, - &log, &log_len, ret); - if (bin == 
NULL) { - if (err_str != NULL) { - - // We're substituting debug_msg for a string with this first line: - strb_appends(&debug_msg, "CUDA kernel compile failure ::\n"); - - /* Delete the final NUL */ - sb.l--; - gpukernel_source_with_line_numbers(1, (const char **)&sb.s, - &sb.l, &debug_msg); - - if (log != NULL) { - strb_appends(&debug_msg, "\nCompiler log:\n"); - strb_appendn(&debug_msg, log, log_len); - free(log); - } - *err_str = strb_cstr(&debug_msg); - // *err_str will be free()d by the caller (see docs in kernel.h) - } - strb_clear(&sb); - cuda_exit(ctx); - FAIL(NULL, GA_IMPL_ERROR); + err = compile(ctx, &src, &bin, &log); + if (err != GA_NO_ERROR || strb_error(&bin)) { + if (err_str != NULL) { + strb debug_msg = STRB_STATIC_INIT; + strb_appends(&debug_msg, "CUDA kernel compile failure ::\n"); + src.l--; + gpukernel_source_with_line_numbers(1, (const char **)&src.s, + &src.l, &debug_msg); + strb_appends(&debug_msg, "\nCompile log:\n"); + strb_appendb(&debug_msg, &log); + *err_str = strb_cstr(&debug_msg); } + strb_clear(&src); + strb_clear(&bin); + strb_clear(&log); + cuda_exit(ctx); + FAIL(NULL, err); } + strb_clear(&log); res = calloc(1, sizeof(*res)); if (res == NULL) { - free(bin); - strb_clear(&sb); + strb_clear(&src); + strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_SYS_ERROR); } - res->bin_sz = bin_len; - res->bin = bin; - res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); - strb_clear(&sb); + strb_clear(&src); + strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } @@ -1240,55 +1273,26 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, res->args = calloc(argcount, sizeof(void *)); if (res->args == NULL) { _cuda_freekernel(res); - strb_clear(&sb); + strb_clear(&src); + strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } - // for both info/err log - cujit_info_log = (char*)malloc(2*cujit_log_size*sizeof(char)); - 
if(cujit_info_log == NULL) { - _cuda_freekernel(res); - strb_clear(&sb); - cuda_exit(ctx); - FAIL(NULL, GA_MEMORY_ERROR); - } - cujit_info_log[0] = 0; - cujit_info_log[cujit_log_size] = 0; - cujit_opt_vals[1] = (void*)cujit_info_log; - cujit_opt_vals[3] = (void*)(cujit_info_log+cujit_log_size); - - ctx->err = cuModuleLoadDataEx( - &res->m, bin, - num_cujit_opts, cujit_opts, (void**)cujit_opt_vals); - + ctx->err = cuModuleLoadData(&res->m, bin.s); if (ctx->err != CUDA_SUCCESS) { - if (err_str != NULL) { - strb_appends(&debug_msg, "CUDA kernel link failure::\n"); - if (cujit_info_log[0]) { - strb_appends(&debug_msg, "\nLinker msg:\n"); - strb_appends(&debug_msg, cujit_info_log); - } - if (cujit_info_log[cujit_log_size]) { - strb_appends(&debug_msg, "\nLinker error log:\n"); - strb_appends(&debug_msg, cujit_info_log+cujit_log_size); - } - strb_append0(&debug_msg); - *err_str = strb_cstr(&debug_msg); - } - free(cujit_info_log); _cuda_freekernel(res); - strb_clear(&sb); + strb_clear(&src); + strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } - - free(cujit_info_log); + strb_clear(&bin); ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); - strb_clear(&sb); + strb_clear(&src); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } @@ -1297,16 +1301,16 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, ctx->refcnt++; cuda_exit(ctx); TAG_KER(res); - psb = memdup(&sb, sizeof(strb)); - if (psb == NULL) { - cuda_freekernel(res); - strb_clear(&sb); - FAIL(NULL, GA_MEMORY_ERROR); + psrc = memdup(&src, sizeof(strb)); + if (psrc != NULL) { + /* One of the refs is for the cache */ + res->refcnt++; + /* If this fails, it will free the key and remove a ref from the + kernel. */ + cache_add(ctx->kernel_cache, psrc, res); + } else { + strb_clear(&src); } - /* One of the refs is for the cache */ - res->refcnt++; - /* If this fails, it will free the key and remove a ref from the kernel. 
*/ - cache_add(ctx->kernel_cache, psb, res); return res; } @@ -1381,16 +1385,6 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } -static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) { - void *res = malloc(k->bin_sz); - if (res == NULL) - return GA_MEMORY_ERROR; - memcpy(res, k->bin, k->bin_sz); - *sz = k->bin_sz; - *obj = res; - return GA_NO_ERROR; -} - static int cuda_sync(gpudata *b) { cuda_context *ctx = (cuda_context *)b->ctx; int err = GA_NO_ERROR; @@ -1793,7 +1787,6 @@ const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count, cuda_freekernel, cuda_kernelsetarg, cuda_callkernel, - cuda_kernelbin, cuda_sync, cuda_transfer, cuda_property, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 84bcb6584d..89b56f80a7 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1076,34 +1076,6 @@ static int cl_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } -static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) { - cl_ctx *ctx = k->ctx; - cl_program p; - size_t rsz; - void *res; - - ASSERT_KER(k); - ASSERT_CTX(ctx); - - ctx->err = clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL); - if (ctx->err != CL_SUCCESS) - return GA_IMPL_ERROR; - res = malloc(rsz); - if (res == NULL) - return GA_MEMORY_ERROR; - ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); - if (ctx->err != CL_SUCCESS) { - free(res); - return GA_IMPL_ERROR; - } - *sz = rsz; - *obj = res; - return GA_NO_ERROR; -} - static int cl_sync(gpudata *b) { cl_ctx *ctx = (cl_ctx *)b->ctx; @@ -1465,7 +1437,6 @@ const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count, cl_releasekernel, cl_setkernelarg, cl_callkernel, - cl_kernelbin, cl_sync, cl_transfer, cl_property, diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c 
index b7d5011f5b..ddebd3e9dc 100644 --- a/src/gpuarray_error.c +++ b/src/gpuarray_error.c @@ -25,6 +25,7 @@ const char *gpuarray_error_str(int err) { case GA_COMM_ERROR: return "Error in collectives call"; case GA_XLARGE_ERROR: return "Input size too large for operation"; case GA_LOAD_ERROR: return "Error loading library"; + case GA_DEPRECATED_ERROR: return "Deprecated (removed) functionality"; default: return "Unknown GA error"; } } diff --git a/src/loaders/libcuda.fn b/src/loaders/libcuda.fn index 487706f4f3..5bfc890fba 100644 --- a/src/loaders/libcuda.fn +++ b/src/loaders/libcuda.fn @@ -17,6 +17,10 @@ DEF_PROC(cuCtxGetDevice, (CUdevice *device)); DEF_PROC_V2(cuCtxPushCurrent, (CUcontext ctx)); DEF_PROC_V2(cuCtxPopCurrent, (CUcontext *pctx)); +DEF_PROC(cuLinkCreate, (unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut)); +DEF_PROC(cuLinkAddData, (CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues)); +DEF_PROC(cuLinkComplete, (CUlinkState state, void **cubinOut, size_t *sizeOut)); +DEF_PROC(cuLinkDestroy, (CUlinkState state)); DEF_PROC(cuModuleLoadData, (CUmodule *module, const void *image)); DEF_PROC(cuModuleLoadDataEx, (CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues)); DEF_PROC(cuModuleUnload, (CUmodule hmod)); diff --git a/src/loaders/libcuda.h b/src/loaders/libcuda.h index e62f8b85df..3a6bf35a8a 100644 --- a/src/loaders/libcuda.h +++ b/src/loaders/libcuda.h @@ -23,6 +23,7 @@ typedef struct CUmod_st *CUmodule; typedef struct CUfunc_st *CUfunction; typedef struct CUevent_st *CUevent; typedef struct CUstream_st *CUstream; +typedef struct CUlinkState_st *CUlinkState; typedef enum CUdevice_attribute_enum CUdevice_attribute; typedef enum CUfunction_attribute_enum CUfunction_attribute; @@ -30,6 +31,7 @@ typedef enum CUevent_flags_enum CUevent_flags; typedef enum 
CUctx_flags_enum CUctx_flags; typedef enum CUipcMem_flags_enum CUipcMem_flags; typedef enum CUjit_option_enum CUjit_option; +typedef enum CUjitInputType_enum CUjitInputType; #define CU_IPC_HANDLE_SIZE 64 @@ -206,4 +208,13 @@ enum CUjit_option_enum { CU_JIT_NUM_OPTIONS }; +enum CUjitInputType_enum { + CU_JIT_INPUT_CUBIN = 0, + CU_JIT_INPUT_PTX, + CU_JIT_INPUT_FATBINARY, + CU_JIT_INPUT_OBJECT, + CU_JIT_INPUT_LIBRARY, + CU_JIT_NUM_INPUT_TYPES +}; + #endif diff --git a/src/private.h b/src/private.h index 7405cf8995..abe9783de7 100644 --- a/src/private.h +++ b/src/private.h @@ -100,7 +100,6 @@ struct _gpuarray_buffer_ops { const size_t *gs, const size_t *ls, size_t shared, void **args); - int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj); int (*buffer_sync)(gpudata *b); int (*buffer_transfer)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); diff --git a/src/private_cuda.h b/src/private_cuda.h index ad9ff7f8ae..a0b4557977 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -137,8 +137,6 @@ struct _gpukernel { CUmodule m; CUfunction k; void **args; - size_t bin_sz; - void *bin; int *types; unsigned int argcount; unsigned int refcnt; From 0cd407243475e4a13821c07ea216136fce270256 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 25 Jan 2017 15:54:24 -0500 Subject: [PATCH 13/41] Fix some problems with the disk cache. 
--- src/gpuarray_buffer_cuda.c | 61 +++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 3820dff023..7b1f540c26 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1011,9 +1011,6 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { opts[1] = ctx->bin_id; - strb_append0(src); - if (strb_error(src)) - return GA_MEMORY_ERROR; err = nvrtcCreateProgram(&prog, src->s, NULL, 0, NULL, NULL); if (err != NVRTC_SUCCESS) return GA_SYS_ERROR; @@ -1035,8 +1032,10 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { err = nvrtcGetPTXSize(prog, &buflen); if (err != NVRTC_SUCCESS) goto end; - if (strb_ensure(ptx, buflen) == 0) + if (strb_ensure(ptx, buflen) == 0) { err = nvrtcGetPTX(prog, ptx->s+ptx->l); + if (err == NVRTC_SUCCESS) ptx->l = buflen; + } end: nvrtcDestroyProgram(&prog); @@ -1069,6 +1068,7 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { (void *)0, (void *)0, (void *)0 #endif }; + int err = GA_NO_ERROR; ctx->err = cuLinkCreate(sizeof(cujit_opts)/sizeof(cujit_opts[0]), cujit_opts, cujit_opt_vals, &st); @@ -1077,22 +1077,23 @@ static int make_bin(cuda_context *ctx, const strb *ptx, strb *bin, strb *log) { ctx->err = cuLinkAddData(st, CU_JIT_INPUT_PTX, ptx->s, ptx->l, "kernel code", 0, NULL, NULL); if (ctx->err != CUDA_SUCCESS) { - cuLinkDestroy(st); - return GA_IMPL_ERROR; + err = GA_IMPL_ERROR; + goto out; } ctx->err = cuLinkComplete(st, &out, &out_size); if (ctx->err != CUDA_SUCCESS) { - cuLinkDestroy(st); - return GA_IMPL_ERROR; + err = GA_IMPL_ERROR; + goto out; } strb_appendn(bin, out, out_size); +out: cuLinkDestroy(st); strb_appends(log, "Link info log::\n"); strb_appends(log, info_log); strb_appends(log, "\nLink error log::\n"); strb_appends(log, error_log); strb_appendc(log, '\n'); - return GA_NO_ERROR; + return err; } static int 
compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { @@ -1106,33 +1107,37 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { memcpy(&k.src, src, sizeof(strb)); // Look up the binary in the disk cache - cbin = cache_get(ctx->disk_cache, &k); - if (cbin != NULL) { - strb_appendb(bin, cbin); - return GA_NO_ERROR; + if (ctx->disk_cache) { + cbin = cache_get(ctx->disk_cache, &k); + if (cbin != NULL) { + strb_appendb(bin, cbin); + return GA_NO_ERROR; + } } err = call_compiler(ctx, src, &ptx, log); if (err != GA_NO_ERROR) return err; err = make_bin(ctx, &ptx, bin, log); if (err != GA_NO_ERROR) return err; - pk = memdup(&k, sizeof(k)); - if (pk == NULL) - return err; - cbin = strb_alloc(bin->l); - if (cbin == NULL) { - free(pk); - return err; - } - strb_appendb(cbin, bin); - if (strb_error(cbin)) { - free(pk); - strb_free(cbin); - return err; + if (ctx->disk_cache) { + pk = memdup(&k, sizeof(k)); + if (pk == NULL) + return GA_NO_ERROR; + cbin = strb_alloc(bin->l); + if (cbin == NULL) { + free(pk); + return GA_NO_ERROR; + } + strb_appendb(cbin, bin); + if (strb_error(cbin)) { + free(pk); + strb_free(cbin); + return GA_NO_ERROR; + } + cache_add(ctx->disk_cache, pk, cbin); } - cache_add(ctx->disk_cache, pk, cbin); - return err; + return GA_NO_ERROR; } static void _cuda_freekernel(gpukernel *k) { From f1e3c2701727221488be47bbce06e0569e8c1e1d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 25 Jan 2017 16:49:25 -0500 Subject: [PATCH 14/41] Fix some directory creation problems in the disk_cache code. 
--- src/cache/disk.c | 51 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 8 deletions(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 2cf90d7c77..0a7a5ab18d 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -81,6 +81,27 @@ static int mkstempat(int dfd, char *template) { return -1; } +/* Ensure that a path exists by creating all intermediate directories */ +static int ensureat(int dfd, char *path) { + char *curp; + char *pos; + + curp = path; + + while ((pos = strchr(curp, '/')) != NULL) { + *pos = '\0'; + if (mkdirat(dfd, path, 0777)) { + if (errno != EEXIST) return -1; + /* For now we suppose that EEXIST means that the directory is + * already there.*/ + } + curp = pos + 1; + *pos = '/'; + } + + return 0; +} + static int key_path(disk_cache *c, const cache_key_t key, char *out) { strb kb = STRB_STATIC_INIT; unsigned char hash[64]; @@ -88,8 +109,8 @@ static int key_path(disk_cache *c, const cache_key_t key, char *out) { if (c->kwrite(&kb, key)) return -1; if (Skein_512((unsigned char *)kb.s, kb.l, hash)) return -1; - if (snprintf(out, 6, "%02x%02x/%02x%02x", - hash[0], hash[1], hash[2], hash[3]) != 5) + if (snprintf(out, 10, "%02x%02x/%02x%02x", + hash[0], hash[1], hash[2], hash[3]) != 9) return -1; for (i = 4; i < 64; i += 4) { if (snprintf(out+(i * 2 + 1), 9, "%02x%02x%02x%02x", @@ -109,7 +130,9 @@ static int write_entry(disk_cache *c, const cache_key_t k, if (key_path(c, k, hexp)) return -1; - if (!strb_ensure(&b, 16)) return -1; + if (ensureat(c->dirfd, hexp)) return -1; + + if (strb_ensure(&b, 16)) return -1; b.l = 16; c->kwrite(&b, k); kl = b.l - 16; @@ -135,7 +158,7 @@ static int write_entry(disk_cache *c, const cache_key_t k, unlinkat(c->dirfd, tmp_path, 0); return -1; } - + if (renameat(c->dirfd, tmp_path, c->dirfd, hexp)) { unlinkat(c->dirfd, tmp_path, 0); return -1; @@ -227,7 +250,7 @@ static int disk_add(cache *_c, cache_key_t k, cache_value_t v) { static int disk_del(cache *_c, const cache_key_t key) { 
disk_cache *c = (disk_cache *)_c; char hexp[HEXP_LEN] = {0}; - + cache_del(c->mem, key); key_path(c, key, hexp); @@ -262,17 +285,29 @@ cache *cache_disk(const char *dirpath, cache *mem, kread_fn kread, vread_fn vread) { struct stat st; disk_cache *res; + char *dirp = strdup(dirpath); + + if (dirp == NULL) return NULL; + + if (ensureat(AT_FDCWD, dirp) != 0) { + free(dirp); + return NULL; + } + free(dirp); + + mkdir(dirpath, 0777); /* This may fail, but it's ok */ - mkdir(dirpath, 0777); /* This may fail, but we don't care */ if (lstat(dirpath, &st) != 0) return NULL; + if (!(st.st_mode & S_IFDIR)) return NULL; res = calloc(sizeof(*res), 1); - if (res == NULL) return NULL; + if (res == NULL) + return NULL; - res->dirfd = open(dirpath, O_RDWR|O_CLOEXEC); + res->dirfd = open(dirpath, O_RDONLY|O_CLOEXEC); if (res->dirfd == -1) { free(res); return NULL; From 24dd80d621be81a6dbe2be976b38bf4f0b6f20eb Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 30 Jan 2017 13:38:24 -0500 Subject: [PATCH 15/41] Fix the cache cleanup script. 
--- bin/gpuarray-cache | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache index 528e3eb9a5..04b7e8e68a 100644 --- a/bin/gpuarray-cache +++ b/bin/gpuarray-cache @@ -19,7 +19,7 @@ def clean(max_size): os.remove(path) -SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 < 20, 'G': 1 << 30, 'T': 1 << 40, +SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40, 'P': 1 << 50, 'E': 1 << 60, 'Z': 1 << 70, 'Y': 1 << 80} @@ -44,7 +44,7 @@ if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility') - parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning') + parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning (in bytes with suffixes: K, M, G, ...)') args = parser.parse_args() clean(get_size(args.max_size)) From 5f020fc1f8aa16bd9e1b084c8a397e7013847296 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Fri, 10 Feb 2017 12:46:24 -0500 Subject: [PATCH 16/41] Fix a type punning issue in the Skein code. 
--- src/util/skein.c | 16 ++++++++-------- src/util/skein.h | 5 ++++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/util/skein.c b/src/util/skein.c index 51362e5efb..38912e8320 100644 --- a/src/util/skein.c +++ b/src/util/skein.c @@ -247,13 +247,13 @@ int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; /* # bytes free in buffer b[] */ if (n) { Skein_assert(n < msgByteCnt); /* check on our logic here */ - memcpy(&ctx->b[ctx->h.bCnt],msg,n); + memcpy(&ctx->bb.b[ctx->h.bCnt],msg,n); msgByteCnt -= n; msg += n; ctx->h.bCnt += n; } Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); - Skein_512_Process_Block(ctx,ctx->b,1,SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx,ctx->bb.b,1,SKEIN_512_BLOCK_BYTES); ctx->h.bCnt = 0; } /* now process any remaining full blocks, directly from input message data */ @@ -269,7 +269,7 @@ int Skein_512_Update(Skein_512_Ctxt_t *ctx, const u08b_t *msg, /* copy any remaining source message data bytes into b[] */ if (msgByteCnt) { Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); - memcpy(&ctx->b[ctx->h.bCnt],msg,msgByteCnt); + memcpy(&ctx->bb.b[ctx->h.bCnt],msg,msgByteCnt); ctx->h.bCnt += msgByteCnt; } @@ -285,20 +285,20 @@ int Skein_512_Final(Skein_512_Ctxt_t *ctx, u08b_t *hashVal) { ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) /* zero pad b[] if necessary */ - memset(&ctx->b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + memset(&ctx->bb.b[ctx->h.bCnt],0,SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); - Skein_512_Process_Block(ctx,ctx->b,1,ctx->h.bCnt); /* process the final block */ + Skein_512_Process_Block(ctx,ctx->bb.b,1,ctx->h.bCnt); /* process the final block */ /* now output the result */ byteCnt = (ctx->h.hashBitLen + 7) >> 3; /* total number of output bytes */ /* run Threefish in "counter mode" to generate output */ - memset(ctx->b,0,sizeof(ctx->b)); /* zero out b[], so it can hold the 
counter */ + memset(ctx->bb.b,0,sizeof(ctx->bb.b)); /* zero out b[], so it can hold the counter */ memcpy(X,ctx->X,sizeof(X)); /* keep a local copy of counter mode "key" */ for (i=0;i*SKEIN_512_BLOCK_BYTES < byteCnt;i++) { - ((u64b_t *)ctx->b)[0]= Skein_Swap64((u64b_t) i); /* build the counter block */ + ctx->bb.l[0] = Skein_Swap64((u64b_t) i); /* build the counter block */ Skein_Start_New_Type(ctx,OUT_FINAL); - Skein_512_Process_Block(ctx,ctx->b,1,sizeof(u64b_t)); /* run "counter mode" */ + Skein_512_Process_Block(ctx,ctx->bb.b,1,sizeof(u64b_t)); /* run "counter mode" */ n = byteCnt - i*SKEIN_512_BLOCK_BYTES; /* number of output bytes left to go */ if (n >= SKEIN_512_BLOCK_BYTES) n = SKEIN_512_BLOCK_BYTES; diff --git a/src/util/skein.h b/src/util/skein.h index 89d7ebf209..b505a51801 100644 --- a/src/util/skein.h +++ b/src/util/skein.h @@ -56,7 +56,10 @@ typedef struct { typedef struct { /* 512-bit Skein hash context structure */ Skein_Ctxt_Hdr_t h; /* common header context variables */ u64b_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ - u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + union Skein_512_Ctxt_b_u { + u08b_t b[SKEIN_512_BLOCK_BYTES]; /* partial block buffer (8-byte aligned) */ + u64b_t l[SKEIN_512_BLOCK_BYTES/8]; + } bb; } Skein_512_Ctxt_t; /* Skein APIs for (incremental) "straight hashing" */ From 4d014a8b255cc85324211bb8354fc44436db1aa0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 1 Mar 2017 15:24:06 -0500 Subject: [PATCH 17/41] Switch away from ...at() functions since those don't exist on windows. Also try to make the code work for windows. 
--- CMakeLists.txt | 2 +- make.bat | 6 ++ src/cache/disk.c | 223 ++++++++++++++++++++++++++++++++--------------- 3 files changed, 160 insertions(+), 71 deletions(-) create mode 100755 make.bat diff --git a/CMakeLists.txt b/CMakeLists.txt index 382c064e9c..09f7f1fd4c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") # -Wall is unbelieveably noisy with Visual Studio: # http://stackoverflow.com/q/4001736/3257826 if(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4 -D_CRT_SECURE_NO_WARNINGS") else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") endif() diff --git a/make.bat b/make.bat new file mode 100755 index 0000000000..1ea7aa55cd --- /dev/null +++ b/make.bat @@ -0,0 +1,6 @@ +del bld +mkdir bld +cd bld +cmake .. -G "NMake Makefiles" +cmake --build . --config Release +cd .. diff --git a/src/cache/disk.c b/src/cache/disk.c index 0a7a5ab18d..2f4fc77efe 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -1,13 +1,57 @@ -#include #include #include -#include #include + +#include "private_config.h" + +#ifdef _WIN32 +#define PATH_MAX 255 + +#define WIN32_LEAN_AND_MEAN +#include + +#include +#include + +struct timezone; + +struct timeval { + long tv_sec; + long tv_usec; +} timeval; + +static int gettimeofday(struct timeval *tp, struct timezone *tzp) { + /* + * Note: some broken versions only have 8 trailing zero's, the + * correct epoch has 9 trailing zero's This magic number is the + * number of 100 nanosecond intervals since January 1, 1601 (UTC) + * until 00:00:00 January 1, 1970 + */ + static const uint64_t EPOCH = ((uint64_t)116444736000000000ULL); + + SYSTEMTIME system_time; + FILETIME file_time; + uint64_t time; + + GetSystemTime(&system_time); + SystemTimeToFileTime(&system_time, &file_time); + time = ((uint64_t)file_time.dwLowDateTime); + time += ((uint64_t)file_time.dwHighDateTime) << 32; + + tp->tv_sec = (long)((time - EPOCH) / 
10000000L); + tp->tv_usec = (long)(system_time.wMilliseconds * 1000); + return 0; +} + +#else +#define PATH_MAX 1024 +#include #include +#endif + #include #include "cache.h" -#include "private_config.h" #include "util/skein.h" #define HEXP_LEN (128 + 2) @@ -19,7 +63,7 @@ typedef struct _disk_cache { vwrite_fn vwrite; kread_fn kread; vread_fn vread; - int dirfd; + const char *dirp; } disk_cache; @@ -31,72 +75,112 @@ static unsigned long long ntohull(const char *in) { } static void htonull(unsigned long long in, char *out) { - out[0] = in >> 56; - out[1] = in >> 48; - out[2] = in >> 40; - out[3] = in >> 32; - out[4] = in >> 24; - out[5] = in >> 16; - out[6] = in >> 8; - out[7] = in; + out[0] = (char)(in >> 56); + out[1] = (char)(in >> 48); + out[2] = (char)(in >> 40); + out[3] = (char)(in >> 32); + out[4] = (char)(in >> 24); + out[5] = (char)(in >> 16); + out[6] = (char)(in >> 8); + out[7] = (char)(in); } -static int mkstempat(int dfd, char *template) { - static const char letters[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; - size_t length; - char *XXXXXX; - struct timeval tv; - unsigned long long randnum, working; - int i, tries, fd; - - length = strlen(template); - if (length < 6) { - errno = EINVAL; +static int catp(char *path, const char *dirp, const char *rpath) { + if (strlcpy(path, dirp, PATH_MAX) >= PATH_MAX) { + errno = ENAMETOOLONG; return -1; } - XXXXXX = template + length - 6; - if (strcmp(XXXXXX, "XXXXXX") != 0) { - errno = EINVAL; + if (strlcat(path, rpath, PATH_MAX) >= PATH_MAX) { + errno = ENAMETOOLONG; return -1; } + return 0; +} - /* This is kind of crappy, but the point is to not step on each - other's feet */ - gettimeofday(&tv, NULL); - randnum = ((unsigned long long) tv.tv_usec << 16) ^ tv.tv_sec ^ getpid(); +static int openp(const char *dirp, const char *rpath, int flags, int mode) { + char path[PATH_MAX]; - for (tries = 0; tries < TMP_MAX; tries++) { - for (working = randnum, i = 0; i < 6; i++) { - XXXXXX[i] = 
letters[working % 62]; - working /= 62; - } - fd = openat(dfd, template, O_RDWR | O_CREAT | O_EXCL, 0600); - if (fd >= 0 || (errno != EEXIST && errno != EISDIR)) - return fd; + if (catp(path, dirp, rpath)) + return -1; - randnum += (tv.tv_usec >> 10) & 0xfff; - } - errno = EEXIST; - return -1; + return open(path, flags, mode); } -/* Ensure that a path exists by creating all intermediate directories */ -static int ensureat(int dfd, char *path) { - char *curp; - char *pos; +static int mkstempp(const char *dirp, char *template) { + char path[PATH_MAX]; + int res; + + if (catp(path, dirp, template)) + return -1; + + res = mkstemp(path); - curp = path; + /* We need to copy the result path back */ + if (res == 0) + memcpy(template, &path[strlen(dirp)], strlen(template)); + + return res; +} + +static int unlinkp(const char *dirp, const char *rpath) { + char path[PATH_MAX]; + + if (catp(path, dirp, rpath)) + return -1; + + return unlink(path); +} + +static int renamep(const char *dirp, const char *ropath, const char *rnpath) { + char opath[PATH_MAX]; + char npath[PATH_MAX]; + + if (catp(opath, dirp, ropath)) + return -1; + if (catp(npath, dirp, rnpath)) + return -1; - while ((pos = strchr(curp, '/')) != NULL) { - *pos = '\0'; - if (mkdirat(dfd, path, 0777)) { + return rename(opath, npath); +} + +/* Ensure that a path exists by creating all intermediate directories */ +int ensurep(const char *dirp, const char *rpath) { + char path[PATH_MAX]; + char *pp; + char sep; + + if (dirp == NULL) { + if (strlcpy(path, rpath, PATH_MAX) >= PATH_MAX) { + errno = ENAMETOOLONG; + return -1; + } +#ifdef _WIN32 + /* Skip root dir (windows) */ + pp = strchr(path, '\\'); + if (pp) + while (*pp == '\\') pp++; + else + pp = path; +#else + pp = path; + /* Skip root dir (unix) */ + while (*pp == '/') pp++; +#endif + } else { + if (catp(path, dirp, rpath)) + return -1; + + pp = path + strlen(dirp); + } + while ((pp = strpbrk(pp + 1, "\\/")) != NULL) { + sep = *pp; + *pp = '\0'; + if (mkdir(path, 
0777)) { if (errno != EEXIST) return -1; /* For now we suppose that EEXIST means that the directory is - * already there.*/ + * already there. */ } - curp = pos + 1; - *pos = '/'; + *pp = sep; } return 0; @@ -130,7 +214,7 @@ static int write_entry(disk_cache *c, const cache_key_t k, if (key_path(c, k, hexp)) return -1; - if (ensureat(c->dirfd, hexp)) return -1; + if (ensurep(c->dirp, hexp)) return -1; if (strb_ensure(&b, 16)) return -1; b.l = 16; @@ -145,7 +229,7 @@ static int write_entry(disk_cache *c, const cache_key_t k, return -1; } - fd = mkstempat(c->dirfd, tmp_path); + fd = mkstempp(c->dirp, tmp_path); if (fd == -1) { strb_clear(&b); return -1; @@ -155,13 +239,18 @@ static int write_entry(disk_cache *c, const cache_key_t k, strb_clear(&b); close(fd); if (err) { - unlinkat(c->dirfd, tmp_path, 0); + unlinkp(c->dirp, tmp_path); return -1; } - if (renameat(c->dirfd, tmp_path, c->dirfd, hexp)) { - unlinkat(c->dirfd, tmp_path, 0); + if (renamep(c->dirp, tmp_path, hexp)) { + unlinkp(c->dirp, tmp_path); +#ifdef _WIN32 + /* On windows we can't rename over an existing file */ + return (errno != EACCES) ? 
-1 : 0; +#else return -1; +#endif } return 0; @@ -179,7 +268,7 @@ static int find_entry(disk_cache *c, const cache_key_t key, if (key_path(c, key, hexp)) return 0; - fd = openat(c->dirfd, hexp, O_RDONLY); + fd = openp(c->dirp, hexp, O_RDONLY, 0); if (fd == -1) return 0; @@ -255,7 +344,7 @@ static int disk_del(cache *_c, const cache_key_t key) { key_path(c, key, hexp); - return (unlinkat(c->dirfd, hexp, 0) == 0); + return (unlinkp(c->dirp, hexp) == 0); } static cache_value_t disk_get(cache *_c, const cache_key_t key) { @@ -277,7 +366,7 @@ static cache_value_t disk_get(cache *_c, const cache_key_t key) { static void disk_destroy(cache *_c) { disk_cache *c = (disk_cache *)_c; cache_destroy(c->mem); - close(c->dirfd); + free((void *)c->dirp); } cache *cache_disk(const char *dirpath, cache *mem, @@ -289,11 +378,10 @@ cache *cache_disk(const char *dirpath, cache *mem, if (dirp == NULL) return NULL; - if (ensureat(AT_FDCWD, dirp) != 0) { + if (ensurep(NULL, dirp) != 0) { free(dirp); return NULL; } - free(dirp); mkdir(dirpath, 0777); /* This may fail, but it's ok */ @@ -307,12 +395,7 @@ cache *cache_disk(const char *dirpath, cache *mem, if (res == NULL) return NULL; - res->dirfd = open(dirpath, O_RDONLY|O_CLOEXEC); - if (res->dirfd == -1) { - free(res); - return NULL; - } - + res->dirp = dirp; res->mem = mem; res->kwrite = kwrite; res->vwrite = vwrite; From a64059bd69e4b705d94b55ff3d8ed164319d2bde Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 1 Mar 2017 17:53:19 -0500 Subject: [PATCH 18/41] Fix bug in opencl gemmBatch bindings. 
--- src/gpuarray_blas_opencl_clblas.c | 4 ++-- src/gpuarray_blas_opencl_clblast.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gpuarray_blas_opencl_clblas.c b/src/gpuarray_blas_opencl_clblas.c index 8ee019afb7..f6e51429b1 100644 --- a/src/gpuarray_blas_opencl_clblas.c +++ b/src/gpuarray_blas_opencl_clblas.c @@ -100,7 +100,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, 1, &ctx->q, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev); if (err != clblasSuccess) return GA_BLAS_ERROR; @@ -132,7 +132,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, 1, &ctx->q, + beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, num_ev, num_ev == 0 ? 
NULL : evl, &ev); if (err != clblasSuccess) return GA_BLAS_ERROR; diff --git a/src/gpuarray_blas_opencl_clblast.c b/src/gpuarray_blas_opencl_clblast.c index 4a5369e56e..c6fd010a3b 100644 --- a/src/gpuarray_blas_opencl_clblast.c +++ b/src/gpuarray_blas_opencl_clblast.c @@ -68,7 +68,7 @@ static int hgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastHgemm(convO(order), convT(transA), convT(transB), M, N, K, float_to_half(alpha), A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - float_to_half(beta), C[i]->buf, offB[i], ldc, &ctx->q, &ev); + float_to_half(beta), C[i]->buf, offC[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); @@ -97,7 +97,7 @@ static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, &ctx->q, &ev); + beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); @@ -126,7 +126,7 @@ static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, ARRAY_INIT(C[i]); err = CLBlastDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, - beta, C[i]->buf, offB[i], ldc, &ctx->q, &ev); + beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev); if (err != kSuccess) return GA_BLAS_ERROR; ARRAY_FINI(A[i]); From e280a3ddf33caa61846de2c4b6ada6c9487ab548 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 1 Mar 2017 17:58:02 -0500 Subject: [PATCH 19/41] Make MSVC slightly less verbose in its warnings. 
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 09f7f1fd4c..5d0761bc73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") # -Wall is unbelieveably noisy with Visual Studio: # http://stackoverflow.com/q/4001736/3257826 if(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W4 -D_CRT_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3 -D_CRT_SECURE_NO_WARNINGS") else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") endif() From 4f25bc73ed46115769374cac859ba6cc6820a80d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 1 Mar 2017 18:03:35 -0500 Subject: [PATCH 20/41] Windows changes. --- CMakeLists.txt | 2 +- src/cache/disk.c | 15 ++++++++++++++- src/gpuarray_array.c | 6 +++--- src/gpuarray_buffer.c | 4 ++-- src/gpuarray_buffer_blas.c | 2 +- src/gpuarray_elemwise.c | 2 +- src/gpuarray_reduction.c | 2 +- src/private.h | 6 +++--- src/private_config.h.in | 4 +--- src/util/strb.c | 7 +++++++ 10 files changed, 34 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5d0761bc73..ddfefab53c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/CMakeModules/") # -Wall is unbelieveably noisy with Visual Studio: # http://stackoverflow.com/q/4001736/3257826 if(MSVC) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3 -D_CRT_SECURE_NO_WARNINGS") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -W3") else() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall") endif() diff --git a/src/cache/disk.c b/src/cache/disk.c index 2f4fc77efe..0b827aaefa 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -1,3 +1,4 @@ +#define _CRT_SECURE_NO_WARNINGS #include #include #include @@ -11,7 +12,10 @@ #include #include +#include #include +#include +#include struct timezone; @@ -43,13 +47,22 @@ static int gettimeofday(struct timeval *tp, struct 
timezone *tzp) { return 0; } +#define open _open +#define unlink _unlink +#define mkdir(p, f) _mkdir(p) +#define close _close +#define strdup _strdup +#define lstat _stat64 +#define fstat _fstat64 +#define stat __stat64 + #else #define PATH_MAX 1024 #include #include +#include #endif -#include #include "cache.h" #include "util/skein.h" diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 91c043a1d6..267eb5badc 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -83,7 +83,7 @@ static int ga_extcopy(GpuArray *dst, const GpuArray *src) { } /* Value below which a size_t multiplication will never overflow. */ -#define MUL_NO_OVERFLOW (1UL << (sizeof(size_t) * 4)) +#define MUL_NO_OVERFLOW (1ULL << (sizeof(size_t) * 4)) void GpuArray_fix_flags(GpuArray *a) { /* Only keep the writable flag */ @@ -330,9 +330,9 @@ static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str, const GpuArray *ind, int addr32) { strb sb = STRB_STATIC_INIT; int *atypes; - size_t nargs, apos; char *sz, *ssz; unsigned int i, i2; + unsigned int nargs, apos; int flags = GA_USE_CLUDA; int res; @@ -432,9 +432,9 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i, #if DEBUG char *errstr = NULL; #endif - size_t argp; GpuKernel k; unsigned int j; + unsigned int argp; int err, kerr = 0; int addr32 = 0; diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index d3226ea94e..a1e840c939 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -45,10 +45,10 @@ gpucontext *gpucontext_init(const char *name, int dev, int flags, int *ret) { if (res == NULL) return NULL; res->ops = ops; - if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, &res->blas_ops) != GA_NO_ERROR) + if (gpucontext_property(res, GA_CTX_PROP_BLAS_OPS, (void *)&res->blas_ops) != GA_NO_ERROR) res->blas_ops = NULL; res->blas_handle = NULL; - if (gpucontext_property(res, GA_CTX_PROP_COMM_OPS, &res->comm_ops) != GA_NO_ERROR) + if (gpucontext_property(res, GA_CTX_PROP_COMM_OPS, 
(void *)&res->comm_ops) != GA_NO_ERROR) res->comm_ops = NULL; res->extcopy_cache = NULL; return res; diff --git a/src/gpuarray_buffer_blas.c b/src/gpuarray_buffer_blas.c index c73f3c2f19..3fdc525e78 100644 --- a/src/gpuarray_buffer_blas.c +++ b/src/gpuarray_buffer_blas.c @@ -10,7 +10,7 @@ int gpublas_setup(gpucontext *ctx) { void gpublas_teardown(gpucontext *ctx) { if (ctx->blas_ops != NULL) - return ctx->blas_ops->teardown(ctx); + ctx->blas_ops->teardown(ctx); } const char *gpublas_error(gpucontext *ctx) { diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c index f3ce7ee261..1d93e5a155 100644 --- a/src/gpuarray_elemwise.c +++ b/src/gpuarray_elemwise.c @@ -131,8 +131,8 @@ static int gen_elemwise_basic_kernel(GpuKernel *k, gpucontext *ctx, strb sb = STRB_STATIC_INIT; unsigned int i, _i, j; int *ktypes; - size_t p; char *size = "ga_size", *ssize = "ga_ssize"; + unsigned int p; int flags = GA_USE_CLUDA; int res; diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c index 12eedb24a9..b1a185e3b7 100644 --- a/src/gpuarray_reduction.c +++ b/src/gpuarray_reduction.c @@ -644,7 +644,7 @@ static int maxandargmaxCompile (maxandargmax_ctx* ctx){ GA_SIZE, /* dstArgmaxOff */ GA_BUFFER /* dstArgmaxSteps */ }; - const size_t ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); + const unsigned int ARG_TYPECODES_LEN = sizeof(ARG_TYPECODES)/sizeof(*ARG_TYPECODES); const char* SRCS[1]; SRCS[0] = ctx->sourceCode; diff --git a/src/private.h b/src/private.h index abe9783de7..2de8742674 100644 --- a/src/private.h +++ b/src/private.h @@ -26,9 +26,9 @@ extern "C" { } #endif -#define ADDR32_MAX 4294967295 -#define SADDR32_MIN -2147483648 -#define SADDR32_MAX 2147483647 +#define ADDR32_MAX 4294967295L +#define SADDR32_MIN -2147483648L +#define SADDR32_MAX 2147483647L struct _gpuarray_buffer_ops; typedef struct _gpuarray_buffer_ops gpuarray_buffer_ops; diff --git a/src/private_config.h.in b/src/private_config.h.in index c3cd3a0195..f58a03edae 100644 --- 
a/src/private_config.h.in +++ b/src/private_config.h.in @@ -22,9 +22,7 @@ extern "C" { #ifdef _MSC_VER /* God damn Microsoft ... */ #define snprintf _snprintf -#endif - -#ifdef _MSC_VER +#define strdup _strdup /* MS VC++ 2008 does not support inline */ #define inline __inline #define alloca _alloca diff --git a/src/util/strb.c b/src/util/strb.c index 15cd496c4f..22da8bf637 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -1,6 +1,13 @@ +#define _CRT_SECURE_NO_WARNINGS #include #include +#ifdef _MSC_VER +#include +#define read _read +#define write _write +#else #include +#endif #include "util/strb.h" From 0678d76e02a8bc21f06a586bd4e719d633db49b9 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 15:47:21 -0400 Subject: [PATCH 21/41] Initialized the blas_ops pointer so that compilers stop freaking out. --- src/gpuarray_buffer_opencl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index 89b56f80a7..e1e8fdd82a 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -212,7 +212,7 @@ cl_command_queue cl_get_stream(gpucontext *ctx) { } static void cl_free_ctx(cl_ctx *ctx) { - gpuarray_blas_ops *blas_ops; + gpuarray_blas_ops *blas_ops = NULL; ASSERT_CTX(ctx); assert(ctx->refcnt != 0); From f761cfaeb0705d4505caa382ce24acc0ccbd86a6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 15:47:45 -0400 Subject: [PATCH 22/41] Add support for floats in GpuArray_dump(). 
--- src/gpuarray_array.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c index 267eb5badc..45a2a1186e 100644 --- a/src/gpuarray_array.c +++ b/src/gpuarray_array.c @@ -1096,6 +1096,9 @@ int GpuArray_fdump(FILE *fd, const GpuArray *a) { case GA_LONG: fprintf(fd, "%lld", (long long)*(int64_t *)p); break; + case GA_FLOAT: + fprintf(fd, "%f", *(float *)p); + break; case GA_SSIZE: fprintf(fd, "%" SPREFIX "d", *(ssize_t *)p); break; From eca4ffd3cf1e3056f70b4e6263db54c7427934fe Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 16:20:39 -0400 Subject: [PATCH 23/41] Fix mkstempp to actually return the filename. --- src/cache/disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 0b827aaefa..3ded869829 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -129,7 +129,7 @@ static int mkstempp(const char *dirp, char *template) { res = mkstemp(path); /* We need to copy the result path back */ - if (res == 0) + if (res != -1) memcpy(template, &path[strlen(dirp)], strlen(template)); return res; From b78e9d42805174bdf2e26f0da1076492eb71c14e Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 18:14:35 -0400 Subject: [PATCH 24/41] Fix windows annoyances. --- make.bat | 2 +- setup.py | 2 +- src/gpuarray/blas.h | 4 ++-- src/gpuarray_array_blas.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/make.bat b/make.bat index 1ea7aa55cd..16bc79f441 100755 --- a/make.bat +++ b/make.bat @@ -1,6 +1,6 @@ del bld mkdir bld cd bld -cmake .. -G "NMake Makefiles" +cmake .. -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=Release cmake --build . --config Release cd .. 
diff --git a/setup.py b/setup.py index 0084e04b4a..10f5996b54 100755 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ def __init__(self, *args, **kwargs): current_dir = os.path.abspath(os.path.dirname(__file__)) include_dirs += [os.path.join(current_dir, 'src')] - default_bin_dir = os.path.join(current_dir, 'lib', 'Release') + default_bin_dir = os.path.join(current_dir, 'lib') if not os.path.isdir(default_bin_dir): raise RuntimeError('default binary dir {} does not exist, you may need to build the C library in release mode'.format(default_bin_dir)) library_dirs += [default_bin_dir] diff --git a/src/gpuarray/blas.h b/src/gpuarray/blas.h index a8dd8096bc..a59d3bb885 100644 --- a/src/gpuarray/blas.h +++ b/src/gpuarray/blas.h @@ -9,8 +9,8 @@ extern "C" { #endif // only for vector-vector dot -GPUARRAY_PUBLIC int GpuArray_rdot( GpuArray *X, GpuArray *Y, - GpuArray *Z, int nocopy); +GPUARRAY_PUBLIC int GpuArray_rdot(GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy); #define GpuArray_hdot GpuArray_rdot #define GpuArray_sdot GpuArray_rdot #define GpuArray_ddot GpuArray_rdot diff --git a/src/gpuarray_array_blas.c b/src/gpuarray_array_blas.c index 8f9fb5919b..2e9a398e2d 100644 --- a/src/gpuarray_array_blas.c +++ b/src/gpuarray_array_blas.c @@ -5,8 +5,8 @@ #include "gpuarray/util.h" #include "gpuarray/error.h" -int GpuArray_rdot( GpuArray *X, GpuArray *Y, - GpuArray *Z, int nocopy) { +int GpuArray_rdot(GpuArray *X, GpuArray *Y, + GpuArray *Z, int nocopy) { GpuArray *Xp = X; GpuArray copyX; GpuArray *Yp = Y; From 88601038668cc607c954ceee9af1a4983ec213de Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 19:07:29 -0400 Subject: [PATCH 25/41] Make sure to properly terminate the cache path with a separator and work around lstat choking on terminating separators on windows. 
--- src/cache/disk.c | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 3ded869829..6d260ac559 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -387,20 +387,45 @@ cache *cache_disk(const char *dirpath, cache *mem, kread_fn kread, vread_fn vread) { struct stat st; disk_cache *res; - char *dirp = strdup(dirpath); + char *dirp; + size_t dirl = strlen(dirpath); + char sep = '/'; + + /* This trickery is to make sure the path ends with a separator */ +#ifdef _WIN32 + if (dirpath[dirl - 1] == '\\') + sep = '\\'; +#endif + + if (dirpath[dirl - 1] != sep) dirl++; + + dirp = malloc(dirl + 1); /* With the NUL */ if (dirp == NULL) return NULL; + strlcpy(dirp, dirpath, dirl + 1); + + if (dirp[dirl - 1] != sep) { + dirp[dirl - 1] = sep; + dirp[dirl] = '\0'; + } + if (ensurep(NULL, dirp) != 0) { free(dirp); return NULL; } - mkdir(dirpath, 0777); /* This may fail, but it's ok */ + /* For Windows mkdir and lstat which can't handle trailing separator */ + dirp[dirl - 1] = '\0'; - if (lstat(dirpath, &st) != 0) + mkdir(dirp, 0777); /* This may fail, but it's ok */ + + if (lstat(dirp, &st) != 0) return NULL; + /* Restore the good path at the end */ + dirp[dirl - 1] = sep; + if (!(st.st_mode & S_IFDIR)) return NULL; From 97d855339c78cfc30fc4566d9828d1dbfa95657f Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 14 Mar 2017 19:46:41 -0400 Subject: [PATCH 26/41] Make sure to open cache files in binary mode for windows. 
--- src/cache/disk.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 6d260ac559..db0917b1f4 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -61,6 +61,9 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) { #include #include #include + +#define O_BINARY 0 + #endif @@ -281,7 +284,7 @@ static int find_entry(disk_cache *c, const cache_key_t key, if (key_path(c, key, hexp)) return 0; - fd = openp(c->dirp, hexp, O_RDONLY, 0); + fd = openp(c->dirp, hexp, O_RDONLY|O_BINARY, 0); if (fd == -1) return 0; From 46c2f08fca5bcd5e9bc54351b68a4ef4b886abe5 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 18:32:06 -0400 Subject: [PATCH 27/41] Don't crash on key read failure. --- src/cache/disk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index db0917b1f4..f77128906f 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -337,7 +337,8 @@ static int find_entry(disk_cache *c, const cache_key_t key, return 1; } error: - c->c.kfree(k); + if (k) + c->c.kfree(k); b.s = ts; strb_clear(&b); return 0; From 01e5fa22706d48b8fde4e3aa39fa809c00d3f59d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 18:33:26 -0400 Subject: [PATCH 28/41] Error out if we reach EOF before the passed-in length. 
--- src/util/strb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/util/strb.c b/src/util/strb.c index 22da8bf637..c8ae4da25d 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -73,8 +73,8 @@ void strb_read(strb *sb, int fd, size_t sz) { sb->l += sz; while (sz) { res = read(fd, b, sz); - if (res == -1) { - if (errno == EAGAIN || errno == EINTR) + if (res == -1 || res == 0) { + if (res == -1 && errno == EAGAIN || errno == EINTR) continue; strb_seterror(sb); return; From f03754961edddc597c3c44e78ec5c453159bd84c Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 19:14:47 -0400 Subject: [PATCH 29/41] Make sure to open files in binary mode. --- src/gpuarray_mkstemp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_mkstemp.c b/src/gpuarray_mkstemp.c index ac5ea10940..5e2e8ca520 100644 --- a/src/gpuarray_mkstemp.c +++ b/src/gpuarray_mkstemp.c @@ -8,6 +8,8 @@ #include #define open _open #define mktemp _mktemp +#else +#define O_BINARY 0 #endif int mkstemp(char *path) { @@ -18,7 +20,7 @@ int mkstemp(char *path) { do { tmp = mktemp(path); if (tmp == NULL) return -1; - res = open(path, O_CREAT|O_EXCL|O_RDWR, S_IREAD|S_IWRITE); + res = open(path, O_CREAT|O_EXCL|O_RDWR|O_BINARY, S_IREAD|S_IWRITE); if (res != -1 || errno != EEXIST) return res; } while (--tries); From a9ebffa0d0056b3a4d6e8fac12dc27f8e8a7fbfd Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 19:17:28 -0400 Subject: [PATCH 30/41] Make sure to open files in binary mode for writing too. 
--- src/cache/disk.c | 7 +++++-- src/util/strb.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index f77128906f..a7f5dbbcd7 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -63,6 +63,7 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) { #include #define O_BINARY 0 +#define setmode(a, b) #endif @@ -131,9 +132,11 @@ static int mkstempp(const char *dirp, char *template) { res = mkstemp(path); - /* We need to copy the result path back */ - if (res != -1) + /* We need to copy the result path back and set binary mode (for windows) */ + if (res != -1) { + setmode(res, O_BINARY); memcpy(template, &path[strlen(dirp)], strlen(template)); + } return res; } diff --git a/src/util/strb.c b/src/util/strb.c index c8ae4da25d..dda9dcdfc2 100644 --- a/src/util/strb.c +++ b/src/util/strb.c @@ -74,7 +74,7 @@ void strb_read(strb *sb, int fd, size_t sz) { while (sz) { res = read(fd, b, sz); if (res == -1 || res == 0) { - if (res == -1 && errno == EAGAIN || errno == EINTR) + if (res == -1 && (errno == EAGAIN || errno == EINTR)) continue; strb_seterror(sb); return; From 3c6fb5dc645c0c8364f4586634b9b5e2e48deeb0 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 16 Mar 2017 20:15:10 -0400 Subject: [PATCH 31/41] Fix ntohull for platforms that have signed chars. 
--- src/cache/disk.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index a7f5dbbcd7..6a5e086a10 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -63,7 +63,7 @@ static int gettimeofday(struct timeval *tp, struct timezone *tzp) { #include #define O_BINARY 0 -#define setmode(a, b) +#define _setmode(a, b) #endif @@ -84,7 +84,8 @@ typedef struct _disk_cache { } disk_cache; -static unsigned long long ntohull(const char *in) { +static unsigned long long ntohull(const char *_in) { + const unsigned char *in = (const unsigned char *)_in; return ((unsigned long long)in[0] << 56 | (unsigned long long)in[1] << 48 | (unsigned long long)in[2] << 40 | (unsigned long long)in[3] << 32 | (unsigned long long)in[4] << 24 | (unsigned long long)in[5] << 16 | @@ -92,14 +93,14 @@ static unsigned long long ntohull(const char *in) { } static void htonull(unsigned long long in, char *out) { - out[0] = (char)(in >> 56); - out[1] = (char)(in >> 48); - out[2] = (char)(in >> 40); - out[3] = (char)(in >> 32); - out[4] = (char)(in >> 24); - out[5] = (char)(in >> 16); - out[6] = (char)(in >> 8); - out[7] = (char)(in); + out[0] = (unsigned char)(in >> 56); + out[1] = (unsigned char)(in >> 48); + out[2] = (unsigned char)(in >> 40); + out[3] = (unsigned char)(in >> 32); + out[4] = (unsigned char)(in >> 24); + out[5] = (unsigned char)(in >> 16); + out[6] = (unsigned char)(in >> 8); + out[7] = (unsigned char)(in); } static int catp(char *path, const char *dirp, const char *rpath) { @@ -134,7 +135,7 @@ static int mkstempp(const char *dirp, char *template) { /* We need to copy the result path back and set binary mode (for windows) */ if (res != -1) { - setmode(res, O_BINARY); + _setmode(res, O_BINARY); memcpy(template, &path[strlen(dirp)], strlen(template)); } From 26b85ee432bcbc261f5760ac55cdc3be2b15eeb6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Mon, 20 Mar 2017 17:33:42 -0400 Subject: [PATCH 32/41] Make 
GpuKernel_binary work again. --- src/gpuarray/buffer.h | 17 +++++++++++++++-- src/gpuarray/error.h | 1 - src/gpuarray/kernel.h | 1 - src/gpuarray_buffer.c | 2 +- src/gpuarray_buffer_cuda.c | 19 +++++++++++++++---- src/gpuarray_buffer_opencl.c | 29 +++++++++++++++++++++++++++++ src/gpuarray_error.c | 1 - src/private.h | 1 + src/private_cuda.h | 2 ++ 9 files changed, 63 insertions(+), 10 deletions(-) diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h index 34878d503a..800756a072 100644 --- a/src/gpuarray/buffer.h +++ b/src/gpuarray/buffer.h @@ -494,9 +494,22 @@ GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n, size_t shared, void **args); /** - * Get the kernel binary (REMOVED). + * (Deprecated) Get the kernel binary. * - * Always returns GA_DEPRECATED_ERROR. + * This function is deprecated and will be removed in the next release. + * + * This can be used to cache kernel binaries after compilation for a + * specific device. The kernel can be recreated by calling + * kernel_alloc with the binary and size and passing `GA_USE_BINARY` + * as the use flags. + * + * The returned pointer is allocated and must be freed by the caller. + * + * \param k kernel + * \param sz size of the returned binary + * \param obj pointer to the binary for the kernel. + * + * \returns GA_NO_ERROR or an error code if an error occurred. 
*/ GPUARRAY_PUBLIC int gpukernel_binary(gpukernel *k, size_t *sz, void **obj); diff --git a/src/gpuarray/error.h b/src/gpuarray/error.h index 84c852a257..af963c1531 100644 --- a/src/gpuarray/error.h +++ b/src/gpuarray/error.h @@ -36,7 +36,6 @@ enum ga_error { GA_COMM_ERROR, GA_XLARGE_ERROR, GA_LOAD_ERROR, - GA_DEPRECATED_ERROR, /* Add more error types if needed, but at the end */ /* Don't forget to sync with Gpu_error() */ }; diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h index da779123b9..f88d74ffc6 100644 --- a/src/gpuarray/kernel.h +++ b/src/gpuarray/kernel.h @@ -107,7 +107,6 @@ GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n, const size_t *gs, const size_t *ls, size_t shared, void **args); -/* Deprecated and to be removed */ GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **obj); diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c index a1e840c939..dee65f130c 100644 --- a/src/gpuarray_buffer.c +++ b/src/gpuarray_buffer.c @@ -187,7 +187,7 @@ int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs, } int gpukernel_binary(gpukernel *k, size_t *sz, void **obj) { - return GA_DEPRECATED_ERROR; + return ((partial_gpukernel *)k)->ctx->ops->kernel_binary(k, sz, obj); } int gpukernel_property(gpukernel *k, int prop_id, void *res) { diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 7b1f540c26..fe0c100508 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1151,6 +1151,7 @@ static void _cuda_freekernel(gpukernel *k) { } CLEAR(k); free(k->args); + free(k->bin); free(k->types); free(k); } @@ -1264,13 +1265,15 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, FAIL(NULL, GA_SYS_ERROR); } + /* Don't clear bin after this */ + res->bin_sz = bin.l; + res->bin = bin.s; res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); strb_clear(&src); - strb_clear(&bin); 
cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } @@ -1279,7 +1282,6 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, if (res->args == NULL) { _cuda_freekernel(res); strb_clear(&src); - strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } @@ -1288,11 +1290,9 @@ static gpukernel *cuda_newkernel(gpucontext *c, unsigned int count, if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); strb_clear(&src); - strb_clear(&bin); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } - strb_clear(&bin); ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { @@ -1390,6 +1390,16 @@ static int cuda_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } +static int cuda_kernelbin(gpukernel *k, size_t *sz, void **obj) { + void *res = malloc(k->bin_sz); + if (res == NULL) + return GA_MEMORY_ERROR; + memcpy(res, k->bin, k->bin_sz); + *sz = k->bin_sz; + *obj = res; + return GA_NO_ERROR; +} + static int cuda_sync(gpudata *b) { cuda_context *ctx = (cuda_context *)b->ctx; int err = GA_NO_ERROR; @@ -1792,6 +1802,7 @@ const gpuarray_buffer_ops cuda_ops = {cuda_get_platform_count, cuda_freekernel, cuda_kernelsetarg, cuda_callkernel, + cuda_kernelbin, cuda_sync, cuda_transfer, cuda_property, diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c index e1e8fdd82a..3e87f23c41 100644 --- a/src/gpuarray_buffer_opencl.c +++ b/src/gpuarray_buffer_opencl.c @@ -1076,6 +1076,34 @@ static int cl_callkernel(gpukernel *k, unsigned int n, return GA_NO_ERROR; } +static int cl_kernelbin(gpukernel *k, size_t *sz, void **obj) { + cl_ctx *ctx = k->ctx; + cl_program p; + size_t rsz; + void *res; + + ASSERT_KER(k); + ASSERT_CTX(ctx); + + ctx->err = clGetKernelInfo(k->k, CL_KERNEL_PROGRAM, sizeof(p), &p, NULL); + if (ctx->err != CL_SUCCESS) + return GA_IMPL_ERROR; + ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARY_SIZES, sizeof(rsz), &rsz, NULL); + if (ctx->err != CL_SUCCESS) + return GA_IMPL_ERROR; + res = malloc(rsz); + if (res == 
NULL) + return GA_MEMORY_ERROR; + ctx->err = clGetProgramInfo(p, CL_PROGRAM_BINARIES, sizeof(res), &res, NULL); + if (ctx->err != CL_SUCCESS) { + free(res); + return GA_IMPL_ERROR; + } + *sz = rsz; + *obj = res; + return GA_NO_ERROR; +} + static int cl_sync(gpudata *b) { cl_ctx *ctx = (cl_ctx *)b->ctx; @@ -1437,6 +1465,7 @@ const gpuarray_buffer_ops opencl_ops = {cl_get_platform_count, cl_releasekernel, cl_setkernelarg, cl_callkernel, + cl_kernelbin, cl_sync, cl_transfer, cl_property, diff --git a/src/gpuarray_error.c b/src/gpuarray_error.c index ddebd3e9dc..b7d5011f5b 100644 --- a/src/gpuarray_error.c +++ b/src/gpuarray_error.c @@ -25,7 +25,6 @@ const char *gpuarray_error_str(int err) { case GA_COMM_ERROR: return "Error in collectives call"; case GA_XLARGE_ERROR: return "Input size too large for operation"; case GA_LOAD_ERROR: return "Error loading library"; - case GA_DEPRECATED_ERROR: return "Deprecated (removed) functionality"; default: return "Unknown GA error"; } } diff --git a/src/private.h b/src/private.h index 2de8742674..820ebb6287 100644 --- a/src/private.h +++ b/src/private.h @@ -100,6 +100,7 @@ struct _gpuarray_buffer_ops { const size_t *gs, const size_t *ls, size_t shared, void **args); + int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj); int (*buffer_sync)(gpudata *b); int (*buffer_transfer)(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz); diff --git a/src/private_cuda.h b/src/private_cuda.h index a0b4557977..ad9ff7f8ae 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -137,6 +137,8 @@ struct _gpukernel { CUmodule m; CUfunction k; void **args; + size_t bin_sz; + void *bin; int *types; unsigned int argcount; unsigned int refcnt; From 330703150aaaa7614faa8a524319c9e8977e8ec1 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 11:54:09 -0400 Subject: [PATCH 33/41] Add a description of the purpose of make.bat. 
--- make.bat | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/make.bat b/make.bat index 16bc79f441..3402206e00 100755 --- a/make.bat +++ b/make.bat @@ -1,3 +1,7 @@ +REM This helps repetitive builds on windows +REM It needs the compiler you want to use to be available in the shell +REM and it will build a release version + del bld mkdir bld cd bld From e62616b4d6a04c2ba3fdcba4f3d000df3ff7d0a6 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 11:54:34 -0400 Subject: [PATCH 34/41] Clean up the cache management script. --- bin/gpuarray-cache | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/bin/gpuarray-cache b/bin/gpuarray-cache index 04b7e8e68a..3cfb429a76 100644 --- a/bin/gpuarray-cache +++ b/bin/gpuarray-cache @@ -1,11 +1,11 @@ #!/usr/bin/env python import os +import sys -def clean(max_size): +def clean(max_size, path): content = [] - for root, dirs, files in os.walk(os.environ.get('GPUARRAY_CACHE', - '~/.gpuarray/cache/')): + for root, dirs, files in os.walk(path): for file in files: fpath = os.path.join(root, file) st = os.stat(fpath) @@ -25,18 +25,18 @@ SUFFIXES = {'B': 1, 'K': 1 << 10, 'M': 1 << 20, 'G': 1 << 30, 'T': 1 << 40, def get_size(s): i = 0 - while i < len(s) and (s[i].isdigit() or s[i] == '.'): - i += 1 - num = s[:i] - suf = s[i:] + s = s.strip() + if s[-1].upper() in SUFFIXES: + num = s[:-1] + suf = s[-1].upper() + else: + num = s + suf = "" num = float(num) if suf != "": - letter = suf.strip().upper() - if letter not in SUFFIXES: - raise ValueError("can't interpret %r" % init) - mult = SUFFIXES[letter] + mult = SUFFIXES[suf] else: - mult = 0 + mult = 1 return int(num * mult) @@ -46,6 +46,10 @@ if __name__ == '__main__': parser = argparse.ArgumentParser(description='libgpuarray cache maintenance utility') parser.add_argument('-s', '--max_size', help='Set the maximum size for pruning (in bytes with suffixes: K, M, G, ...)') args = parser.parse_args() + path = 
os.environ.get('GPUARRAY_CACHE_PATH', None) + if path is None: + print("You need to set GPUARRAY_CACHE_PATH so that this programs knows which path to clean.") + sys.exit(1) - clean(get_size(args.max_size)) + clean(get_size(args.max_size), path) From 1a1cab6a1307dd0f5fcfa2dc3bed73cc532573a3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 11:56:17 -0400 Subject: [PATCH 35/41] Add the appropriate bumps. --- setup.py | 4 ++-- src/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 10f5996b54..7ec8d3af93 100755 --- a/setup.py +++ b/setup.py @@ -5,8 +5,8 @@ MAJOR = 0 MINOR = 6 -PATCH = 2 -SUFFIX = '' +PATCH = 3 +SUFFIX = '.dev0' # include the '.' FULLVERSION = '%d.%d.%d%s' % (MAJOR, MINOR, PATCH, SUFFIX) try: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0ed776533d..a45db024ff 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -89,7 +89,7 @@ set_target_properties(gpuarray PROPERTIES INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib MACOSX_RPATH OFF # This is the shared library version - VERSION 2.0 + VERSION 2.1 ) add_library(gpuarray-static STATIC ${GPUARRAY_SRC}) From 878566be656aa4eb116c2f5aea70c6818b7e37f2 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 12:13:42 -0400 Subject: [PATCH 36/41] Add comments and fix style. 
--- src/gpuarray_buffer_cuda.c | 2 +- src/private_cuda.h | 2 +- src/util/strb.h | 11 +++++++---- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index fe0c100508..4612573f5f 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1034,7 +1034,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { if (strb_ensure(ptx, buflen) == 0) { err = nvrtcGetPTX(prog, ptx->s+ptx->l); - if (err == NVRTC_SUCCESS) ptx->l = buflen; + if (err == NVRTC_SUCCESS) ptx->l += buflen; } end: diff --git a/src/private_cuda.h b/src/private_cuda.h index ad9ff7f8ae..dc81ceba52 100644 --- a/src/private_cuda.h +++ b/src/private_cuda.h @@ -68,7 +68,7 @@ typedef struct _cuda_context { CUstream mem_s; gpudata *freeblocks; cache *kernel_cache; - cache *disk_cache; + cache *disk_cache; // This is per-context to avoid lock contention unsigned int enter; unsigned char major; unsigned char minor; diff --git a/src/util/strb.h b/src/util/strb.h index 01ea7a2495..3289de5796 100644 --- a/src/util/strb.h +++ b/src/util/strb.h @@ -46,7 +46,7 @@ strb *strb_alloc(size_t s); * * Don't call this for stack of global declarations, see strb_clear() instead. */ -void strb_free(strb *); +void strb_free(strb *sb); /* * Return a pointer to a dynamically allocated strb with a default @@ -96,7 +96,7 @@ static inline void strb_clear(strb *sb) { * This should almost never be called directly. Use strb_ensure() * instead. */ -int strb_grow(strb *, size_t s); +int strb_grow(strb *sb, size_t s); /* * Make sure there is space to store at least `s` bytes of data after @@ -159,14 +159,17 @@ static inline void strb_appendb(strb *sb, const strb *sb2) { * * A format error will place the strb in error mode. */ -void strb_appendf(strb *, const char *f, ...); +void strb_appendf(strb *sb, const char *f, ...); /* * Reads from the file specified by the given file descriptor. 
* + * This will read `sz` bytes from the file descriptor. Insufficient + * data is handled as a read error. + * * A read error will place the strb in error mode. */ -void strb_read(strb *, int fd, size_t sz); +void strb_read(strb *sb, int fd, size_t sz); /* * Write the content of an strb to the specified file descriptor. From 6934341bc62d7d286e7ed849c1af99d4ec2caec3 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 12:36:06 -0400 Subject: [PATCH 37/41] Add some explanatory comments. --- src/cache/disk.c | 5 +++++ src/gpuarray_buffer_cuda.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cache/disk.c b/src/cache/disk.c index 6a5e086a10..601690a421 100644 --- a/src/cache/disk.c +++ b/src/cache/disk.c @@ -84,6 +84,7 @@ typedef struct _disk_cache { } disk_cache; +/* Convert unsigned long long from network to host order */ static unsigned long long ntohull(const char *_in) { const unsigned char *in = (const unsigned char *)_in; return ((unsigned long long)in[0] << 56 | (unsigned long long)in[1] << 48 | @@ -92,6 +93,7 @@ static unsigned long long ntohull(const char *_in) { (unsigned long long)in[6] << 8 | (unsigned long long)in[7]); } +/* Convert unsigned long long from host to network order */ static void htonull(unsigned long long in, char *out) { out[0] = (unsigned char)(in >> 56); out[1] = (unsigned char)(in >> 48); @@ -103,6 +105,8 @@ static void htonull(unsigned long long in, char *out) { out[7] = (unsigned char)(in); } +/* Concatenate prefix and suffix into a single path string while + checking for overflow */ static int catp(char *path, const char *dirp, const char *rpath) { if (strlcpy(path, dirp, PATH_MAX) >= PATH_MAX) { errno = ENAMETOOLONG; @@ -115,6 +119,7 @@ static int catp(char *path, const char *dirp, const char *rpath) { return 0; } +/* open() for a path specified by the concatenation of dirp and rpath */ static int openp(const char *dirp, const char *rpath, int flags, int mode) { char path[PATH_MAX]; diff --git 
a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 4612573f5f..8614eced61 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1025,7 +1025,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) { strb_appends(log, "NVRTC compile log::\n"); if (strb_ensure(log, buflen) == 0) if (nvrtcGetProgramLog(prog, log->s+log->l) == NVRTC_SUCCESS) - log->l += buflen - 1; + log->l += buflen - 1; // Remove the final NUL strb_appendc(log, '\n'); } From 137395c005a430f5a951be8397d5eea949b8f73d Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Thu, 30 Mar 2017 18:56:48 -0400 Subject: [PATCH 38/41] Fix aliasing of src strb between the caches. --- src/gpuarray_buffer_cuda.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 8614eced61..a406a85852 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -1120,17 +1120,23 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { err = make_bin(ctx, &ptx, bin, log); if (err != GA_NO_ERROR) return err; if (ctx->disk_cache) { - pk = memdup(&k, sizeof(k)); + pk = calloc(sizeof(kernel_key), 1); if (pk == NULL) return GA_NO_ERROR; + memcpy(pk->bin_id, k.bin_id, 64); + strb_appendb(&pk->src, src); + if (strb_error(&pk->src)) { + key_free((cache_key_t)pk); + return GA_NO_ERROR; + } cbin = strb_alloc(bin->l); if (cbin == NULL) { - free(pk); + key_free((cache_key_t)pk); return GA_NO_ERROR; } strb_appendb(cbin, bin); if (strb_error(cbin)) { - free(pk); + key_free((cache_key_t)pk); strb_free(cbin); return GA_NO_ERROR; } From a708ed715ada7976d69ec44cf2ad4507b5e6f36b Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 4 Apr 2017 17:22:53 -0400 Subject: [PATCH 39/41] Add shitty error messages when the disk cache fails. They will be upgraded to better messages with the PR about better error messages. 
--- src/gpuarray_buffer_cuda.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index a406a85852..bbef0ab2a0 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -224,14 +224,19 @@ cuda_context *cuda_make_ctx(CUcontext ctx, int flags) { (cache_hash_fn)key_hash, (cache_freek_fn)key_free, (cache_freev_fn)strb_free); - if (mem_cache == NULL) + if (mem_cache == NULL) { + // TODO use better error messages when they are available. + fprintf(stderr, "Error initializing disk cache, disabling\n"); goto fail_disk_cache; + } res->disk_cache = cache_disk(cache_path, mem_cache, (kwrite_fn)key_write, (vwrite_fn)kernel_write, (kread_fn)key_read, (vread_fn)kernel_read); if (res->disk_cache == NULL) { + // TODO use better error messages when they are available. + fprintf(stderr, "Error initializing disk cache, disabling\n"); cache_destroy(mem_cache); goto fail_disk_cache; } @@ -1121,26 +1126,38 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { if (err != GA_NO_ERROR) return err; if (ctx->disk_cache) { pk = calloc(sizeof(kernel_key), 1); - if (pk == NULL) + if (pk == NULL) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); return GA_NO_ERROR; + } memcpy(pk->bin_id, k.bin_id, 64); strb_appendb(&pk->src, src); if (strb_error(&pk->src)) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); key_free((cache_key_t)pk); return GA_NO_ERROR; } cbin = strb_alloc(bin->l); if (cbin == NULL) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); key_free((cache_key_t)pk); return GA_NO_ERROR; } strb_appendb(cbin, bin); if (strb_error(cbin)) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); key_free((cache_key_t)pk); strb_free(cbin); return GA_NO_ERROR; } - 
cache_add(ctx->disk_cache, pk, cbin); + if (cache_add(ctx->disk_cache, pk, cbin)) { + // TODO use better error messages + fprintf(stderr, "Error adding kernel to disk cache\n"); + } } return GA_NO_ERROR; From e1bd3c803c5ab9bd2bfcb1721bf6f741686e07f7 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Tue, 4 Apr 2017 18:20:51 -0400 Subject: [PATCH 40/41] Add additional info to the kernel cache key. --- src/gpuarray_buffer_cuda.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index bbef0ab2a0..894de1a8f3 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -50,10 +50,17 @@ static int detect_arch(const char *prefix, char *ret, CUresult *err); static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size); typedef struct _kernel_key { + uint8_t version; + uint8_t debug; + uint8_t major; + uint8_t minor; + uint32_t reserved; char bin_id[64]; strb src; } kernel_key; +#define KERNEL_KEY_MM (sizeof(kernel_key) - sizeof(strb)) + static void key_free(cache_key_t _k) { kernel_key *k = (kernel_key *)_k; strb_clear(&k->src); @@ -70,36 +77,36 @@ static uint32_t strb_hash(strb *k) { } static int key_eq(kernel_key *k1, kernel_key *k2) { - return (memcmp(k1->bin_id, k2->bin_id, 64) == 0 && + return (memcmp(k1, k2, KERNEL_KEY_MM) == 0 && strb_eq(&k1->src, &k2->src)); } static int key_hash(kernel_key *k) { XXH32_state_t state; XXH32_reset(&state, 42); - XXH32_update(&state, k->bin_id, 64); + XXH32_update(&state, k, KERNEL_KEY_MM); XXH32_update(&state, k->src.s, k->src.l); return XXH32_digest(&state); } static int key_write(strb *res, kernel_key *k) { - strb_appendn(res, k->bin_id, 64); + strb_appendn(res, (const char *)k, KERNEL_KEY_MM); strb_appendb(res, &k->src); return strb_error(res); } static kernel_key *key_read(const strb *b) { kernel_key *k; - if (b->l < 64) return NULL; + if (b->l < KERNEL_KEY_MM) return NULL; k = calloc(1, 
sizeof(*k)); if (k == NULL) return NULL; - if (strb_ensure(&k->src, b->l - 64) != 0) { + if (strb_ensure(&k->src, b->l - KERNEL_KEY_MM) != 0) { strb_clear(&k->src); free(k); return NULL; } - memcpy(k->bin_id, b->s, 64); - strb_appendn(&k->src, b->s+64, b->l-64); + memcpy(k->bin_id, b->s, KERNEL_KEY_MM); + strb_appendn(&k->src, b->s + KERNEL_KEY_MM, b->l - KERNEL_KEY_MM); return k; } @@ -1108,6 +1115,13 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { kernel_key *pk; int err; + memset(&k, 0, sizeof(k)); + k.version = 0; +#ifdef DEBUG + k.debug = 1; +#endif + k.major = ctx->major; + k.minor = ctx->minor; memcpy(k.bin_id, ctx->bin_id, 64); memcpy(&k.src, src, sizeof(strb)); @@ -1131,7 +1145,7 @@ static int compile(cuda_context *ctx, strb *src, strb* bin, strb *log) { fprintf(stderr, "Error adding kernel to disk cache\n"); return GA_NO_ERROR; } - memcpy(pk->bin_id, k.bin_id, 64); + memcpy(pk, &k, KERNEL_KEY_MM); strb_appendb(&pk->src, src); if (strb_error(&pk->src)) { // TODO use better error messages From 9306b9692904fa0481aa757e0cc177b09dd89701 Mon Sep 17 00:00:00 2001 From: Arnaud Bergeron Date: Wed, 5 Apr 2017 13:58:30 -0400 Subject: [PATCH 41/41] Fix bug in key_read and explain KERNEL_KEY_MM. 
--- src/gpuarray_buffer_cuda.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c index 894de1a8f3..47bc7ac526 100644 --- a/src/gpuarray_buffer_cuda.c +++ b/src/gpuarray_buffer_cuda.c @@ -59,6 +59,7 @@ typedef struct _kernel_key { strb src; } kernel_key; +/* Size of the kernel_key that we can memcopy to duplicate */ #define KERNEL_KEY_MM (sizeof(kernel_key) - sizeof(strb)) static void key_free(cache_key_t _k) { @@ -100,12 +101,16 @@ static kernel_key *key_read(const strb *b) { if (b->l < KERNEL_KEY_MM) return NULL; k = calloc(1, sizeof(*k)); if (k == NULL) return NULL; + memcpy(k, b->s, KERNEL_KEY_MM); + if (k->version != 0) { + free(k); + return NULL; + } if (strb_ensure(&k->src, b->l - KERNEL_KEY_MM) != 0) { strb_clear(&k->src); free(k); return NULL; } - memcpy(k->bin_id, b->s, KERNEL_KEY_MM); strb_appendn(&k->src, b->s + KERNEL_KEY_MM, b->l - KERNEL_KEY_MM); return k; }