From 3572161bd72234e87984922fc801ddfaaa47ce6d Mon Sep 17 00:00:00 2001
From: badman74
Date: Tue, 14 Apr 2015 17:37:16 -0500
Subject: [PATCH 1/2] removed Lyra2.cl and Lyra2RE.cl

Git seems to ignore a case-only rename of these files, so this commit
removes the old files outright; the next commit re-adds them under their
lowercase names.
---
 kernel/Lyra2.cl   | 175 -----------------
 kernel/Lyra2RE.cl | 470 ----------------------------------------
 2 files changed, 645 deletions(-)
 delete mode 100644 kernel/Lyra2.cl
 delete mode 100644 kernel/Lyra2RE.cl

diff --git a/kernel/Lyra2.cl b/kernel/Lyra2.cl
deleted file mode 100644
index 334f6822f..000000000
--- a/kernel/Lyra2.cl
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
-* Lyra2 kernel implementation.
-*
-* ==========================(LICENSE BEGIN)============================
-* Copyright (c) 2014 djm34
-*
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to deal in the Software without restriction, including
-* without limitation the rights to use, copy, modify, merge, publish,
-* distribute, sublicense, and/or sell copies of the Software, and to
-* permit persons to whom the Software is furnished to do so, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-* -* ===========================(LICENSE END)============================= -* -* @author djm34 -*/ - -/*Blake2b IV Array*/ -__constant static const sph_u64 blake2b_IV[8] = -{ - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL -}; - -/*Blake2b's rotation*/ - -static inline uint2 ror2(uint2 v, unsigned a) { - uint2 result; - unsigned n = 64 - a; - if (n == 32) { return (uint2)(v.y,v.x); } - if (n < 32) { - result.y = ((v.y << (n)) | (v.x >> (32 - n))); - result.x = ((v.x << (n)) | (v.y >> (32 - n))); - } - else { - result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); - result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); - } - return result; -} -static inline uint2 ror2l(uint2 v, unsigned a) { - uint2 result; - result.y = ((v.x << (32-a)) | (v.y >> (a))); - result.x = ((v.y << (32-a)) | (v.x >> (a))); - return result; -} -static inline uint2 ror2r(uint2 v, unsigned a) { - uint2 result; - result.y = ((v.y << (64-a)) | (v.x >> (a-32))); - result.x = ((v.x << (64-a)) | (v.y >> (a-32))); - return result; -} -/* -#define G(a,b,c,d) \ - do { \ -a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = d.yx; \ -c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2l(b, 24); \ -a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = ror2l(d, 16); \ -c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2r(b, 63); \ - } while(0) -*/ -#define G(a,b,c,d) \ - do { \ -a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = d.yx; \ -c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = as_uint2(as_uchar8(b).s34567012); \ -a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = ror2l(d, 16); \ -c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2r(b, 63); \ - } while(0) - -/*One Round of the Blake2b's compression function*/ -#define round_lyra(v) \ - do { \ - G(v[ 0],v[ 4],v[ 8],v[12]); \ - G(v[ 1],v[ 5],v[ 9],v[13]); \ - G(v[ 2],v[ 6],v[10],v[14]); \ - G(v[ 3],v[ 7],v[11],v[15]); \ - G(v[ 0],v[ 5],v[10],v[15]); \ - G(v[ 1],v[ 6],v[11],v[12]); \ - G(v[ 2],v[ 7],v[ 8],v[13]); \ - G(v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) - - -#define reduceDuplexRowSetup(rowIn, rowInOut, rowOut) \ - { \ - for (int i = 0; i < 8; i++) \ - { \ -\ - for (int j = 0; j < 12; j++) {state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) + as_ulong(Matrix[12 * i + j][rowInOut]));} \ - round_lyra(state); \ - for (int j = 0; j < 12; j++) {Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j];} \ -\ - Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ - Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ - Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ - Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ - Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ - Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ - Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ - Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ - Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ - Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ - Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ - Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ - } \ - \ - } - -#define reduceDuplexRow(rowIn, rowInOut, rowOut) \ - { \ - for (int i = 0; i < 8; i++) \ - { \ - for (int j = 0; j < 12; j++) \ - state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) + as_ulong(Matrix[12 * i + j][rowInOut])); \ - \ - round_lyra(state); \ - for (int j = 0; j < 12; j++) {Matrix[j + 12 * i][rowOut] ^= state[j];} \ -\ - Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ - Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ - Matrix[2 + 12 * 
i][rowInOut] ^= state[1]; \ - Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ - Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ - Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ - Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ - Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ - Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ - Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ - Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ - Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ - } \ - \ - } -#define absorbblock(in) { \ - state[0] ^= Matrix[0][in]; \ - state[1] ^= Matrix[1][in]; \ - state[2] ^= Matrix[2][in]; \ - state[3] ^= Matrix[3][in]; \ - state[4] ^= Matrix[4][in]; \ - state[5] ^= Matrix[5][in]; \ - state[6] ^= Matrix[6][in]; \ - state[7] ^= Matrix[7][in]; \ - state[8] ^= Matrix[8][in]; \ - state[9] ^= Matrix[9][in]; \ - state[10] ^= Matrix[10][in]; \ - state[11] ^= Matrix[11][in]; \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - round_lyra(state); \ - } diff --git a/kernel/Lyra2RE.cl b/kernel/Lyra2RE.cl deleted file mode 100644 index 923a9dfa6..000000000 --- a/kernel/Lyra2RE.cl +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Lyra2RE kernel implementation. - * - * ==========================(LICENSE BEGIN)============================ - * Copyright (c) 2014 djm34 - * Copyright (c) 2014 James Lovejoy - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- * - * ===========================(LICENSE END)============================= - * - * @author djm34 - */ - -#pragma OPENCL EXTENSION cl_amd_printf : enable - -#ifndef LYRA2RE_CL -#define LYRA2RE_CL - -#if __ENDIAN_LITTLE__ -#define SPH_LITTLE_ENDIAN 1 -#else -#define SPH_BIG_ENDIAN 1 -#endif - -#define SPH_UPTR sph_u64 - -typedef unsigned int sph_u32; -typedef int sph_s32; -#ifndef __OPENCL_VERSION__ -typedef unsigned long long sph_u64; -typedef long long sph_s64; -#else -typedef unsigned long sph_u64; -typedef long sph_s64; -#endif - -#define SPH_64 1 -#define SPH_64_TRUE 1 - -#define SPH_C32(x) ((sph_u32)(x ## U)) -#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) - -#define SPH_C64(x) ((sph_u64)(x ## UL)) -#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) - -//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) -//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) -//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) -//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) - -#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6 -#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n)) -#define SPH_ROTL64(x,n) rotate(x,(ulong)n) -//#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n)) - -/* -inline ulong rol64 (ulong l,ulong n) { - if (n<=32) { - uint2 t = rotate(as_uint2(l), (n)); - return as_ulong((uint2)(bitselect(t.s0, t.s1, (uint)(1 << (n)) - 1), bitselect(t.s0, t.s1, (uint)(~((1 << (n)) - 1))))); - } else { - uint2 t = rotate(as_uint2(l), (n - 32)); - return as_ulong((uint2)(bitselect(t.s1, t.s0, (uint)(1 << (n - 32)) - 1), bitselect(t.s1, t.s0, (uint)(~((1 << (n - 32)) - 1))))); - } -} -*/ - -/* -static inline ulong rol64(const ulong vw, unsigned n) { - uint2 result; - uint2 v=as_uint2(vw); - if (n == 32) { return as_ulong((uint2)(v.y, v.x)); } - if (n < 32) { - result.y = ( (v.y << (n)) | (v.x >> (32 - n)) ); - result.x = ( (v.x << (n)) | (v.y >> (32 - n)) ); - } - else { - result.y = ( (v.x << (n - 32)) | (v.y >> (64 - n)) ); - result.x = ( (v.y << (n - 32)) | (v.x >> (64 - n)) ); - } - return as_ulong(result); -} -*/ - -static inline sph_u64 ror64(sph_u64 vw, unsigned a) { - uint2 result; - uint2 v = as_uint2(vw); - unsigned n = (unsigned)(64 - a); - if (n == 32) { return as_ulong((uint2)(v.y, v.x)); } - if (n < 32) { - result.y = ((v.y << (n)) | (v.x >> (32 - n))); - result.x = ((v.x << (n)) | (v.y >> (32 - n))); - } else { - result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); - result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); - } - return as_ulong(result); -} - -#define SPH_ROTR64(l,n) ror64(l, n) - -#include "blake256.cl" -#include "groestl256.cl" -#include "lyra2.cl" -#include "keccak1600.cl" -#include "skein256.cl" - -#define SWAP4(x) as_uint(as_uchar4(x).wzyx) -#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) - -#if SPH_BIG_ENDIAN - #define DEC64E(x) (x) - #define DEC64BE(x) (*(const __global sph_u64 *) (x)); - #define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)); - #define DEC32LE(x) (*(const __global sph_u32 *) (x)); -#else - #define DEC64E(x) SWAP8(x) - #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); - #define DEC64LE(x) (*(const __global sph_u64 *) (x)); - #define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x)); -#endif - -typedef union { - unsigned char h1[64]; - uint h4[16]; - ulong h8[8]; -} hash_t; - -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search( - __global hash_t* hashes, - // precalc hash from fisrt part of message - const uint h0, - const uint h1, - const uint h2, - 
const uint h3, - const uint h4, - const uint h5, - const uint h6, - const uint h7, - // last 12 bytes of original message - const uint in16, - const uint in17, - const uint in18 -) - -{ - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - sph_u32 h[8]; - sph_u32 m[16]; - sph_u32 v[16]; - -h[0]=h0; -h[1]=h1; -h[2]=h2; -h[3]=h3; -h[4]=h4; -h[5]=h5; -h[6]=h6; -h[7]=h7; -// compress 2nd round - m[0] = in16; - m[1] = in17; - m[2] = in18; - m[3] = SWAP4(gid); - - for (int i = 4; i < 16; i++) {m[i] = c_Padding[i];} - - for (int i = 0; i < 8; i++) {v[i] = h[i];} - - v[8] = c_u256[0]; - v[9] = c_u256[1]; - v[10] = c_u256[2]; - v[11] = c_u256[3]; - v[12] = c_u256[4] ^ 640; - v[13] = c_u256[5] ^ 640; - v[14] = c_u256[6]; - v[15] = c_u256[7]; - - for (int r = 0; r < 14; r++) { - GS(0, 4, 0x8, 0xC, 0x0); - GS(1, 5, 0x9, 0xD, 0x2); - GS(2, 6, 0xA, 0xE, 0x4); - GS(3, 7, 0xB, 0xF, 0x6); - GS(0, 5, 0xA, 0xF, 0x8); - GS(1, 6, 0xB, 0xC, 0xA); - GS(2, 7, 0x8, 0xD, 0xC); - GS(3, 4, 0x9, 0xE, 0xE); - } - - for (int i = 0; i < 16; i++) { - int j = i & 7; - h[j] ^= v[i];} - -for (int i = 0; i < 8; i++) {hash->h4[i]=SWAP4(h[i]);} - -barrier(CLK_LOCAL_MEM_FENCE); - -} - -// keccak256 - -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search1(__global hash_t* hashes) -{ - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - sph_u64 keccak_gpu_state[25]; - - for (int i = 0; i < 25; i++) { - if (i < 4) { keccak_gpu_state[i] = hash->h8[i]; - } else { - keccak_gpu_state[i] = 0; - } - } - keccak_gpu_state[4] = 0x0000000000000001; - keccak_gpu_state[16] = 0x8000000000000000; - - keccak_block(keccak_gpu_state); - for (int i = 0; i < 4; i++) {hash->h8[i] = keccak_gpu_state[i];} -barrier(CLK_LOCAL_MEM_FENCE); - -} - -/// lyra2 algo - -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search2(__global hash_t* hashes) -{ - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - uint2 state[16]; - - for (int i = 0; i < 4; i++) { state[i] = as_uint2(hash->h8[i]);} //password - for (int i = 0; i < 4; i++) { state[i + 4] = state[i]; } //salt - for (int i = 0; i < 8; i++) { state[i + 8] = as_uint2(blake2b_IV[i]); } - - // blake2blyra x2 - - for (int i = 0; i < 24; i++) {round_lyra(state);} //because 12 is not enough - - __private uint2 Matrix[96][8]; // very uncool - /// reducedSqueezeRow0 - - for (int i = 0; i < 8; i++) - { - for (int j = 0; j<12; j++) {Matrix[j + 84 - 12 * i][0] = state[j];} - round_lyra(state); - } - - /// reducedSqueezeRow1 - - for (int i = 0; i < 8; i++) - { - for (int j = 0; j < 12; j++) {state[j] ^= Matrix[j + 12 * i][0];} - round_lyra(state); - for (int j = 0; j < 12; j++) {Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j];} - } - - reduceDuplexRowSetup(1, 0, 2); - reduceDuplexRowSetup(2, 1, 3); - reduceDuplexRowSetup(3, 0, 4); - reduceDuplexRowSetup(4, 3, 5); - reduceDuplexRowSetup(5, 2, 6); - reduceDuplexRowSetup(6, 1, 7); - - sph_u32 rowa; - rowa = state[0].x & 7; - - reduceDuplexRow(7, rowa, 0); - rowa = state[0].x & 7; - reduceDuplexRow(0, rowa, 3); - rowa = state[0].x & 7; - reduceDuplexRow(3, rowa, 6); - rowa = state[0].x & 7; - reduceDuplexRow(6, rowa, 1); - rowa = state[0].x & 7; - reduceDuplexRow(1, rowa, 4); - rowa = state[0].x & 7; - reduceDuplexRow(4, rowa, 7); - rowa = state[0].x & 7; - reduceDuplexRow(7, rowa, 2); - rowa = state[0].x & 7; - reduceDuplexRow(2, rowa, 5); - - absorbblock(rowa); - - for (int i = 0; i < 
4; i++) {hash->h8[i] = as_ulong(state[i]);} -barrier(CLK_LOCAL_MEM_FENCE); - -} - -//skein256 - -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search3(__global hash_t* hashes) -{ - uint gid = get_global_id(0); - __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); - - sph_u64 h[9]; - sph_u64 t[3]; - sph_u64 dt0, dt1, dt2, dt3; - sph_u64 p0, p1, p2, p3, p4, p5, p6, p7; - h[8] = skein_ks_parity; - - for (int i = 0; i < 8; i++) { - h[i] = SKEIN_IV512_256[i]; - h[8] ^= h[i];} - - t[0] = t12[0]; - t[1] = t12[1]; - t[2] = t12[2]; - - dt0 = hash->h8[0]; - dt1 = hash->h8[1]; - dt2 = hash->h8[2]; - dt3 = hash->h8[3]; - - p0 = h[0] + dt0; - p1 = h[1] + dt1; - p2 = h[2] + dt2; - p3 = h[3] + dt3; - p4 = h[4]; - p5 = h[5] + t[0]; - p6 = h[6] + t[1]; - p7 = h[7]; - - #pragma unroll - for (int i = 1; i < 19; i+=2) {Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i);} - p0 ^= dt0; - p1 ^= dt1; - p2 ^= dt2; - p3 ^= dt3; - - h[0] = p0; - h[1] = p1; - h[2] = p2; - h[3] = p3; - h[4] = p4; - h[5] = p5; - h[6] = p6; - h[7] = p7; - h[8] = skein_ks_parity; - - for (int i = 0; i < 8; i++) {h[8] ^= h[i];} - - t[0] = t12[3]; - t[1] = t12[4]; - t[2] = t12[5]; - p5 += t[0]; //p5 already equal h[5] - p6 += t[1]; - - #pragma unroll - for (int i = 1; i < 19; i+=2) {Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i);} - - hash->h8[0] = p0; - hash->h8[1] = p1; - hash->h8[2] = p2; - hash->h8[3] = p3; - barrier(CLK_LOCAL_MEM_FENCE); - -} - -__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) -__kernel void search4(__global hash_t* hashes, __global uint* output, const ulong target) -{ -// __local ulong T0[256], T1[256], T2[256], T3[256], T4[256], T5[256], T6[256], T7[256]; - // uint u = get_local_id(0); -/* -for (uint u = get_local_id(0); u < 256; u += get_local_size(0)) { - - T0[u] = T0_G[u]; - T1[u] = T1_G[u]; - T2[u] = T2_G[u]; - T3[u] = T3_G[u]; - T4[u] = T4_G[u]; - T5[u] = T5_G[u]; - T6[u] = T6_G[u]; - T7[u] = T7_G[u]; - } -barrier(CLK_LOCAL_MEM_FENCE); - - T1[u] = SPH_ROTL64(T0[u], 8UL); - T2[u] = SPH_ROTL64(T0[u], 16UL); - T3[u] = SPH_ROTL64(T0[u], 24UL); - T4[u] = SPH_ROTL64(T0[u], 32UL); - T5[u] = SPH_ROTL64(T0[u], 40UL); - T6[u] = SPH_ROTL64(T0[u], 48UL); - T7[u] = SPH_ROTL64(T0[u], 56UL); - -*/ - uint gid = get_global_id(0); - - __global hash_t *hash = &(hashes[gid - get_global_offset(0)]); - - __private ulong message[8], state[8]; - __private ulong t[8]; - - for (int u = 0; u < 4; u++) {message[u] = hash->h8[u];} - - message[4] = 0x80UL; - message[5] = 0UL; - message[6] = 0UL; - message[7] = 0x0100000000000000UL; - - for (int u = 0; u < 8; u++) {state[u] = message[u];} - state[7] ^= 0x0001000000000000UL; - - for (int r = 0; r < 10; r ++) {ROUND_SMALL_P(state, r);} - - state[7] ^= 0x0001000000000000UL; - - for (int r = 0; r < 10; r ++) {ROUND_SMALL_Q(message, r);} - - for (int u = 0; u < 8; u++) {state[u] ^= message[u];} - message[7] = state[7]; - - for (int r = 0; r < 9; r ++) {ROUND_SMALL_P(state, r);} - uchar8 State; - State.s0 = as_uchar8(state[7] ^ 0x79).s0; - State.s1 = as_uchar8(state[0] ^ 0x09).s1; - State.s2 = as_uchar8(state[1] ^ 0x19).s2; - State.s3 = as_uchar8(state[2] ^ 0x29).s3; - State.s4 = as_uchar8(state[3] ^ 0x39).s4; - State.s5 = as_uchar8(state[4] ^ 0x49).s5; - State.s6 = as_uchar8(state[5] ^ 0x59).s6; - State.s7 = as_uchar8(state[6] ^ 0x69).s7; - - state[7] = T0_G[State.s0] - ^ R64(T0_G[State.s1], 8) - ^ R64(T0_G[State.s2], 16) - ^ R64(T0_G[State.s3], 24) - ^ T4_G[State.s4] - ^ R64(T4_G[State.s5], 8) - ^ R64(T4_G[State.s6], 16) - ^ R64(T4_G[State.s7], 24) ^message[7]; - 
-// t[7] ^= message[7]; - barrier(CLK_LOCAL_MEM_FENCE); - - bool result = ( state[7] <= target); - if (result) { - output[atomic_inc(output + 0xFF)] = SWAP4(gid); - } -} - -#endif // LYRA2RE_CL From a458a61eb10a0880c35eba2d83e0fde9196ba72b Mon Sep 17 00:00:00 2001 From: badman74 Date: Tue, 14 Apr 2015 17:38:19 -0500 Subject: [PATCH 2/2] re-added lyra2re kernels --- kernel/lyra2.cl | 175 +++++++++++++++++ kernel/lyra2re.cl | 470 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 645 insertions(+) create mode 100644 kernel/lyra2.cl create mode 100644 kernel/lyra2re.cl diff --git a/kernel/lyra2.cl b/kernel/lyra2.cl new file mode 100644 index 000000000..334f6822f --- /dev/null +++ b/kernel/lyra2.cl @@ -0,0 +1,175 @@ +/* +* Lyra2 kernel implementation. +* +* ==========================(LICENSE BEGIN)============================ +* Copyright (c) 2014 djm34 +* +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+* +* ===========================(LICENSE END)============================= +* +* @author djm34 +*/ + +/*Blake2b IV Array*/ +__constant static const sph_u64 blake2b_IV[8] = +{ + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +/*Blake2b's rotation*/ + +static inline uint2 ror2(uint2 v, unsigned a) { + uint2 result; + unsigned n = 64 - a; + if (n == 32) { return (uint2)(v.y,v.x); } + if (n < 32) { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } + else { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return result; +} +static inline uint2 ror2l(uint2 v, unsigned a) { + uint2 result; + result.y = ((v.x << (32-a)) | (v.y >> (a))); + result.x = ((v.y << (32-a)) | (v.x >> (a))); + return result; +} +static inline uint2 ror2r(uint2 v, unsigned a) { + uint2 result; + result.y = ((v.y << (64-a)) | (v.x >> (a-32))); + result.x = ((v.x << (64-a)) | (v.y >> (a-32))); + return result; +} +/* +#define G(a,b,c,d) \ + do { \ +a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = d.yx; \ +c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2l(b, 24); \ +a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = ror2l(d, 16); \ +c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2r(b, 63); \ + } while(0) +*/ +#define G(a,b,c,d) \ + do { \ +a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = d.yx; \ +c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = as_uint2(as_uchar8(b).s34567012); \ +a = as_uint2(as_ulong(a)+as_ulong(b)); d ^= a; d = ror2l(d, 16); \ +c = as_uint2(as_ulong(c)+as_ulong(d)); b ^= c; b = ror2r(b, 63); \ + } while(0) + +/*One Round of the Blake2b's compression function*/ +#define round_lyra(v) \ + do { \ + G(v[ 0],v[ 4],v[ 8],v[12]); \ + G(v[ 1],v[ 5],v[ 9],v[13]); \ + G(v[ 2],v[ 6],v[10],v[14]); \ + G(v[ 3],v[ 7],v[11],v[15]); \ + G(v[ 0],v[ 5],v[10],v[15]); \ + G(v[ 1],v[ 6],v[11],v[12]); \ + G(v[ 2],v[ 7],v[ 8],v[13]); \ + G(v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) + + +#define reduceDuplexRowSetup(rowIn, rowInOut, rowOut) \ + { \ + for (int i = 0; i < 8; i++) \ + { \ +\ + for (int j = 0; j < 12; j++) {state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) + as_ulong(Matrix[12 * i + j][rowInOut]));} \ + round_lyra(state); \ + for (int j = 0; j < 12; j++) {Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j];} \ +\ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ + Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ + } \ + \ + } + +#define reduceDuplexRow(rowIn, rowInOut, rowOut) \ + { \ + for (int i = 0; i < 8; i++) \ + { \ + for (int j = 0; j < 12; j++) \ + state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) + as_ulong(Matrix[12 * i + j][rowInOut])); \ + \ + round_lyra(state); \ + for (int j = 0; j < 12; j++) {Matrix[j + 12 * i][rowOut] ^= state[j];} \ +\ + Matrix[0 + 12 * i][rowInOut] ^= state[11]; \ + Matrix[1 + 12 * i][rowInOut] ^= state[0]; \ + Matrix[2 + 12 * 
i][rowInOut] ^= state[1]; \ + Matrix[3 + 12 * i][rowInOut] ^= state[2]; \ + Matrix[4 + 12 * i][rowInOut] ^= state[3]; \ + Matrix[5 + 12 * i][rowInOut] ^= state[4]; \ + Matrix[6 + 12 * i][rowInOut] ^= state[5]; \ + Matrix[7 + 12 * i][rowInOut] ^= state[6]; \ + Matrix[8 + 12 * i][rowInOut] ^= state[7]; \ + Matrix[9 + 12 * i][rowInOut] ^= state[8]; \ + Matrix[10 + 12 * i][rowInOut] ^= state[9]; \ + Matrix[11 + 12 * i][rowInOut] ^= state[10]; \ + } \ + \ + } +#define absorbblock(in) { \ + state[0] ^= Matrix[0][in]; \ + state[1] ^= Matrix[1][in]; \ + state[2] ^= Matrix[2][in]; \ + state[3] ^= Matrix[3][in]; \ + state[4] ^= Matrix[4][in]; \ + state[5] ^= Matrix[5][in]; \ + state[6] ^= Matrix[6][in]; \ + state[7] ^= Matrix[7][in]; \ + state[8] ^= Matrix[8][in]; \ + state[9] ^= Matrix[9][in]; \ + state[10] ^= Matrix[10][in]; \ + state[11] ^= Matrix[11][in]; \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + round_lyra(state); \ + } diff --git a/kernel/lyra2re.cl b/kernel/lyra2re.cl new file mode 100644 index 000000000..923a9dfa6 --- /dev/null +++ b/kernel/lyra2re.cl @@ -0,0 +1,470 @@ +/* + * Lyra2RE kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * Copyright (c) 2014 djm34 + * Copyright (c) 2014 James Lovejoy + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author djm34 + */ + +#pragma OPENCL EXTENSION cl_amd_printf : enable + +#ifndef LYRA2RE_CL +#define LYRA2RE_CL + +#if __ENDIAN_LITTLE__ +#define SPH_LITTLE_ENDIAN 1 +#else +#define SPH_BIG_ENDIAN 1 +#endif + +#define SPH_UPTR sph_u64 + +typedef unsigned int sph_u32; +typedef int sph_s32; +#ifndef __OPENCL_VERSION__ +typedef unsigned long long sph_u64; +typedef long long sph_s64; +#else +typedef unsigned long sph_u64; +typedef long sph_s64; +#endif + +#define SPH_64 1 +#define SPH_64_TRUE 1 + +#define SPH_C32(x) ((sph_u32)(x ## U)) +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) + +#define SPH_C64(x) ((sph_u64)(x ## UL)) +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) + +//#define SPH_ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +//#define SPH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +//#define SPH_ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +//#define SPH_ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + +#define SPH_ROTL32(x,n) rotate(x,(uint)n) //faster with driver 14.6 +#define SPH_ROTR32(x,n) rotate(x,(uint)(32-n)) +#define SPH_ROTL64(x,n) rotate(x,(ulong)n) +//#define SPH_ROTR64(x,n) rotate(x,(ulong)(64-n)) + +/* +inline ulong rol64 (ulong l,ulong n) { + if (n<=32) { + uint2 t = rotate(as_uint2(l), (n)); + return as_ulong((uint2)(bitselect(t.s0, t.s1, (uint)(1 << (n)) - 1), bitselect(t.s0, t.s1, (uint)(~((1 << (n)) - 1))))); + } else { + uint2 t = rotate(as_uint2(l), (n - 32)); + return as_ulong((uint2)(bitselect(t.s1, t.s0, (uint)(1 << (n - 32)) - 1), bitselect(t.s1, t.s0, (uint)(~((1 << (n - 32)) - 1))))); + } +} +*/ + +/* +static inline ulong rol64(const ulong vw, unsigned n) { + uint2 result; + uint2 v=as_uint2(vw); + if (n == 32) { return as_ulong((uint2)(v.y, v.x)); } + if (n < 32) { + result.y = ( (v.y << (n)) | (v.x >> (32 - n)) ); + result.x = ( (v.x << (n)) | (v.y >> (32 - n)) ); + } + else { + result.y = ( (v.x << (n - 32)) | (v.y >> (64 - n)) ); + result.x = ( (v.y << (n - 32)) | (v.x >> (64 - n)) ); + } + return as_ulong(result); +} +*/ + +static inline sph_u64 ror64(sph_u64 vw, unsigned a) { + uint2 result; + uint2 v = as_uint2(vw); + unsigned n = (unsigned)(64 - a); + if (n == 32) { return as_ulong((uint2)(v.y, v.x)); } + if (n < 32) { + result.y = ((v.y << (n)) | (v.x >> (32 - n))); + result.x = ((v.x << (n)) | (v.y >> (32 - n))); + } else { + result.y = ((v.x << (n - 32)) | (v.y >> (64 - n))); + result.x = ((v.y << (n - 32)) | (v.x >> (64 - n))); + } + return as_ulong(result); +} + +#define SPH_ROTR64(l,n) ror64(l, n) + +#include "blake256.cl" +#include "groestl256.cl" +#include "lyra2.cl" +#include "keccak1600.cl" +#include "skein256.cl" + +#define SWAP4(x) as_uint(as_uchar4(x).wzyx) +#define SWAP8(x) as_ulong(as_uchar8(x).s76543210) + +#if SPH_BIG_ENDIAN + #define DEC64E(x) (x) + #define DEC64BE(x) (*(const __global sph_u64 *) (x)); + #define DEC64LE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC32LE(x) (*(const __global sph_u32 *) (x)); +#else + #define DEC64E(x) SWAP8(x) + #define DEC64BE(x) SWAP8(*(const __global sph_u64 *) (x)); + #define DEC64LE(x) (*(const __global sph_u64 *) (x)); + #define DEC32LE(x) SWAP4(*(const __global sph_u32 *) (x)); +#endif + +typedef union { + unsigned char h1[64]; + uint h4[16]; + ulong h8[8]; +} hash_t; + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search( + __global hash_t* hashes, + // precalc hash from fisrt part of message + const uint h0, + const uint h1, + const uint h2, + 
const uint h3, + const uint h4, + const uint h5, + const uint h6, + const uint h7, + // last 12 bytes of original message + const uint in16, + const uint in17, + const uint in18 +) + +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + sph_u32 h[8]; + sph_u32 m[16]; + sph_u32 v[16]; + +h[0]=h0; +h[1]=h1; +h[2]=h2; +h[3]=h3; +h[4]=h4; +h[5]=h5; +h[6]=h6; +h[7]=h7; +// compress 2nd round + m[0] = in16; + m[1] = in17; + m[2] = in18; + m[3] = SWAP4(gid); + + for (int i = 4; i < 16; i++) {m[i] = c_Padding[i];} + + for (int i = 0; i < 8; i++) {v[i] = h[i];} + + v[8] = c_u256[0]; + v[9] = c_u256[1]; + v[10] = c_u256[2]; + v[11] = c_u256[3]; + v[12] = c_u256[4] ^ 640; + v[13] = c_u256[5] ^ 640; + v[14] = c_u256[6]; + v[15] = c_u256[7]; + + for (int r = 0; r < 14; r++) { + GS(0, 4, 0x8, 0xC, 0x0); + GS(1, 5, 0x9, 0xD, 0x2); + GS(2, 6, 0xA, 0xE, 0x4); + GS(3, 7, 0xB, 0xF, 0x6); + GS(0, 5, 0xA, 0xF, 0x8); + GS(1, 6, 0xB, 0xC, 0xA); + GS(2, 7, 0x8, 0xD, 0xC); + GS(3, 4, 0x9, 0xE, 0xE); + } + + for (int i = 0; i < 16; i++) { + int j = i & 7; + h[j] ^= v[i];} + +for (int i = 0; i < 8; i++) {hash->h4[i]=SWAP4(h[i]);} + +barrier(CLK_LOCAL_MEM_FENCE); + +} + +// keccak256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + sph_u64 keccak_gpu_state[25]; + + for (int i = 0; i < 25; i++) { + if (i < 4) { keccak_gpu_state[i] = hash->h8[i]; + } else { + keccak_gpu_state[i] = 0; + } + } + keccak_gpu_state[4] = 0x0000000000000001; + keccak_gpu_state[16] = 0x8000000000000000; + + keccak_block(keccak_gpu_state); + for (int i = 0; i < 4; i++) {hash->h8[i] = keccak_gpu_state[i];} +barrier(CLK_LOCAL_MEM_FENCE); + +} + +/// lyra2 algo + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + uint2 state[16]; + + for (int i = 0; i < 4; i++) { state[i] = as_uint2(hash->h8[i]);} //password + for (int i = 0; i < 4; i++) { state[i + 4] = state[i]; } //salt + for (int i = 0; i < 8; i++) { state[i + 8] = as_uint2(blake2b_IV[i]); } + + // blake2blyra x2 + + for (int i = 0; i < 24; i++) {round_lyra(state);} //because 12 is not enough + + __private uint2 Matrix[96][8]; // very uncool + /// reducedSqueezeRow0 + + for (int i = 0; i < 8; i++) + { + for (int j = 0; j<12; j++) {Matrix[j + 84 - 12 * i][0] = state[j];} + round_lyra(state); + } + + /// reducedSqueezeRow1 + + for (int i = 0; i < 8; i++) + { + for (int j = 0; j < 12; j++) {state[j] ^= Matrix[j + 12 * i][0];} + round_lyra(state); + for (int j = 0; j < 12; j++) {Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j];} + } + + reduceDuplexRowSetup(1, 0, 2); + reduceDuplexRowSetup(2, 1, 3); + reduceDuplexRowSetup(3, 0, 4); + reduceDuplexRowSetup(4, 3, 5); + reduceDuplexRowSetup(5, 2, 6); + reduceDuplexRowSetup(6, 1, 7); + + sph_u32 rowa; + rowa = state[0].x & 7; + + reduceDuplexRow(7, rowa, 0); + rowa = state[0].x & 7; + reduceDuplexRow(0, rowa, 3); + rowa = state[0].x & 7; + reduceDuplexRow(3, rowa, 6); + rowa = state[0].x & 7; + reduceDuplexRow(6, rowa, 1); + rowa = state[0].x & 7; + reduceDuplexRow(1, rowa, 4); + rowa = state[0].x & 7; + reduceDuplexRow(4, rowa, 7); + rowa = state[0].x & 7; + reduceDuplexRow(7, rowa, 2); + rowa = state[0].x & 7; + reduceDuplexRow(2, rowa, 5); + + absorbblock(rowa); + + for (int i = 0; i < 
4; i++) {hash->h8[i] = as_ulong(state[i]);} +barrier(CLK_LOCAL_MEM_FENCE); + +} + +//skein256 + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global hash_t* hashes) +{ + uint gid = get_global_id(0); + __global hash_t *hash = &(hashes[gid-get_global_offset(0)]); + + sph_u64 h[9]; + sph_u64 t[3]; + sph_u64 dt0, dt1, dt2, dt3; + sph_u64 p0, p1, p2, p3, p4, p5, p6, p7; + h[8] = skein_ks_parity; + + for (int i = 0; i < 8; i++) { + h[i] = SKEIN_IV512_256[i]; + h[8] ^= h[i];} + + t[0] = t12[0]; + t[1] = t12[1]; + t[2] = t12[2]; + + dt0 = hash->h8[0]; + dt1 = hash->h8[1]; + dt2 = hash->h8[2]; + dt3 = hash->h8[3]; + + p0 = h[0] + dt0; + p1 = h[1] + dt1; + p2 = h[2] + dt2; + p3 = h[3] + dt3; + p4 = h[4]; + p5 = h[5] + t[0]; + p6 = h[6] + t[1]; + p7 = h[7]; + + #pragma unroll + for (int i = 1; i < 19; i+=2) {Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i);} + p0 ^= dt0; + p1 ^= dt1; + p2 ^= dt2; + p3 ^= dt3; + + h[0] = p0; + h[1] = p1; + h[2] = p2; + h[3] = p3; + h[4] = p4; + h[5] = p5; + h[6] = p6; + h[7] = p7; + h[8] = skein_ks_parity; + + for (int i = 0; i < 8; i++) {h[8] ^= h[i];} + + t[0] = t12[3]; + t[1] = t12[4]; + t[2] = t12[5]; + p5 += t[0]; //p5 already equal h[5] + p6 += t[1]; + + #pragma unroll + for (int i = 1; i < 19; i+=2) {Round_8_512(p0, p1, p2, p3, p4, p5, p6, p7, i);} + + hash->h8[0] = p0; + hash->h8[1] = p1; + hash->h8[2] = p2; + hash->h8[3] = p3; + barrier(CLK_LOCAL_MEM_FENCE); + +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search4(__global hash_t* hashes, __global uint* output, const ulong target) +{ +// __local ulong T0[256], T1[256], T2[256], T3[256], T4[256], T5[256], T6[256], T7[256]; + // uint u = get_local_id(0); +/* +for (uint u = get_local_id(0); u < 256; u += get_local_size(0)) { + + T0[u] = T0_G[u]; + T1[u] = T1_G[u]; + T2[u] = T2_G[u]; + T3[u] = T3_G[u]; + T4[u] = T4_G[u]; + T5[u] = T5_G[u]; + T6[u] = T6_G[u]; + T7[u] = T7_G[u]; + } +barrier(CLK_LOCAL_MEM_FENCE); + + T1[u] = SPH_ROTL64(T0[u], 8UL); + T2[u] = SPH_ROTL64(T0[u], 16UL); + T3[u] = SPH_ROTL64(T0[u], 24UL); + T4[u] = SPH_ROTL64(T0[u], 32UL); + T5[u] = SPH_ROTL64(T0[u], 40UL); + T6[u] = SPH_ROTL64(T0[u], 48UL); + T7[u] = SPH_ROTL64(T0[u], 56UL); + +*/ + uint gid = get_global_id(0); + + __global hash_t *hash = &(hashes[gid - get_global_offset(0)]); + + __private ulong message[8], state[8]; + __private ulong t[8]; + + for (int u = 0; u < 4; u++) {message[u] = hash->h8[u];} + + message[4] = 0x80UL; + message[5] = 0UL; + message[6] = 0UL; + message[7] = 0x0100000000000000UL; + + for (int u = 0; u < 8; u++) {state[u] = message[u];} + state[7] ^= 0x0001000000000000UL; + + for (int r = 0; r < 10; r ++) {ROUND_SMALL_P(state, r);} + + state[7] ^= 0x0001000000000000UL; + + for (int r = 0; r < 10; r ++) {ROUND_SMALL_Q(message, r);} + + for (int u = 0; u < 8; u++) {state[u] ^= message[u];} + message[7] = state[7]; + + for (int r = 0; r < 9; r ++) {ROUND_SMALL_P(state, r);} + uchar8 State; + State.s0 = as_uchar8(state[7] ^ 0x79).s0; + State.s1 = as_uchar8(state[0] ^ 0x09).s1; + State.s2 = as_uchar8(state[1] ^ 0x19).s2; + State.s3 = as_uchar8(state[2] ^ 0x29).s3; + State.s4 = as_uchar8(state[3] ^ 0x39).s4; + State.s5 = as_uchar8(state[4] ^ 0x49).s5; + State.s6 = as_uchar8(state[5] ^ 0x59).s6; + State.s7 = as_uchar8(state[6] ^ 0x69).s7; + + state[7] = T0_G[State.s0] + ^ R64(T0_G[State.s1], 8) + ^ R64(T0_G[State.s2], 16) + ^ R64(T0_G[State.s3], 24) + ^ T4_G[State.s4] + ^ R64(T4_G[State.s5], 8) + ^ R64(T4_G[State.s6], 16) + ^ R64(T4_G[State.s7], 24) ^message[7]; + 
+// t[7] ^= message[7]; + barrier(CLK_LOCAL_MEM_FENCE); + + bool result = ( state[7] <= target); + if (result) { + output[atomic_inc(output + 0xFF)] = SWAP4(gid); + } +} + +#endif // LYRA2RE_CL
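
Implementation notes on the kernels above. Everything below is an
illustrative sketch against the re-added files, not part of either diff.

The 64-bit rotations in lyra2.cl are built from 32-bit halves, since a
uint2 pair can map better onto 32-bit GPU ALUs than a native 64-bit rotate:
ror2 handles any rotation amount, ror2l assumes an amount below 32, and
ror2r assumes one above 32. A minimal self-test kernel, assuming the
helpers from lyra2.cl are in scope (the kernel name check_ror2 is made up
for illustration):

static inline uint2 ror2_ref(uint2 v, unsigned a) {
    // reference: a right-rotate by a is OpenCL's left-rotate by 64 - a
    return as_uint2(rotate(as_ulong(v), (ulong)(64 - a)));
}

__kernel void check_ror2(__global const ulong *in, __global int *ok) {
    size_t i = get_global_id(0);
    uint2 v = as_uint2(in[i]);
    int good = 1;
    for (unsigned a = 1; a < 64; a++) {
        uint2 r = ror2(v, a);      // helper under test
        uint2 e = ror2_ref(v, a);  // native reference
        good &= (r.x == e.x) & (r.y == e.y);
    }
    ok[i] = good;
}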
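
The active G macro avoids two of those rotations entirely: d = d.yx swaps
the 32-bit halves (a rotation by 32), and as_uint2(as_uchar8(b).s34567012)
permutes bytes so that, on a little-endian device, result byte j takes
source byte (j + 3) mod 8, which is exactly a right rotation by 24. A check
in the same spirit (kernel name again illustrative):

__kernel void check_swizzle_ror24(__global const ulong *in, __global int *ok) {
    size_t i = get_global_id(0);
    uint2 b = as_uint2(in[i]);
    uint2 via_swizzle = as_uint2(as_uchar8(b).s34567012);
    // a right-rotate by 24 is a left-rotate by 40
    uint2 via_rotate = as_uint2(rotate(in[i], 40UL));
    ok[i] = (via_swizzle.x == via_rotate.x) && (via_swizzle.y == via_rotate.y);
}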
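
In search2, Matrix is indexed [word][row]: 8 rows of 96 words (8 blocks of
12 ulong-sized words each) kept per work-item in private memory, which is
the footprint the "very uncool" comment laments. reduceDuplexRow absorbs
the word-wise 64-bit sum of rowIn and rowInOut, applies one round_lyra,
xors the state into rowOut, and feeds a rotated copy of the state back into
rowInOut; reduceDuplexRowSetup differs only in writing rowOut fresh and in
reverse block order (j + 84 - 12 * i). A function-form restatement of the
macro, for readability only (the kernel keeps the macro, presumably so the
row indices stay compile-time literals):

static void reduce_duplex_row(uint2 state[16], uint2 Matrix[96][8],
                              uint rowIn, uint rowInOut, uint rowOut)
{
    for (int i = 0; i < 8; i++) {
        // absorb: word-wise 64-bit addition of the two input blocks
        for (int j = 0; j < 12; j++)
            state[j] ^= as_uint2(as_ulong(Matrix[12 * i + j][rowIn]) +
                                 as_ulong(Matrix[12 * i + j][rowInOut]));
        round_lyra(state);  // one reduced-round duplexing step
        // squeeze: xor the state into the output row
        for (int j = 0; j < 12; j++)
            Matrix[12 * i + j][rowOut] ^= state[j];
        // feedback: xor a one-word-rotated state into rowInOut
        for (int j = 0; j < 12; j++)
            Matrix[12 * i + j][rowInOut] ^= state[(j + 11) % 12];
    }
}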
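
After the six reduceDuplexRowSetup calls fill rows 2 through 7, search2
runs Lyra2's wandering phase: eight duplexing steps in which the source and
destination rows follow a fixed walk while the second input row is taken
from the low bits of the live state, which is what makes the memory access
pattern data-dependent. The unrolled calls collapse to the loop below
(illustrative only; the constants mirror the call sequence 7->0, 0->3,
3->6, 6->1, 1->4, 4->7, 7->2, 2->5):

    uint rowIn = 7;
    for (int step = 0; step < 8; step++) {
        sph_u32 rowa = state[0].x & 7;       // data-dependent row, refreshed each step
        uint rowOut = (uint)(step * 3) & 7;  // 0, 3, 6, 1, 4, 7, 2, 5
        reduceDuplexRow(rowIn, rowa, rowOut);
        rowIn = rowOut;
    }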
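
The first search kernel only compresses the second Blake-256 block: the
host precomputes the midstate over the first 64 header bytes and passes it
as h0..h7, hands over the remaining 12 header bytes as in16..in18, and the
nonce comes from the global id; the ^ 640 on v[12] and v[13] is Blake-256's
bit counter for the full 80-byte header. Each of the five kernels is then
enqueued in turn over the same hashes buffer, with h8[0..3] of each hash_t
slot carrying the 32-byte intermediate between stages. A host-side
argument-setup sketch in plain C with the standard OpenCL API
(search_kernel, hashes_buf, midstate and tail are assumed names):

    /* midstate[8]: Blake-256 state after header bytes 0..63;
       tail[3]: header bytes 64..75, in the word order blake256.cl expects */
    cl_int err = clSetKernelArg(search_kernel, 0, sizeof(cl_mem), &hashes_buf);
    for (cl_uint i = 0; i < 8; i++)
        err |= clSetKernelArg(search_kernel, 1 + i, sizeof(cl_uint), &midstate[i]);
    for (cl_uint i = 0; i < 3; i++)
        err |= clSetKernelArg(search_kernel, 9 + i, sizeof(cl_uint), &tail[i]);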
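
search4, the Groestl-256 tail, never forms the full 32-byte digest: it
computes only state[7], the final 64-bit word, and compares it against the
64-bit share target, recording hits with a running count kept in slot 0xFF
of the output buffer. A matching host-side readback sketch (queue,
output_buf and submit_nonce are assumed names; note the kernel stores each
nonce byte-swapped via SWAP4):

    #define OUTPUT_SLOTS 0xFF
    cl_uint results[OUTPUT_SLOTS + 1];
    /* blocking read after the search4 launch completes */
    clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0, sizeof(results),
                        results, 0, NULL, NULL);
    cl_uint found = results[OUTPUT_SLOTS];
    for (cl_uint i = 0; i < found && i < OUTPUT_SLOTS; i++)
        submit_nonce(results[i]);  /* hypothetical submit helper */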