-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathslice_rows_kernel.cu
More file actions
55 lines (41 loc) · 1.96 KB
/
slice_rows_kernel.cu
File metadata and controls
55 lines (41 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#include "slice_rows_kernel.h"
#include <stdio.h>
#define BLOCK_SIZE 32
/*
 * Copy rows [offset, offset+len) of every column of src into dst.
 *
 * Element (row, col) is read from src[col * n + row] and written to
 * dst[col * len + (row - offset)]: src is addressed as m columns of n
 * consecutive floats, dst as m columns of len consecutive floats.
 * Rows at or beyond n are never read. Only src indices below m*n are read
 * and only dst indices below m*len are written.
 *
 * Launch layout: 2D grid of 2D blocks, x walks columns and y walks rows.
 * Both dimensions advance by gridDim * blockDim, so any non-empty launch
 * configuration covers the whole matrix. No shared memory is used.
 *
 * NOTE: consecutive threadIdx.x values map to consecutive columns, so
 * neighbouring threads touch addresses n (src) / len (dst) floats apart.
 *
 * src     input array
 * dst     output array
 * m       number of columns
 * n       number of rows per column in src
 * offset  first source row to copy
 * len     number of rows to copy (rows per column in dst)
 */
__global__ void slice_rows_kernel (const float * __restrict__ src,
        float * __restrict__ dst, int m, int n, int offset, int len){
    /* Index math is done in 64 bits: col * n + row overflows int as soon as
       the matrix holds more than INT_MAX elements. */
    const long long row_step  = (long long)gridDim.y * blockDim.y;
    const long long col_step  = (long long)gridDim.x * blockDim.x;
    const long long row_first = (long long)blockIdx.y * blockDim.y + threadIdx.y;
    const long long col_first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    /* Exclusive upper bound of the rows to copy, clipped to the source height. */
    long long row_end = (long long)offset + len;
    if (row_end > n) row_end = n;

    for (long long row = row_first; row < row_end; row += row_step){
        /* rows in front of the slice are not copied */
        if (row < offset) continue;
        for (long long col = col_first; col < m; col += col_step){
            dst[col * len + (row - offset)] = src[col * n + row];
        }
    }
}
/*
 * Host-side launcher for slice_rows_kernel.
 *
 * Launches one thread per (row, col) element of the n-row, m-column source
 * using BLOCK_SIZE x BLOCK_SIZE blocks on the default stream, then blocks
 * until the device has finished. CUDA errors are reported on stderr; the
 * function itself returns nothing.
 *
 * src     device pointer passed through to the kernel as its input
 * dst     device pointer passed through to the kernel as its output
 * m       number of columns
 * n       number of rows per column in src
 * offset  first source row to copy
 * len     number of rows to copy
 */
void slice_rows_kernel_exec(const float *src, float *dst, int m, int n, int offset, int len){
    /* A grid dimension of zero is an invalid launch configuration, so the
       launch is skipped when there is nothing to copy. */
    if (m > 0 && n > 0 && len > 0){
        /* specified block and grid size (ceil-div so partial tiles are covered) */
        dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid((m+block.x-1)/block.x, (n+block.y-1)/block.y);
        //printf("m:%d n:%d offset:%d len:%d\n", m, n, offset, len);
        /* launch kernel */
        slice_rows_kernel<<<grid, block>>>(src, dst, m, n, offset, len);
        /* Peek rather than Get: report the error without clearing it, so a
           caller that inspects the CUDA error state afterwards still sees it. */
        cudaError_t launch_err = cudaPeekAtLastError();
        if (launch_err != cudaSuccess){
            fprintf(stderr, "slice_rows_kernel_exec: CUDA error after launch: %s\n",
                    cudaGetErrorString(launch_err));
        }
    }
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
       replacement and also surfaces faults raised while the kernel ran. */
    cudaError_t sync_err = cudaDeviceSynchronize();
    if (sync_err != cudaSuccess){
        fprintf(stderr, "slice_rows_kernel_exec: CUDA error at synchronize: %s\n",
                cudaGetErrorString(sync_err));
    }
}
/*
 * Write a len-row slice back into rows [offset, offset+len) of every column
 * of dst (the inverse data movement of slice_rows_kernel).
 *
 * Element (row, col) is read from src[col * len + (row - offset)] and
 * written to dst[col * n + row]: src is addressed as m columns of len
 * consecutive floats, dst as m columns of n consecutive floats.
 * Rows at or beyond n are never written, and dst rows outside the slice are
 * left untouched. Only src indices below m*len are read and only dst
 * indices below m*n are written.
 *
 * Launch layout: 2D grid of 2D blocks, x walks columns and y walks rows.
 * Both dimensions advance by gridDim * blockDim, so any non-empty launch
 * configuration covers the whole matrix. No shared memory is used.
 *
 * NOTE: consecutive threadIdx.x values map to consecutive columns, so
 * neighbouring threads touch addresses len (src) / n (dst) floats apart.
 *
 * src     input array holding the slice
 * dst     output array
 * m       number of columns
 * n       number of rows per column in dst
 * offset  first destination row to overwrite
 * len     number of rows to write (rows per column in src)
 */
__global__ void join_rows_kernel (const float * __restrict__ src,
        float * __restrict__ dst, int m, int n, int offset, int len){
    /* Index math is done in 64 bits: col * n + row overflows int as soon as
       the matrix holds more than INT_MAX elements. */
    const long long row_step  = (long long)gridDim.y * blockDim.y;
    const long long col_step  = (long long)gridDim.x * blockDim.x;
    const long long row_first = (long long)blockIdx.y * blockDim.y + threadIdx.y;
    const long long col_first = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    /* Exclusive upper bound of the rows to write, clipped to the destination height. */
    long long row_end = (long long)offset + len;
    if (row_end > n) row_end = n;

    for (long long row = row_first; row < row_end; row += row_step){
        /* rows in front of the slice are not written */
        if (row < offset) continue;
        for (long long col = col_first; col < m; col += col_step){
            dst[col * n + row] = src[col * len + (row - offset)];
        }
    }
}
/*
 * Host-side launcher for join_rows_kernel.
 *
 * Launches one thread per (row, col) element of the n-row, m-column
 * destination using BLOCK_SIZE x BLOCK_SIZE blocks on the default stream,
 * then blocks until the device has finished. CUDA errors are reported on
 * stderr; the function itself returns nothing.
 *
 * src     device pointer passed through to the kernel as its input
 * dst     device pointer passed through to the kernel as its output
 * m       number of columns
 * n       number of rows per column in dst
 * offset  first destination row to overwrite
 * len     number of rows to write
 */
void join_rows_kernel_exec(const float *src, float *dst, int m, int n, int offset, int len){
    /* A grid dimension of zero is an invalid launch configuration, so the
       launch is skipped when there is nothing to copy. */
    if (m > 0 && n > 0 && len > 0){
        /* specified block and grid size (ceil-div so partial tiles are covered) */
        dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid((m+block.x-1)/block.x, (n+block.y-1)/block.y);
        //printf("m:%d n:%d offset:%d len:%d\n", m, n, offset, len);
        /* launch kernel */
        join_rows_kernel<<<grid, block>>>(src, dst, m, n, offset, len);
        /* Peek rather than Get: report the error without clearing it, so a
           caller that inspects the CUDA error state afterwards still sees it. */
        cudaError_t launch_err = cudaPeekAtLastError();
        if (launch_err != cudaSuccess){
            fprintf(stderr, "join_rows_kernel_exec: CUDA error after launch: %s\n",
                    cudaGetErrorString(launch_err));
        }
    }
    /* cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
       replacement and also surfaces faults raised while the kernel ran. */
    cudaError_t sync_err = cudaDeviceSynchronize();
    if (sync_err != cudaSuccess){
        fprintf(stderr, "join_rows_kernel_exec: CUDA error at synchronize: %s\n",
                cudaGetErrorString(sync_err));
    }
}