-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Description
Hello,
I have some issues trying to implement a fast blur with Halide. I am trying to match a hand-written code with rotation of vertical reductions (the code will be below and is explained in this paper), without success. The default inline halide code was surprisingly faster than both my hand-written inline and rotation of vertical reductions. So I tried to compile the Halide generated C code instead of using the generated object file and now it was slower than both of my hand-written versions.
How is the generated object file that much faster than -Ofast -march=native ?
Can I get the rotation of vertical reductions scheduling with Halide ?
Codegen:
const Type real = Float(32);
struct Kernel {
const char* name;
Func f;
std::vector<Argument> args;
};
Kernel blur_kernel(const Target& target) {
ImageParam input(real, 2, "input");
Func clamped_input = Halide::BoundaryConditions::repeat_edge(input);
Func vertical("vertical"), horizontal("horizontal"), blur("blur");
Var x("x"), y("y");
vertical(x, y) = clamped_input(x, y-1) + 2*clamped_input(x, y) + clamped_input(x, y+1);
horizontal(x, y) = vertical(x-1, y) + 2*vertical(x, y) + vertical(x+1, y);
blur(x, y) = horizontal(x, y) * 1.f/16.f;
////
auto vsize = target.natural_vector_size(real);
input.set_host_alignment(vsize);
input.dim(0).set_stride(1);
// input.dim(0).set_min(0);
// input.dim(1).set_min(0);
// I tried various schedulings, the following seemed close to what I want but is incredibly slow:
// vertical.store_at(blur, y).compute_at(blur, x);
return {
.name = "blur",
.f = blur,
.args = { input }
};
}
void generate_kernel(Kernel& k, const Target& target, const std::string& gendir) {
std::cout << "generating " << k.name << std::endl;
k.f.print_loop_nest();
k.f.compile_to_file(gendir + k.name, k.args, k.name, target);
k.f.compile_to_lowered_stmt(gendir + k.name + ".txt", k.args, Text, target);
k.f.compile_to_c(gendir + k.name + ".c", k.args, k.name, target);
k.f.compile_to_assembly(gendir + k.name + ".asm", k.args, k.name, target);
}
int main(int argc, char** argv) {
Target target = get_host_target()
.with_feature(Target::NoRuntime);
Kernel blur = blur_kernel(target);
generate_kernel(blur, target, gendir);
return EXIT_SUCCESS;
}Benchmark:
template<typename F, typename TS, typename TE>
double time_ms(int sample_count, int batch_count, F f,
TS time_start, TE time_elapsed) {
double min_elapsed;
for (int s = 0; s < sample_count; s++) {
time_start();
for (int b = 0; b < batch_count; b++) {
f();
}
double elapsed = time_elapsed();
if (s == 0 || elapsed < min_elapsed) {
min_elapsed = elapsed;
}
}
return min_elapsed / (double)(batch_count);
}
template<typename F>
double cpu_time_ms(int sample_count, int batch_count, F f) {
struct timespec t1, t2;
auto time_start = [&] {
clock_gettime(CLOCK_MONOTONIC, &t1);
};
auto time_elapsed = [&] {
clock_gettime(CLOCK_MONOTONIC, &t2);
return (double)(t2.tv_sec - t1.tv_sec) * 1000. +
(double)(t2.tv_nsec - t1.tv_nsec) / 1000000.;
};
return time_ms(sample_count, batch_count, f, time_start, time_elapsed);
}
const int size = 2000;
const int sample_count = 3;
const int batch_count = 100;
template<typename F>
void bench(const char* name, F f) {
double time = cpu_time_ms(sample_count, batch_count, f);
double nspp = (time * 1e6) / (size * size);
printf("%s time: %.4lf ms, %.4lf ns/px\n", name, time, nspp);
}
int main(int argc, char** argv) {
Halide::Runtime::Buffer<float> i0(size, size);
Halide::Runtime::Buffer<float> i1(size, size);
for (int y = 0; y < size; y++) {
for (int x = 0; x < size; x++) {
i0(x, y) = y + x;
}
}
bench("blur", [&] {
blur(i0, i1);
i1.device_sync();
});
float* __restrict a = (float*) malloc(size * size * sizeof(float));
float* __restrict b = (float*) malloc(size * size * sizeof(float));
for (int y = 0; y < size; y++) {
for (int x = 0; x < size; x++) {
a[y*size + x] = y + x;
}
}
bench("blur-inline", [&] {
int r0 = 0*size;
for (int y = 0; y < (size-1); y++) {
int r1 = y*size;
int r2 = (y+1)*size;
int x0 = 0;
for (int x = 0; x < (size-1); x++) {
int x1 = x;
int x2 = x+1;
float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
x0 = x1;
}
// last column
int x1 = size-1;
int x2 = x1;
float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
b[y*size + x1] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
}
// last row
int y = size-1;
int r1 = y*size;
int r2 = r1;
int x0 = 0;
for (int x = 0; x < (size-1); x++) {
int x1 = x;
int x2 = x+1;
float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
x0 = x1;
}
// last column
int x1 = size-1;
int x2 = x1;
float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
b[y*size + x1] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
});
bench("blur-reduction", [&] {
for (int y = 0; y < size; y++) {
float* r0 = &a[std::max(0, y-1)*size];
float* r1 = &a[y*size];
float* r2 = &a[std::min(y+1, size-1)*size];
float v0 = r0[0] + 2.f*r1[0] + r2[0];
float v1 = v0;
for (int x = 0; x < (size-1); x++) {
float v2 = r0[x+1] + 2.f*r1[x+1] + r2[x+1];
b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
v0 = v1;
v1 = v2;
}
// last column
float v2 = v1;
b[y*size + (size-1)] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
}
});
for (int y = 0; y < size; y++) {
for (int x = 0; x < size; x++) {
if (i1(x, y) != b[y*size + x]) {
fprintf(stderr, "output mismatch (%i, %i) = %f or %f\n", x, y, i1(x, y), b[y*size + x]);
}
}
}
free(a);
free(b);
return EXIT_SUCCESS;
}Running:
# hw=host, CC=clang++
mkdir -p $hw/bin &&
# uncomment to use the generated C++
# $CC $hw/gen/blur.c -c -o $hw/gen/blur.o -std=c++11 -Ofast -march=native &&
$CC src/bench.cpp $hw/gen/blur.o ../$hw/halide_runtime.a -Ofast -o $hw/bin/bench -I $hw/gen/ -I ../halide-headers/ -lpthread -ldl -lrt -std=c++11 -D_POSIX_C_SOURCE=200809 -march=native &&
./$hw/bin/benchIntel i5 results:
- inline, halide object file: ~0.8 ns/px
- inline, halide c file: ~3.3 ns/px
- inline, hand-written: ~2 ns/px
- rotation of vertical reductions, hand-written: ~1 ns/px
Thank you for you help.
I also planned to add parallel tiles and vectorisation on top of this.