Blur implementation

Hello,
I have some issues trying to implement a fast blur with Halide. I am trying to match a hand-written code with rotation of vertical reductions (the code will be below and is explained in [this paper](https://hal.inria.fr/hal-01094906)), without success. The default inline halide code was surprisingly faster than both my hand-written inline and rotation of vertical reductions. So I tried to compile the Halide generated C code instead of using the generated object file and now it was slower than both of my hand-written versions.

How is the generated object file that much faster than `-Ofast -march=native` ?
Can I get the rotation of vertical reductions scheduling with Halide ?

Codegen:
```cpp
const Type real = Float(32);

struct Kernel {
    const char* name;
    Func f;
    std::vector<Argument> args;
};

Kernel blur_kernel(const Target& target) {
    ImageParam input(real, 2, "input");

    Func clamped_input = Halide::BoundaryConditions::repeat_edge(input);
    Func vertical("vertical"), horizontal("horizontal"), blur("blur");
    Var x("x"), y("y");

    vertical(x, y) = clamped_input(x, y-1) + 2*clamped_input(x, y) + clamped_input(x, y+1);
    horizontal(x, y) = vertical(x-1, y) + 2*vertical(x, y) + vertical(x+1, y);
    blur(x, y) = horizontal(x, y) * 1.f/16.f;

    ////

    auto vsize = target.natural_vector_size(real);
    input.set_host_alignment(vsize);
    input.dim(0).set_stride(1);
    // input.dim(0).set_min(0);
    // input.dim(1).set_min(0);

    // I tried various schedulings, the following seemed close to what I want but is incredibly slow:
    // vertical.store_at(blur, y).compute_at(blur, x);

    return {
        .name = "blur",
        .f = blur,
        .args = { input }
    };
}

void generate_kernel(Kernel& k, const Target& target, const std::string& gendir) {
    std::cout << "generating " << k.name << std::endl;
    k.f.print_loop_nest();
    k.f.compile_to_file(gendir + k.name, k.args, k.name, target);
    k.f.compile_to_lowered_stmt(gendir + k.name + ".txt", k.args, Text, target);
    k.f.compile_to_c(gendir + k.name + ".c", k.args, k.name, target);
    k.f.compile_to_assembly(gendir + k.name + ".asm", k.args, k.name, target);
}

int main(int argc, char** argv) {
    Target target = get_host_target()
        .with_feature(Target::NoRuntime);

    Kernel blur = blur_kernel(target);
    generate_kernel(blur, target, gendir);

    return EXIT_SUCCESS;
}
```

Benchmark:
```cpp
template<typename F, typename TS, typename TE>
double time_ms(int sample_count, int batch_count, F f,
               TS time_start, TE time_elapsed) {
    double min_elapsed;

    for (int s = 0; s < sample_count; s++) {
        time_start();

        for (int b = 0; b < batch_count; b++) {
            f();
        }

        double elapsed = time_elapsed();
        if (s == 0 || elapsed < min_elapsed) {
            min_elapsed = elapsed;
        }
    }
    
    return min_elapsed / (double)(batch_count);
}

template<typename F>
double cpu_time_ms(int sample_count, int batch_count, F f) {
    struct timespec t1, t2;
    auto time_start = [&] {
        clock_gettime(CLOCK_MONOTONIC, &t1);
    };
    auto time_elapsed = [&] {
        clock_gettime(CLOCK_MONOTONIC, &t2);

        return (double)(t2.tv_sec - t1.tv_sec) * 1000. +
            (double)(t2.tv_nsec - t1.tv_nsec) / 1000000.;
    };
    return time_ms(sample_count, batch_count, f, time_start, time_elapsed);
}

const int size = 2000;
const int sample_count = 3;
const int batch_count = 100;

template<typename F>
void bench(const char* name, F f) {
    double time = cpu_time_ms(sample_count, batch_count, f);
    double nspp = (time * 1e6) / (size * size);
    printf("%s time: %.4lf ms, %.4lf ns/px\n", name, time, nspp);
}

int main(int argc, char** argv) {
    Halide::Runtime::Buffer<float> i0(size, size);
    Halide::Runtime::Buffer<float> i1(size, size);
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            i0(x, y) = y + x;
        }
    }

    bench("blur", [&] {
        blur(i0, i1);
        i1.device_sync();
    });

    float* __restrict a = (float*) malloc(size * size * sizeof(float));
    float* __restrict b = (float*) malloc(size * size * sizeof(float));
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            a[y*size + x] = y + x;
        }
    }

    bench("blur-inline", [&] {
        int r0 = 0*size;
        for (int y = 0; y < (size-1); y++) {
            int r1 = y*size;
            int r2 = (y+1)*size;

            int x0 = 0;
            for (int x = 0; x < (size-1); x++) {
                int x1 = x;
                int x2 = x+1;

                float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
                float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
                float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
                b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

                x0 = x1;
            }

            // last column
            int x1 = size-1;
            int x2 = x1;
            float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
            float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
            float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
            b[y*size + x1] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
        }

        // last row
        int y = size-1;
        int r1 = y*size;
        int r2 = r1;

        int x0 = 0;
        for (int x = 0; x < (size-1); x++) {
            int x1 = x;
            int x2 = x+1;

            float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
            float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
            float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
            b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

            x0 = x1;
        }

        // last column
        int x1 = size-1;
        int x2 = x1;
        float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
        float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
        float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
        b[y*size + x1] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
    });

    bench("blur-reduction", [&] {
        for (int y = 0; y < size; y++) {
            float* r0 = &a[std::max(0, y-1)*size];
            float* r1 = &a[y*size];
            float* r2 = &a[std::min(y+1, size-1)*size];

	    float v0 = r0[0] + 2.f*r1[0] + r2[0];
            float v1 = v0;
            for (int x = 0; x < (size-1); x++) {
                float v2 = r0[x+1] + 2.f*r1[x+1] + r2[x+1];
                b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
                
                v0 = v1;
                v1 = v2;
            }

            // last column
            float v2 = v1;
            b[y*size + (size-1)] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
        }
    });

    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            if (i1(x, y) != b[y*size + x]) {
                fprintf(stderr, "output mismatch (%i, %i) = %f or %f\n", x, y, i1(x, y), b[y*size + x]);
            }
        }
    }

    free(a);
    free(b);
    return EXIT_SUCCESS;
}
```
Running:
```sh
# hw=host, CC=clang++
mkdir -p $hw/bin &&
# uncomment to use the generated C++
# $CC $hw/gen/blur.c -c -o $hw/gen/blur.o -std=c++11 -Ofast -march=native &&
$CC src/bench.cpp $hw/gen/blur.o ../$hw/halide_runtime.a -Ofast -o $hw/bin/bench -I $hw/gen/ -I ../halide-headers/ -lpthread -ldl -lrt -std=c++11 -D_POSIX_C_SOURCE=200809 -march=native &&
./$hw/bin/bench
```

Intel i5 results:
- inline, halide object file: ~0.8 ns/px
- inline, halide c file: ~3.3 ns/px
- inline, hand-written: ~2 ns/px
- rotation of vertical reductions, hand-written: ~1 ns/px

Thank you for you help.
I also planned to add parallel tiles and vectorisation on top of this.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Blur implementation #2905

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Blur implementation #2905

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions