Skip to content

Blur implementation #2905

@Bastacyclop

Description

@Bastacyclop

Hello,
I have some issues trying to implement a fast blur with Halide. I am trying to match a hand-written code with rotation of vertical reductions (the code will be below and is explained in this paper), without success. The default inline halide code was surprisingly faster than both my hand-written inline and rotation of vertical reductions. So I tried to compile the Halide generated C code instead of using the generated object file and now it was slower than both of my hand-written versions.

How is the generated object file that much faster than -Ofast -march=native ?
Can I get the rotation of vertical reductions scheduling with Halide ?

Codegen:

const Type real = Float(32);

struct Kernel {
    const char* name;
    Func f;
    std::vector<Argument> args;
};

Kernel blur_kernel(const Target& target) {
    ImageParam input(real, 2, "input");

    Func clamped_input = Halide::BoundaryConditions::repeat_edge(input);
    Func vertical("vertical"), horizontal("horizontal"), blur("blur");
    Var x("x"), y("y");

    vertical(x, y) = clamped_input(x, y-1) + 2*clamped_input(x, y) + clamped_input(x, y+1);
    horizontal(x, y) = vertical(x-1, y) + 2*vertical(x, y) + vertical(x+1, y);
    blur(x, y) = horizontal(x, y) * 1.f/16.f;

    ////

    auto vsize = target.natural_vector_size(real);
    input.set_host_alignment(vsize);
    input.dim(0).set_stride(1);
    // input.dim(0).set_min(0);
    // input.dim(1).set_min(0);

    // I tried various schedulings, the following seemed close to what I want but is incredibly slow:
    // vertical.store_at(blur, y).compute_at(blur, x);

    return {
        .name = "blur",
        .f = blur,
        .args = { input }
    };
}

void generate_kernel(Kernel& k, const Target& target, const std::string& gendir) {
    std::cout << "generating " << k.name << std::endl;
    k.f.print_loop_nest();
    k.f.compile_to_file(gendir + k.name, k.args, k.name, target);
    k.f.compile_to_lowered_stmt(gendir + k.name + ".txt", k.args, Text, target);
    k.f.compile_to_c(gendir + k.name + ".c", k.args, k.name, target);
    k.f.compile_to_assembly(gendir + k.name + ".asm", k.args, k.name, target);
}

int main(int argc, char** argv) {
    Target target = get_host_target()
        .with_feature(Target::NoRuntime);

    Kernel blur = blur_kernel(target);
    generate_kernel(blur, target, gendir);

    return EXIT_SUCCESS;
}

Benchmark:

template<typename F, typename TS, typename TE>
double time_ms(int sample_count, int batch_count, F f,
               TS time_start, TE time_elapsed) {
    double min_elapsed;

    for (int s = 0; s < sample_count; s++) {
        time_start();

        for (int b = 0; b < batch_count; b++) {
            f();
        }

        double elapsed = time_elapsed();
        if (s == 0 || elapsed < min_elapsed) {
            min_elapsed = elapsed;
        }
    }
    
    return min_elapsed / (double)(batch_count);
}

template<typename F>
double cpu_time_ms(int sample_count, int batch_count, F f) {
    struct timespec t1, t2;
    auto time_start = [&] {
        clock_gettime(CLOCK_MONOTONIC, &t1);
    };
    auto time_elapsed = [&] {
        clock_gettime(CLOCK_MONOTONIC, &t2);

        return (double)(t2.tv_sec - t1.tv_sec) * 1000. +
            (double)(t2.tv_nsec - t1.tv_nsec) / 1000000.;
    };
    return time_ms(sample_count, batch_count, f, time_start, time_elapsed);
}

const int size = 2000;
const int sample_count = 3;
const int batch_count = 100;

template<typename F>
void bench(const char* name, F f) {
    double time = cpu_time_ms(sample_count, batch_count, f);
    double nspp = (time * 1e6) / (size * size);
    printf("%s time: %.4lf ms, %.4lf ns/px\n", name, time, nspp);
}

int main(int argc, char** argv) {
    Halide::Runtime::Buffer<float> i0(size, size);
    Halide::Runtime::Buffer<float> i1(size, size);
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            i0(x, y) = y + x;
        }
    }

    bench("blur", [&] {
        blur(i0, i1);
        i1.device_sync();
    });

    float* __restrict a = (float*) malloc(size * size * sizeof(float));
    float* __restrict b = (float*) malloc(size * size * sizeof(float));
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            a[y*size + x] = y + x;
        }
    }

    bench("blur-inline", [&] {
        int r0 = 0*size;
        for (int y = 0; y < (size-1); y++) {
            int r1 = y*size;
            int r2 = (y+1)*size;

            int x0 = 0;
            for (int x = 0; x < (size-1); x++) {
                int x1 = x;
                int x2 = x+1;

                float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
                float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
                float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
                b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

                x0 = x1;
            }

            // last column
            int x1 = size-1;
            int x2 = x1;
            float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
            float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
            float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
            b[y*size + x1] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
        }

        // last row
        int y = size-1;
        int r1 = y*size;
        int r2 = r1;

        int x0 = 0;
        for (int x = 0; x < (size-1); x++) {
            int x1 = x;
            int x2 = x+1;

            float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
            float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
            float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
            b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);

            x0 = x1;
        }

        // last column
        int x1 = size-1;
        int x2 = x1;
        float v0 = a[r0+x0] + 2.f*a[r1+x0] + a[r2+x0];
        float v1 = a[r0+x1] + 2.f*a[r1+x1] + a[r2+x1];
        float v2 = a[r0+x2] + 2.f*a[r1+x2] + a[r2+x2];
        b[y*size + x1] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
    });

    bench("blur-reduction", [&] {
        for (int y = 0; y < size; y++) {
            float* r0 = &a[std::max(0, y-1)*size];
            float* r1 = &a[y*size];
            float* r2 = &a[std::min(y+1, size-1)*size];

	    float v0 = r0[0] + 2.f*r1[0] + r2[0];
            float v1 = v0;
            for (int x = 0; x < (size-1); x++) {
                float v2 = r0[x+1] + 2.f*r1[x+1] + r2[x+1];
                b[y*size + x] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
                
                v0 = v1;
                v1 = v2;
            }

            // last column
            float v2 = v1;
            b[y*size + (size-1)] = (v0 + 2.f*v1 + v2) * (1.f/16.f);
        }
    });

    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++) {
            if (i1(x, y) != b[y*size + x]) {
                fprintf(stderr, "output mismatch (%i, %i) = %f or %f\n", x, y, i1(x, y), b[y*size + x]);
            }
        }
    }

    free(a);
    free(b);
    return EXIT_SUCCESS;
}

Running:

# hw=host, CC=clang++
mkdir -p $hw/bin &&
# uncomment to use the generated C++
# $CC $hw/gen/blur.c -c -o $hw/gen/blur.o -std=c++11 -Ofast -march=native &&
$CC src/bench.cpp $hw/gen/blur.o ../$hw/halide_runtime.a -Ofast -o $hw/bin/bench -I $hw/gen/ -I ../halide-headers/ -lpthread -ldl -lrt -std=c++11 -D_POSIX_C_SOURCE=200809 -march=native &&
./$hw/bin/bench

Intel i5 results:

  • inline, halide object file: ~0.8 ns/px
  • inline, halide c file: ~3.3 ns/px
  • inline, hand-written: ~2 ns/px
  • rotation of vertical reductions, hand-written: ~1 ns/px

Thank you for you help.
I also planned to add parallel tiles and vectorisation on top of this.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions