Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
9e89b7c
Specialized x86 implementation of interleave_vectors
abadams Jan 26, 2026
188bee0
Update test to be more exhaustive
abadams Jan 27, 2026
2ba8dde
Fix comment.
abadams Jan 27, 2026
d102f7b
Comment fix
abadams Jan 27, 2026
46d41dd
clang-tidy fixes
abadams Jan 27, 2026
27f1220
Make variable names more consistent
abadams Jan 27, 2026
5576f46
Simplify code with helper lambda
abadams Jan 27, 2026
107aaa5
Comment tweaks
abadams Jan 27, 2026
0bc1b9f
Don't do half-width unpcks
abadams Jan 28, 2026
cdc1de2
Use optimization fences in the base class too
abadams Jan 30, 2026
23b79ba
Merge branch 'main' into abadams/fix_x86_transpose
mcourteaux Feb 1, 2026
3eef5db
Use Catanzaro's algorithm for non-power-of-two interleaves
abadams Feb 12, 2026
678a353
Support more interleave and deinterleave patterns
abadams Feb 18, 2026
a0b7d66
Merge remote-tracking branch 'origin/main' into abadams/fix_x86_trans…
abadams Feb 18, 2026
4c1adf7
clang-tidy fix
abadams Feb 19, 2026
1c940e8
Handle multiple let injections at same site
abadams Feb 19, 2026
c39b1a0
Better simplification and better handling of composite factors
abadams Feb 20, 2026
794df0b
Fix innermost_containing_node
abadams Feb 20, 2026
486addd
Fix some simd op check failures
abadams Feb 21, 2026
a1ecca9
Fix infinite recursion issue and missed case in interleave codegen
abadams Feb 23, 2026
f66d5ea
Adjust expectations in stage_strided_loads test
abadams Feb 23, 2026
c25142f
Allow reversed suffix or not in sve test
abadams Feb 23, 2026
bae3e02
Don't use optimization fences on hexagon
abadams Feb 23, 2026
b7defbd
Fix infinite simplifier loop
abadams Feb 23, 2026
23944a0
Don't hoist transposes on hexagon
abadams Feb 23, 2026
0d110d2
Make distinct strided load nodes in the IR distinct in memory too
abadams Feb 23, 2026
53ae7e4
Merge remote-tracking branch 'origin/main' into abadams/fix_x86_trans…
abadams Feb 24, 2026
84f10b1
arm-32 has no vst2 for 64-bit elements
abadams Feb 24, 2026
8d93c3c
Windows bad filename fix in simd op check
abadams Feb 24, 2026
36565ce
Temporary dumping of cpu info to debug github actions issue
abadams Feb 24, 2026
3f45c47
dump cpuinfo in makefile testing workflow
abadams Feb 24, 2026
223dd7f
Merge remote-tracking branch 'origin/main' into abadams/fix_x86_trans…
abadams Mar 2, 2026
2695151
Address review comments
abadams Mar 6, 2026
31f180a
Merge remote-tracking branch 'origin/main' into abadams/fix_x86_trans…
abadams Mar 10, 2026
2962ea1
Remove duplicate function body
abadams Mar 10, 2026
fa2fcb7
Use slice of predicate
abadams Mar 11, 2026
dcdfb90
clang-format
abadams Mar 11, 2026
70afc58
SVE fixes
abadams Mar 11, 2026
cd04fb2
Merge branch 'main' into abadams/fix_x86_transpose
alexreinking Mar 16, 2026
5d2b524
Move optimization_fence back
alexreinking Mar 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/testing-make.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ jobs:
"lld-${LLVM_VERSION}" \
"liblld-${LLVM_VERSION}-dev"
echo "LLVM_CONFIG=llvm-config-${LLVM_VERSION}" | tee -a "$GITHUB_ENV"
cat /proc/cpuinfo
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's intentional, all good. Just checking, because cpuinfo is typically VERY long in output.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, this is here because something like 10% of GHA x86 runners are throwing illegal-instruction exceptions, indicating a feature-detection bug on certain platforms.

That said, this should probably be something like:

cat /proc/cpuinfo | grep flags | head -1

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to see everything (e.g. precise cpu model, number of cpus, etc), in case it helps narrow it down.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can probably back this out since #9017 was merged

elif [ "$RUNNER_OS" = "macOS" ]; then
brew install libjpeg-turbo libpng pkgconf protobuf "llvm@${LLVM_VERSION}" "lld@${LLVM_VERSION}"
echo "LLVM_CONFIG=$(brew --prefix "llvm@${LLVM_VERSION}")/bin/llvm-config" | tee -a "$GITHUB_ENV"
Expand Down
2 changes: 1 addition & 1 deletion apps/iir_blur/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule.
$(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS)

$(BIN)/%/out.png: $(BIN)/%/filter
$< ../images/rgba.png $(BIN)/$*/out.png
$< ../images/rgb.png $(BIN)/$*/out.png

clean:
rm -rf $(BIN)
19 changes: 13 additions & 6 deletions apps/iir_blur/iir_blur_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,26 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule
if (!skip_schedule) {
if (!target.has_gpu_feature()) {
// CPU schedule.
// 8.2ms on an Intel i9-9960X using 16 threads
// 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads
// Split the transpose into tiles of rows. Parallelize over channels
// and strips (Halide supports nested parallelism).
Var xo, yo, t;
// and strips.
Var xo, yo, t, yi;
transpose.compute_root()
.tile(x, y, xo, yo, x, y, vec, vec * 4)
.split(y, y, yi, vec)
.vectorize(yi)
.vectorize(x)
.parallel(yo)
.parallel(c);
.fuse(yo, c, t)
.parallel(t);

blur.in(transpose)
.compute_at(transpose, y)
.vectorize(x)
.unroll(y);

// Run the filter on each row of tiles (which corresponds to a strip of
// columns in the input).
blur.compute_at(transpose, yo);
blur.compute_at(transpose, t);

// Vectorize computations within the strips.
blur.update(0)
Expand Down
35 changes: 32 additions & 3 deletions src/CSE.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,39 @@ class CSEEveryExprInStmt : public IRMutator {
}
const Call *bundle = Call::as_intrinsic(dummy, {Call::bundle});
internal_assert(bundle && bundle->args.size() == 2);
Stmt s = Store::make(op->name, bundle->args[0], bundle->args[1],

Expr value = bundle->args[0], index = bundle->args[1];

// Figure out which ones are actually needed by the index
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we have a dead code elimination pass?
Also, in what kind of case would there be more than what we need?
We're looking at a buffer[index] = let x in let y in let z in ...; type of expression.


auto add_all_vars_to_set = [&](const Expr &e, std::set<std::string> &s) {
visit_with(e, [&](auto *, const Variable *var) {
s.insert(var->name);
});
};

std::set<string> index_lets;
add_all_vars_to_set(index, index_lets);
for (const auto &[var, val] : reverse_view(lets)) {
if (index_lets.count(var)) {
add_all_vars_to_set(val, index_lets);
}
}

vector<pair<string, Expr>> deferred;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a comment why some are Let and some are deferred into a LetStmt?

for (const auto &[var, val] : reverse_view(lets)) {
if (index_lets.count(var)) {
deferred.emplace_back(var, val);
} else {
value = Let::make(var, val, value);
}
}

Stmt s = Store::make(op->name, value, index,
op->param, mutate(op->predicate), op->alignment);
for (const auto &[var, value] : reverse_view(lets)) {
s = LetStmt::make(var, value, s);

for (const auto &[var, val] : deferred) {
s = LetStmt::make(var, val, s);
}
return s;
}
Expand Down
10 changes: 6 additions & 4 deletions src/CodeGen_ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1478,10 +1478,11 @@ void CodeGen_ARM::visit(const Store *op) {
intrin_type = t;
Type elt = t.element_of();
int vec_bits = t.bits() * t.lanes();
if (elt == Float(32) || elt == Float(64) ||
is_float16_and_has_feature(elt) ||
elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) {
if (t.bits() <= target.bits &&
(elt == Float(32) || elt == Float(64) ||
is_float16_and_has_feature(elt) ||
elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64))) {
const int target_vector_bits = native_vector_bits();
if (vec_bits % 128 == 0) {
type_ok_for_vst = true;
Expand Down Expand Up @@ -1895,6 +1896,7 @@ void CodeGen_ARM::visit(const Shuffle *op) {
if (target.os != Target::IOS && target.os != Target::OSX &&
load &&
op->vectors.size() == 1 &&
op->is_slice() &&
2 <= stride && stride <= 4 &&
op->slice_begin() < stride &&
load->type.lanes() == stride * op->type.lanes()) {
Expand Down
11 changes: 7 additions & 4 deletions src/CodeGen_Hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ class CodeGen_Hexagon : public CodeGen_Posix {
llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &v) override;
llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
const std::vector<int> &indices) override;
llvm::Value *optimization_fence(llvm::Value *v) override;
using CodeGen_Posix::shuffle_vectors;
///@}

Expand Down Expand Up @@ -1301,6 +1302,12 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
return vdelta(concat_vectors({a, b}), indices);
}

// Hexagon-specific override: intentionally a no-op that returns the value
// unchanged, disabling the base-class optimization fence on this backend.
Value *CodeGen_Hexagon::optimization_fence(Value *v) {
    // As of llvm 21, the base class version seems to trip up LLVM's hexagon
    // backend, possibly because it relies on a floating point type.
    // NOTE(review): returning v directly means no fence is emitted at all on
    // Hexagon — callers must not rely on the fence's reassociation-blocking
    // effect here.
    return v;
}

Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
int max_index) {
llvm::Type *lut_ty = lut->getType();
Expand Down Expand Up @@ -1409,10 +1416,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
return slice_vector(concat_vectors(result), 0, idx_elements);
}

// Returns true iff x is a positive power of two.
//
// The bit trick (x & (x - 1)) clears the lowest set bit, so it yields zero
// for exact powers of two — but also for x == 0, and it is unreliable for
// negative inputs. Guard with x > 0 so that 0 and negative values are
// correctly rejected.
bool is_power_of_two(int x) {
    return x > 0 && (x & (x - 1)) == 0;
}

// vdelta and vrdelta are instructions that take an input vector and
// pass it through a network made up of levels. Each element x at each
// level i can either take the element from the previous level at the
Expand Down
Loading
Loading