diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c9dc230..c62ea5b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -101,15 +101,21 @@ jobs: exit 0 fi - # Also check all commits in the PR for [build] + # Set up PR refs and fetch base branch to ensure we have the commits BASE_SHA="${{ github.event.pull_request.base.sha }}" HEAD_SHA="${{ github.event.pull_request.head.sha }}" - COMMIT_MSGS=$(git fetch origin "$BASE_SHA" "$HEAD_SHA" && git log --format=%B "${BASE_SHA}..${HEAD_SHA}" || echo "") + git fetch origin "${{ github.event.pull_request.base.ref }}" --quiet || true + + # Check all commits in the PR for [build] + COMMIT_MSGS=$(git log --format=%B "${BASE_SHA}..${HEAD_SHA}" 2>/dev/null || echo "") if echo "$COMMIT_MSGS" | grep -q "\[build\]"; then echo "should_build=true" >> "$GITHUB_OUTPUT" echo "Running build because a commit in the PR contains [build]" exit 0 fi + + # Check which files changed in the PR + CHANGED_FILES=$(git diff --name-only "${BASE_SHA}" "${HEAD_SHA}" 2>/dev/null || echo "") else # For pushes, check if the head commit message contains [build] if [[ "${{ contains(github.event.head_commit.message, '[build]') }}" == "true" ]]; then @@ -117,14 +123,7 @@ jobs: echo "Running build because commit message contains [build]" exit 0 fi - fi - # For PRs, compare the PR base with the head - if [[ "${{ github.event_name }}" == "pull_request" ]]; then - BASE_SHA="${{ github.event.pull_request.base.sha }}" - HEAD_SHA="${{ github.event.pull_request.head.sha }}" - CHANGED_FILES=$(git fetch origin "$BASE_SHA" "$HEAD_SHA" && git diff --name-only "${BASE_SHA}" "${HEAD_SHA}" || echo "") - else # For pushes, use the before/after SHAs or fallback to comparing with parent BEFORE_SHA="${{ github.event.before }}" AFTER_SHA="${{ github.event.after }}" diff --git a/src/hyperscan/extension.c b/src/hyperscan/extension.c index cb61de8..f34e497 100644 --- a/src/hyperscan/extension.c +++ b/src/hyperscan/extension.c @@ -717,12 +717,12 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds) char **data; PyObject *fast_seq; Py_ssize_t num_buffers; - Py_ssize_t *lengths; + uint32_t *lengths; fast_seq = PySequence_Fast(odata, "expected a sequence of buffers"); num_buffers = PySequence_Fast_GET_SIZE(fast_seq); data = PyMem_RawMalloc(num_buffers * sizeof(char *)); - lengths = PyMem_RawMalloc(num_buffers * sizeof(Py_ssize_t)); + lengths = PyMem_RawMalloc(num_buffers * sizeof(uint32_t)); for (uint32_t i = 0; i < num_buffers; i++) { PyObject *o = PySequence_Fast_GET_ITEM(fast_seq, i); @@ -735,7 +735,7 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds) Py_buffer view; if (PyObject_GetBuffer(o, &view, PyBUF_SIMPLE) != -1) { data[i] = (char *)view.buf; - lengths[i] = view.len; + lengths[i] = (uint32_t)view.len; } else { PyErr_SetString(PyExc_BufferError, "failed to get buffer"); break; @@ -761,7 +761,7 @@ static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds) hs_err = hs_scan_vector( self->hs_db, (const char *const *)data, - (const uint32_t *)lengths, + lengths, num_buffers, flags, oscratch == Py_None ? ((Scratch *)self->scratch)->hs_scratch diff --git a/tests/test_hyperscan.py b/tests/test_hyperscan.py index f5a519a..e9d5ea5 100644 --- a/tests/test_hyperscan.py +++ b/tests/test_hyperscan.py @@ -120,19 +120,28 @@ def test_stream_scan(database_stream, mocker): def test_vectored_scan(database_vector, mocker): + """Test vectored scanning across multiple buffers. + + Regression test for issue #202: vectored mode was missing matches + due to Py_ssize_t to uint32_t type aliasing bug on 64-bit systems. + """ callback = mocker.Mock(return_value=None) buffers = [ - bytearray(b"xxxfooxxx"), - bytearray(b"xxfoxbarx"), - bytearray(b"barxxxxxx"), + bytearray(b"xxxfooxxx"), # 9 bytes, offsets 0-8 + bytearray(b"xxfoxbarx"), # 9 bytes, offsets 9-17 + bytearray(b"barxxxxxx"), # 9 bytes, offsets 18-26 ] database_vector.scan(buffers, match_event_handler=callback) callback.assert_has_calls( [ - mocker.call(0, 0, 5, 0, None), - mocker.call(0, 0, 6, 0, None), - mocker.call(2, 9, 12, 0, None), + # Pattern 0 (fo+): matches in buffer 0 and buffer 1 + mocker.call(0, 0, 5, 0, None), # 'fo' at positions 3-4 + mocker.call(0, 0, 6, 0, None), # 'foo' at positions 3-5 + mocker.call(0, 0, 13, 0, None), # 'fo' in buffer 1 at pos 11-12 + # Pattern 2 (BAR): matches in buffer 1 and buffer 2 + mocker.call(2, 14, 17, 0, None), # 'bar' in buffer 1 + mocker.call(2, 18, 21, 0, None), # 'bar' in buffer 2 ], any_order=True, )