diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9843d63..3594574 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -57,7 +57,7 @@ jobs: ((github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name != github.repository) || (github.event.pull_request.merged == true && startsWith(github.event.pull_request.head.ref, vars.RELEASE_PR_BRANCH || 'create-pull-request'))) || (github.repository == 'darvid/python-hyperscan' && contains(github.event.head_commit.message, '[build]')) run: | - echo "valid_event=true" >> $GITHUB_OUTPUT + echo "valid_event=true" >> "$GITHUB_OUTPUT" check_changes: name: Build pre-conditions check @@ -76,9 +76,11 @@ jobs: - name: Check if build is needed id: check + env: + PR_TITLE: ${{ github.event.pull_request.title }} run: | if [[ "${{ inputs.force_build || false }}" == "true" ]]; then - echo "should_build=true" >> $GITHUB_OUTPUT + echo "should_build=true" >> "$GITHUB_OUTPUT" echo "Running build because force_build is true" exit 0 fi @@ -86,9 +88,8 @@ jobs: # Check for [build] tag in commit messages or PR title if [[ "${{ github.event_name }}" == "pull_request" ]]; then # For PRs, check if PR title contains [build] - PR_TITLE="${{ github.event.pull_request.title }}" if [[ "$PR_TITLE" == *"[build]"* ]]; then - echo "should_build=true" >> $GITHUB_OUTPUT + echo "should_build=true" >> "$GITHUB_OUTPUT" echo "Running build because PR title contains [build]" exit 0 fi @@ -96,16 +97,16 @@ jobs: # Also check all commits in the PR for [build] BASE_SHA="${{ github.event.pull_request.base.sha }}" HEAD_SHA="${{ github.event.pull_request.head.sha }}" - COMMIT_MSGS=$(git fetch origin $BASE_SHA $HEAD_SHA && git log --format=%B $BASE_SHA..$HEAD_SHA || echo "") + COMMIT_MSGS=$(git fetch origin "$BASE_SHA" "$HEAD_SHA" && git log --format=%B "${BASE_SHA}..${HEAD_SHA}" || echo "") if echo "$COMMIT_MSGS" | grep -q "\[build\]"; then - echo "should_build=true" >> $GITHUB_OUTPUT + echo 
"should_build=true" >> "$GITHUB_OUTPUT" echo "Running build because a commit in the PR contains [build]" exit 0 fi else # For pushes, check if the head commit message contains [build] if [[ "${{ contains(github.event.head_commit.message, '[build]') }}" == "true" ]]; then - echo "should_build=true" >> $GITHUB_OUTPUT + echo "should_build=true" >> "$GITHUB_OUTPUT" echo "Running build because commit message contains [build]" exit 0 fi @@ -115,7 +116,7 @@ jobs: if [[ "${{ github.event_name }}" == "pull_request" ]]; then BASE_SHA="${{ github.event.pull_request.base.sha }}" HEAD_SHA="${{ github.event.pull_request.head.sha }}" - CHANGED_FILES=$(git fetch origin $BASE_SHA $HEAD_SHA && git diff --name-only $BASE_SHA $HEAD_SHA || echo "") + CHANGED_FILES=$(git fetch origin "$BASE_SHA" "$HEAD_SHA" && git diff --name-only "${BASE_SHA}" "${HEAD_SHA}" || echo "") else # For pushes, use the before/after SHAs or fallback to comparing with parent BEFORE_SHA="${{ github.event.before }}" @@ -126,12 +127,12 @@ jobs: CHANGED_FILES=$(git diff --name-only HEAD^ || echo "") else # Try to fetch the commits first to make sure they exist - git fetch --depth=1 origin $BEFORE_SHA || true - git fetch --depth=1 origin $AFTER_SHA || true + git fetch --depth=1 origin "${BEFORE_SHA}" || true + git fetch --depth=1 origin "${AFTER_SHA}" || true # Check if both SHAs exist in the repository - if git cat-file -e $BEFORE_SHA 2>/dev/null && git cat-file -e $AFTER_SHA 2>/dev/null; then - CHANGED_FILES=$(git diff --name-only $BEFORE_SHA $AFTER_SHA || echo "") + if git cat-file -e "${BEFORE_SHA}" 2>/dev/null && git cat-file -e "${AFTER_SHA}" 2>/dev/null; then + CHANGED_FILES=$(git diff --name-only "${BEFORE_SHA}" "${AFTER_SHA}" || echo "") else # Fallback to comparing with parent commit echo "Cannot find one of the SHAs, falling back to HEAD^" @@ -144,16 +145,16 @@ jobs: - RESULT=1 + RESULT=0 echo "$CHANGED_FILES" | grep -q -E '^(src/hyperscan/|README.md|CMakeLists.txt|pyproject.toml|MANIFEST.in|cmake/|build_tools/)' ||
RESULT=$? - if [[ $RESULT -eq 0 ]]; then - echo "should_build=true" >> $GITHUB_OUTPUT + if [[ "$RESULT" -eq 0 ]]; then + echo "should_build=true" >> "$GITHUB_OUTPUT" echo "Running build because relevant files were changed" else - echo "should_build=false" >> $GITHUB_OUTPUT + echo "should_build=false" >> "$GITHUB_OUTPUT" echo "Skipping build because no relevant files were changed and commit doesn't have [build] tag" fi else # For pull requests, always build (after checking for [build] tag above) - echo "should_build=true" >> $GITHUB_OUTPUT + echo "should_build=true" >> "$GITHUB_OUTPUT" echo "Running build for pull request" fi diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 6f8f648..6885391 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -54,11 +54,18 @@ jobs: src: "./src" args: check --fix + - name: Validate GitHub workflows + uses: raven-actions/actionlint@v2 + - name: Debug refs + env: + GITHUB_HEAD_REF: ${{ github.head_ref }} + GITHUB_REF: ${{ github.ref }} + GITHUB_SHA: ${{ github.sha }} run: | - echo "github.ref: ${{ github.ref }}" - echo "github.head_ref: ${{ github.head_ref }}" - echo "github.sha: ${{ github.sha }}" + echo "github.ref: ${GITHUB_REF}" + echo "github.head_ref: ${GITHUB_HEAD_REF}" + echo "github.sha: ${GITHUB_SHA}" - name: Commit formatting changes uses: iarekylew00t/verified-bot-commit@v1 diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index b462a88..67f62c2 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -11,7 +11,7 @@ jobs: build: name: Build source distribution and wheels uses: ./.github/workflows/build.yml - if: github.event.pull_request.merged == true && startsWith(github.event.pull_request.head.ref, ${{ vars.RELEASE_PR_BRANCH || 'create-pull-request' }}) && github.repository == 'darvid/python-hyperscan' + if: github.event.pull_request.merged == true && startsWith(github.event.pull_request.head.ref, vars.RELEASE_PR_BRANCH || 
'create-pull-request') && github.repository == 'darvid/python-hyperscan' permissions: contents: read actions: write @@ -49,23 +49,23 @@ jobs: # Check if HEAD already has a release version tag (prevents redundant releases) if git describe --exact-match --tags HEAD --match "v*" 2>/dev/null; then EXISTING_TAG=$(git describe --exact-match --tags HEAD --match "v*" 2>/dev/null) - echo "HEAD already tagged with release version $EXISTING_TAG, no release needed" - echo "should_release=false" >> $GITHUB_OUTPUT + echo "HEAD already tagged with release version ${EXISTING_TAG}, no release needed" + echo "should_release=false" >> "$GITHUB_OUTPUT" else # Check if there are commits since last release LATEST_TAG=$(git describe --tags --abbrev=0 --match "v*" 2>/dev/null || echo "") if [[ -n "$LATEST_TAG" ]]; then - COMMITS_COUNT=$(git rev-list ${LATEST_TAG}..HEAD --count 2>/dev/null || echo "1") + COMMITS_COUNT=$(git rev-list "${LATEST_TAG}"..HEAD --count 2>/dev/null || echo "1") if [[ "$COMMITS_COUNT" -eq 0 ]]; then - echo "No commits since last release $LATEST_TAG, no new content to release" - echo "should_release=false" >> $GITHUB_OUTPUT + echo "No commits since last release ${LATEST_TAG}, no new content to release" + echo "should_release=false" >> "$GITHUB_OUTPUT" else - echo "Found $COMMITS_COUNT commits since $LATEST_TAG, proceeding with release" - echo "should_release=true" >> $GITHUB_OUTPUT + echo "Found ${COMMITS_COUNT} commits since ${LATEST_TAG}, proceeding with release" + echo "should_release=true" >> "$GITHUB_OUTPUT" fi else echo "No previous release found, proceeding with initial release" - echo "should_release=true" >> $GITHUB_OUTPUT + echo "should_release=true" >> "$GITHUB_OUTPUT" fi fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7516e87..d07e047 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -35,18 +35,18 @@ jobs: CHANGED_FILES="" fi echo "Changed files:" - echo "$CHANGED_FILES" + echo 
"${CHANGED_FILES}" CHANGES=0 - echo "$CHANGED_FILES" | grep -c -E '^(src/hyperscan/|README.md|CMakeLists.txt|pyproject.toml|MANIFEST.in|cmake/|build_tools/)' || CHANGES=$? + CHANGES=$(echo "${CHANGED_FILES}" | grep -c -E '^(src/hyperscan/|README.md|CMakeLists.txt|pyproject.toml|MANIFEST.in|cmake/|build_tools/)' || true) if [[ "$CHANGES" -gt 0 ]]; then # The last commit already triggered a build, no need to force - echo "force_build=false" >> $GITHUB_OUTPUT + echo "force_build=false" >> "$GITHUB_OUTPUT" echo "Last commit already triggered a build" else # The last commit didn't trigger a build, we need to force it - echo "force_build=true" >> $GITHUB_OUTPUT + echo "force_build=true" >> "$GITHUB_OUTPUT" echo "Last commit didn't trigger a build, forcing build" fi @@ -79,6 +79,9 @@ jobs: needs: [check_build, check_release] if: github.repository == 'darvid/python-hyperscan' && !contains(github.event.head_commit.message, 'python-semantic-release') && (needs.check_build.outputs.is_build_needed == 'true' || needs.check_release.outputs.is_release_needed == 'true') uses: ./.github/workflows/build.yml + permissions: + contents: read + actions: write with: force_build: "${{ needs.check_release.outputs.is_release_needed == 'true' || fromJSON(needs.check_build.outputs.is_build_needed) }}" @@ -121,11 +124,11 @@ jobs: if: needs.check_release.outputs.is_release_needed == 'true' run: | # Check if branch exists on remote and delete it if it does - if git ls-remote --heads origin ${RELEASE_PR_BRANCH} | grep -q ${RELEASE_PR_BRANCH}; then - git push origin --delete ${RELEASE_PR_BRANCH} + if git ls-remote --heads origin "${RELEASE_PR_BRANCH}" | grep -q "${RELEASE_PR_BRANCH}"; then + git push origin --delete "${RELEASE_PR_BRANCH}" fi # Create new branch - git switch -c ${RELEASE_PR_BRANCH} + git switch -c "${RELEASE_PR_BRANCH}" - name: Semantic release uses: python-semantic-release/python-semantic-release@v9.10.1 @@ -142,7 +145,7 @@ jobs: - name: Create PR if:
needs.check_release.outputs.is_release_needed == 'true' run: | - gh pr create -B main -H $RELEASE_PR_BRANCH \ + gh pr create -B main -H "$RELEASE_PR_BRANCH" \ --title "$PR_TITLE" \ --body '🤖' env: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 59e4b84..994132e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,3 +14,7 @@ repos: hooks: - id: commitizen stages: [commit-msg] + - repo: https://github.com/rhysd/actionlint + rev: v1.7.4 + hooks: + - id: actionlint diff --git a/actionlint b/actionlint new file mode 100755 index 0000000..e69de29 diff --git a/test_issue_207.py b/test_issue_207.py new file mode 100644 index 0000000..e8c6dcc --- /dev/null +++ b/test_issue_207.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 + +import hyperscan + +print(f'hyperscan version: {hyperscan.__version__}') + +# Exact code from GitHub issue #207 +bla = [r'السلام عليكم\s<\/span>'.encode('utf8'), + r'ועליכום הסלאם\s<\/span>'.encode('utf8')] + +print(f'Testing patterns: {bla}') + +try: + rules_db = hyperscan.Database() + rules_db.compile(expressions=bla, + flags=hyperscan.HS_FLAG_UTF8 | hyperscan.HS_FLAG_UCP) + print('SUCCESS: Patterns compiled with HS_FLAG_UTF8 | HS_FLAG_UCP!') +except Exception as e: + print(f'FAILED: {e}') + if 'Expression is not valid UTF-8' in str(e): + print('*** THIS IS THE EXACT BUG FROM ISSUE #207! 
***') + else: + print('*** Different error ***') \ No newline at end of file diff --git a/test_unicode.py b/test_unicode.py new file mode 100644 index 0000000..436ab96 --- /dev/null +++ b/test_unicode.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python3 + +import hyperscan +print(f'hyperscan version: {hyperscan.__version__}') + +# Test unicode pattern compilation +patterns = ['السلام عليكم', 'ועליכום הסלאם'] +print(f'Testing unicode patterns: {patterns}') + +try: + db = hyperscan.Database() + db.compile(expressions=patterns) + print('SUCCESS: Unicode patterns compiled without errors!') +except Exception as e: + print(f'FAILED: {str(e)}') + if 'Expression is not valid UTF-8' in str(e): + print('*** THIS IS THE BUG - the fix is NOT working! ***') + else: + print('*** Different error, not the unicode bug ***') \ No newline at end of file diff --git a/test_wheels/hyperscan-0.7.19.dist-info/METADATA b/test_wheels/hyperscan-0.7.19.dist-info/METADATA new file mode 100644 index 0000000..1c17d21 --- /dev/null +++ b/test_wheels/hyperscan-0.7.19.dist-info/METADATA @@ -0,0 +1,99 @@ +Metadata-Version: 2.2 +Name: hyperscan +Version: 0.7.19 +Summary: Python bindings for Hyperscan. 
+Keywords: regex,hypercan +Author-Email: David Gidwani +License: MIT +Classifier: Development Status :: 4 - Beta +Classifier: Topic :: Software Development :: Libraries +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Classifier: Topic :: Utilities +Classifier: Programming Language :: Python +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 +Classifier: Programming Language :: Python :: Implementation :: CPython +Classifier: Environment :: Console +Classifier: Intended Audience :: Developers +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: POSIX :: Linux +Classifier: Operating System :: Unix +Classifier: Operating System :: MacOS +Classifier: Operating System :: Microsoft :: Windows +Project-URL: Homepage, https://github.com/darvid/python-hyperscan +Project-URL: Repository, https://github.com/darvid/python-hyperscan +Project-URL: Documentation, https://python-hyperscan.readthedocs.io/en/latest/ +Requires-Python: <4.0,>=3.9 +Description-Content-Type: text/markdown + +# Hyperscan/Vectorscan for Python + +![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/darvid/python-hyperscan/build.yml?style=plastic) +![PyPI - Version](https://img.shields.io/pypi/v/hyperscan?style=plastic) +![PyPI - Downloads](https://img.shields.io/pypi/dm/hyperscan?style=plastic) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/hyperscan.svg?style=plastic) +![PyPI - Wheel](https://img.shields.io/pypi/wheel/hyperscan.svg?style=plastic) +![PyPI - License](https://img.shields.io/pypi/l/hyperscan.svg?style=plastic) +[![Read the 
Docs](https://img.shields.io/readthedocs/python-hyperscan.svg?style=plastic)](https://python-hyperscan.readthedocs.io/en/latest/) + +A CPython extension for [Vectorscan][7], an open source fork of +[Hyperscan][8], Intel's open source ([prior to version 5.4][9]), +high-performance multiple regex matching library. + +* ✅ Binary [manylinux][12]-compatible wheels +* ✅ Statically linked (no need to build Hyperscan/Vectorscan) +* ✅ [Chimera][1] support + +## Installation + +```shell +# 🪄 Installing libhs is NOT required, because python-hyperscan is statically linked +pip install hyperscan +``` + +## Build Optimization + +If you'd like to use Intel's Hyperscan rather than Vectorscan, or if +you'd like to enable native CPU detection to build optimized non-FAT +libraries ([default off in Vectorscan][11]), extending the +[manylinux-hyperscan][10] Docker image used to build the binary wheels +for this library should be fairly straightforward. + +## API Support + +``python-hyperscan`` currently exposes *most* of the C API, with the +following caveats or exceptions: + +* No [stream compression][2] support. +* No [custom allocator][3] support. +* ``hs_expression_info``, ``hs_expression_ext_info``, + ``hs_populate_platform``, and ``hs_serialized_database_info`` not + exposed yet. + +See the [documentation][6] for more detailed build instructions. 
+ +## Resources + +* [PyPI Project][13] +* [Documentation][6] +* [Hyperscan C API Documentation][14] + +[1]: http://intel.github.io/hyperscan/dev-reference/chimera.html +[2]: http://intel.github.io/hyperscan/dev-reference/runtime.html#stream-compression +[3]: http://intel.github.io/hyperscan/dev-reference/runtime.html#custom-allocators +[4]: http://intel.github.io/hyperscan/dev-reference/compilation.html +[5]: https://github.com/darvid/python-hyperscan/issues +[6]: https://python-hyperscan.readthedocs.io +[7]: https://www.vectorcamp.gr/vectorscan/ +[8]: https://www.hyperscan.io/ +[9]: https://github.com/VectorCamp/vectorscan?tab=readme-ov-file#hyperscan-license-change-after-54 +[10]: https://github.com/darvid/manylinux-hyperscan/ +[11]: https://github.com/VectorCamp/vectorscan?tab=readme-ov-file#configure--build +[12]: https://github.com/pypa/manylinux +[13]: https://pypi.org/project/hyperscan/ +[14]: http://intel.github.io/hyperscan/dev-reference/ diff --git a/test_wheels/hyperscan-0.7.19.dist-info/RECORD b/test_wheels/hyperscan-0.7.19.dist-info/RECORD new file mode 100644 index 0000000..0b409d9 --- /dev/null +++ b/test_wheels/hyperscan-0.7.19.dist-info/RECORD @@ -0,0 +1,10 @@ +hyperscan-0.7.19.dist-info/METADATA,sha256=6NsPEGGFUJdhx_ulIMD4Ff-cgw4FlNm1rgaFDQ7yI2Q,4299 +hyperscan-0.7.19.dist-info/RECORD,, +hyperscan-0.7.19.dist-info/WHEEL,sha256=6Dxtid-NXEnR7jvm4_GvErwSb88e3UzL8AWq9MWuAAE,156 +hyperscan-0.7.19.dist-info/licenses/LICENSE,sha256=yvm4yRI_IxT-4iZOEl1Nx9I0Dm0JbAbmHt8OmKopiUA,1070 +hyperscan/__init__.py,sha256=ImBXLA9RN8dJIx94n6R3iRUOBO7v1-q8vImzzKPVLbU,367 +hyperscan/extension.c,sha256=xcYkpNIuIIYNGFWKC46lp9YYbOABu5EpDpSeW09AFgQ,47700 +hyperscan/_version.py,sha256=-_OxJPv2D0J4Tap1QJZo4Z4XyBYoG9M_2-0CsJ35W-I,23 +hyperscan/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 +hyperscan/__init__.pyi,sha256=oRU1eShJUV5-mQheZfDCbZYTpVWPyS0dHrhmbT0ewiI,10768 
+hyperscan/_hs_ext.cpython-311-x86_64-linux-gnu.so,sha256=KIaQV29IP80Ed1uJgU38d8nX4nBhDvzbugfr5fX2XnE,7051104 diff --git a/test_wheels/hyperscan-0.7.19.dist-info/WHEEL b/test_wheels/hyperscan-0.7.19.dist-info/WHEEL new file mode 100644 index 0000000..e2fc265 --- /dev/null +++ b/test_wheels/hyperscan-0.7.19.dist-info/WHEEL @@ -0,0 +1,6 @@ +Wheel-Version: 1.0 +Generator: scikit-build-core 0.11.5 +Root-Is-Purelib: false +Tag: cp311-cp311-manylinux_2_17_x86_64 +Tag: cp311-cp311-manylinux2014_x86_64 + diff --git a/test_wheels/hyperscan-0.7.19.dist-info/licenses/LICENSE b/test_wheels/hyperscan-0.7.19.dist-info/licenses/LICENSE new file mode 100644 index 0000000..330bcda --- /dev/null +++ b/test_wheels/hyperscan-0.7.19.dist-info/licenses/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2017 David Gidwani + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/test_wheels/hyperscan/__init__.py b/test_wheels/hyperscan/__init__.py new file mode 100644 index 0000000..7fab16d --- /dev/null +++ b/test_wheels/hyperscan/__init__.py @@ -0,0 +1,17 @@ +import typing + +from hyperscan._hs_ext import * # noqa: F403 + +try: + from hyperscan._version import __version__ # pyright: ignore +except ImportError: + __version__ = "unknown" + + +class ExpressionExt(typing.NamedTuple): + flags: int + min_offset: int = 0 + max_offset: int = 0 + min_length: int = 0 + edit_distance: int = 0 + hamming_distance: int = 0 diff --git a/test_wheels/hyperscan/__init__.pyi b/test_wheels/hyperscan/__init__.pyi new file mode 100644 index 0000000..177f26e --- /dev/null +++ b/test_wheels/hyperscan/__init__.pyi @@ -0,0 +1,352 @@ +from typing import ( + AnyStr, + ByteString, + Callable, + Optional, + Self, + Sequence, + Tuple, + TypeAlias, + Union, + Type, + TracebackType, +) + +CH_BAD_ALIGN = -8 +CH_BAD_ALLOC = -9 +CH_COMPILER_ERROR = -4 +CH_DB_MODE_ERROR = -7 +CH_DB_PLATFORM_ERROR = -6 +CH_DB_VERSION_ERROR = -5 +CH_FAIL_INTERNAL = -32 +CH_FLAG_CASELESS = 1 +CH_FLAG_DOTALL = 2 +CH_FLAG_MULTILINE = 4 +CH_FLAG_SINGLEMATCH = 8 +CH_FLAG_UCP = 64 +CH_FLAG_UTF8 = 32 +CH_INVALID = -1 +CH_MODE_GROUPS = 1048576 +CH_MODE_NOGROUPS = 0 +CH_NOMEM = -2 +CH_SCAN_TERMINATED = -3 +CH_SCRATCH_IN_USE = -10 +CH_SUCCESS = 0 +HS_CPU_FEATURES_AVX2 = 4 +HS_EXT_FLAG_EDIT_DISTANCE = 8 +HS_EXT_FLAG_HAMMING_DISTANCE = 16 +HS_EXT_FLAG_MAX_OFFSET = 2 +HS_EXT_FLAG_MIN_LENGTH = 4 +HS_EXT_FLAG_MIN_OFFSET = 1 +HS_FLAG_ALLOWEMPTY = 16 +HS_FLAG_CASELESS = 1 +HS_FLAG_COMBINATION = 512 +HS_FLAG_DOTALL = 2 +HS_FLAG_MULTILINE = 4 +HS_FLAG_PREFILTER = 128 +HS_FLAG_QUIET = 1024 +HS_FLAG_SINGLEMATCH = 8 +HS_FLAG_SOM_LEFTMOST = 256 +HS_FLAG_UCP = 64 +HS_FLAG_UTF8 = 32 +HS_MODE_BLOCK = 1 +HS_MODE_NOSTREAM = 1 +HS_MODE_SOM_HORIZON_LARGE = 16777216 +HS_MODE_SOM_HORIZON_MEDIUM = 33554432 +HS_MODE_SOM_HORIZON_SMALL = 67108864 +HS_MODE_STREAM = 2 +HS_MODE_VECTORED = 4 +HS_OFFSET_PAST_HORIZON = 
-1 +HS_SCRATCH_IN_USE = -10 +HS_SUCCESS = 0 +HS_TUNE_FAMILY_BDW = 5 +HS_TUNE_FAMILY_GENERIC = 0 +HS_TUNE_FAMILY_GLM = 8 +HS_TUNE_FAMILY_HSW = 3 +HS_TUNE_FAMILY_IVB = 2 +HS_TUNE_FAMILY_SKL = 6 +HS_TUNE_FAMILY_SKX = 7 +HS_TUNE_FAMILY_SLM = 4 +HS_TUNE_FAMILY_SNB = 1 + +match_event_callback = Callable[[int, int, int, int, object], Optional[bool]] + +def dumpb(database: "Database") -> bytes: + """Serializes a Hyperscan database. + + Args: + database (:class:`Database`): A Hyperscan database. + + Returns: + bytes: A serialized representation of the database. + + """ + +def loadb(buf: ByteString, mode: int) -> "Database": + """Deserializes a Hyperscan database. + + Args: + buf (bytes): A serialized Hyperscan database. + mode (int): The expected mode of the database. + + Returns: + :class:`Database`: The deserialized database instance. + + """ + +class error(Exception): + """Base exception class for Hyperscan errors.""" + +HyperscanError: TypeAlias = error + +class ArchitectureError(error): + """Unsupported CPU architecture.""" + +class BadAlignError(error): + """A parameter passed to this function was not correctly aligned.""" + +class BadAllocationError(error): + """The memory allocator failed.""" + +class CompilerError(error): + """Pattern compilation failed.""" + +class DatabaseModeError(error): + """The given database was built for a different mode of operation.""" + +class DatabasePlatformError(error): + """The given database was built for a different platform.""" + +class DatabaseVersionError(error): + """The given database was built for a different version of Hyperscan.""" + +class InsufficientSpaceError(error): + """Provided buffer was too small.""" + +class InternalPCREError(error): + """Unexpected internal error.""" + +class InvalidError(error): + """Parameter passed to this function was invalid.""" + +class NoMemoryError(error): + """Memory allocation failed.""" + +class ScratchInUseError(error): + """The scratch region was already in use.""" + +class 
UnknownError(error): + """Unexpected internal error.""" + +class ScanTerminated(error): + """The engine was terminated by callback.""" + +class Scratch: + """Represents Hyperscan 'scratch space. + + Args: + database (:class:`Database`, optional): A database instance. + + """ + + def __init__(self, database: Optional["Database"] = None) -> None: ... + def clone(self) -> "Scratch": + """Clones a scratch space. + + Returns: + :class:`Scratch`: The cloned scratch space. + + """ + def set_database(self, database: "Database") -> None: + """Allocates a scratch with the given database. + + Args: + :obj:`database`: A hyperscan Database. + + """ + +class Stream: + """Provides a context manager for scanning streams of text. + + Args: + database (:class:`Database`): A database initialized with + :const:`HS_MODE_STREAM`. + flags (int, optional): Currently unused. + match_event_handler (callable, optional): The match callback, + which is invoked for each match result, and passed the + expression id, start offset, end offset, flags, and a + context object. + + """ + + def close( + self, + scratch: Optional[Scratch] = None, + match_event_handler: Optional[match_event_callback] = None, + context: Optional[object] = None, + ) -> None: + """Closes the stream. + + Args: + scratch (:class:`Scratch`, optional): Scratch space. + If a **match_event_handler** is provided and a scratch + space is not, the scratch space of the associated + database will be used. + match_event_handler (callable, optional): The match + callback, which is invoked for each match result, and + passed the expression id, start offset, end offset, + flags, and a context object. + context (:obj:`object`, optional): A context object passed + as the last arg to **match_event_handler**. + + """ + def scan( + self, + data: AnyStr, + flags: int = 0, + scratch: Optional[Scratch] = None, + match_event_handler: Optional[match_event_callback] = None, + context: Optional[object] = None, + ) -> None: + """Scans streaming text. 
+ + Args: + data (str): The block of text to scan. + flags (int, optional): Currently unused. + scratch (:obj:`Scratch`, optional): Scratch space. + match_event_handler (callable, optional): The match + callback, which is invoked for each match result, and + passed the expression id, start offset, end offset, + flags, and a context object. + context (object, optional): A context object passed + as the last arg to **match_event_handler**. + + """ + def size(self) -> int: + """Return the size of the stream state in bytes""" + def __enter__(self) -> Self: ... + def __exit__( + self, + exc_type: Optional[Type[BaseException]], + exc_value: Optional[BaseException], + exc_traceback: Optional[TracebackType], + ) -> None: ... + +class Database: + """Represents a Hyperscan database. + + Args: + scratch (:class:`Scratch`, optional): Thread-specific + scratch space. + mode (int, optional): One of :const:`HS_MODE_BLOCK`, + :const:`HS_MODE_STREAM`, or :const:`HS_MODE_VECTORED`. + chimera (bool): Enable Chimera support. + + Attributes: + mode (int): Scanning mode. + chimera (bool): Indicates if Chimera support is enabled. + scratch (:class:`Scratch`): Scratch space. + + """ + + mode: int + chimera: bool + scratch: Scratch + + def __init__( + self, + scratch: Optional[Scratch] = None, + mode: int = HS_MODE_BLOCK, + chimera: bool = False, + ) -> None: ... + def compile( + self, + expressions: Sequence[AnyStr], + ids: Optional[Sequence[int]] = None, + elements: Union[Optional[Sequence[int]], int] = None, + flags: Union[Optional[Sequence[int]], int] = 0, + literal: bool = False, + ext: Optional[Sequence[Tuple[int, int, int, int, int, int]]] = None, + ) -> None: + """Compiles regular expressions + Args: + expressions (sequence of str): A sequence of regular + expressions. + ids (sequence of int, optional): A sequence of + expression identifiers. + elements (int, optional): Length of the expressions + sequence. 
+ flags (sequence of int or int, optional): + Sequence of flags associated with each expression, or a + single value which is applied to all expressions. + literal (bool, optional): If True, uses the pure literal + expression compiler introduced in Hyperscan 5.2.0 + ext (sequence of tuple, optional): A list of tuples used to + define extended behavior for each pattern. Tuples must + contain **flags**, **min_offset**, **max_offset**, + **min_length**, **edit_distance**, and + **hamming_distance**. See hyperscan documentation for + more information. **Note:** this parameter if + **literal** is True + + """ + def info(self) -> str: + """Returns database information. + + Returns: + str: Provides version and platform information for the + database. + + """ + def size(self) -> int: + """Returns the size of the database in bytes. + + Returns: + int: The size of the database in bytes. + + """ + def scan( + self, + data: AnyStr, + match_event_handler: match_event_callback, + flags: int = 0, + context: object = None, + scratch: Optional[Scratch] = None, + ) -> None: + """Scans a block of text. + + Args: + data (str): The block of text to scan, if the database + was opened with streaming or block mode, or a list of + buffers (i.e. :obj:`bytearray`) if the database was + opened with vectored mode. + match_event_handler (callable): The match callback, which is + invoked for each match result, and passed the expression + id, start offset, end offset, flags, and a context + object. + flags (int): Currently unused. + context (object, optional): A context object passed as the + last arg to **match_event_handler**. + scratch (:class:`Scratch`, optional): A scratch object. + + """ + def stream( + self, + match_event_handler: match_event_callback, + flags: int = 0, + context: Optional[object] = None, + ) -> Stream: + """Returns a new stream context manager. 
+ + Args: + match_event_handler (callable, optional): The match callback, + which is invoked for each match result, and passed the + expression id, start offset, end offset, flags, and a + context object. Note that this callback will override + the match event handler defined in the :class:`Database` + instance. + flags (int): Currently unused. + context (object): A context object passed as the last + arg to **match_event_handler** + + """ diff --git a/test_wheels/hyperscan/_version.py b/test_wheels/hyperscan/_version.py new file mode 100644 index 0000000..06643bf --- /dev/null +++ b/test_wheels/hyperscan/_version.py @@ -0,0 +1 @@ +__version__ = "0.7.19" diff --git a/test_wheels/hyperscan/extension.c b/test_wheels/hyperscan/extension.c new file mode 100644 index 0000000..e0e41c1 --- /dev/null +++ b/test_wheels/hyperscan/extension.c @@ -0,0 +1,1542 @@ +#define PY_SSIZE_T_CLEAN +#include +#include +#include +#include +#include +#include +#include + +#define ADD_INT_CONSTANT(module, name) \ + if (PyModule_AddIntConstant(module, #name, name) < 0) { \ + Py_XDECREF(module); \ + return NULL; \ + }; +#define HANDLE_CHIMERA_ERR(err, rv) \ + if (err != CH_SUCCESS) { \ + char serr[80]; \ + sprintf(serr, "error code %i", err); \ + PyGILState_STATE gstate = PyGILState_Ensure(); \ + PyErr_SetString(HyperscanErrors[abs(err)], serr); \ + PyGILState_Release(gstate); \ + return rv; \ + } +#define HANDLE_HYPERSCAN_ERR(err, rv) \ + if (err != HS_SUCCESS) { \ + char serr[80]; \ + sprintf(serr, "error code %i", err); \ + PyGILState_STATE gstate = PyGILState_Ensure(); \ + PyErr_SetString(HyperscanErrors[abs(err)], serr); \ + PyGILState_Release(gstate); \ + return rv; \ + } +#define ADD_HYPERSCAN_ERROR(module, errors, base, name, hs_err, doc) \ + if (PyModule_AddIntConstant(module, #hs_err, hs_err) < 0) { \ + Py_XDECREF(module); \ + return NULL; \ + } \ + PyObject *name = \ + PyErr_NewExceptionWithDoc("hyperscan." 
#name, doc, base, NULL); \ + if (name == NULL) { \ + Py_XDECREF(module); \ + return NULL; \ + } else { \ + if (PyModule_AddObject(module, #name, name) < 0) { \ + Py_XDECREF(module); \ + return NULL; \ + } \ + errors[abs(hs_err)] = name; \ + } + +static PyObject *HyperscanErrors[33] = {NULL}; +static PyObject *HyperscanError; +static PyTypeObject DatabaseType; +static PyTypeObject ScratchType; +static PyTypeObject StreamType; + +typedef struct { + PyObject *callback; + PyObject *ctx; + int success; +} py_scan_callback_ctx; + +typedef struct { + PyObject_HEAD PyObject *scratch; + hs_database_t *hs_db; + ch_database_t *ch_db; + uint32_t mode; + uint32_t chimera; +} Database; + +typedef struct { + PyObject_HEAD hs_stream_t *identifier; + PyObject *database; + PyObject *scratch; + uint32_t flags; + py_scan_callback_ctx *cctx; +} Stream; + +typedef struct { + PyObject_HEAD PyObject *database; + hs_scratch_t *hs_scratch; + ch_scratch_t *ch_scratch; +} Scratch; + +static int hs_match_handler( + unsigned int id, + long long unsigned int from, + long long unsigned int to, + unsigned int flags, + void *context) +{ + py_scan_callback_ctx *cctx = context; + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); + PyObject *rv = PyObject_CallFunction( + cctx->callback, "IIIIO", id, from, to, flags, cctx->ctx); + int halt = 1; + if (rv == NULL) { + cctx->success = 0; + } else { + halt = rv == Py_None ? 
0 : PyObject_IsTrue(rv); + cctx->success = 1; + } + Py_XDECREF(rv); + PyGILState_Release(gstate); + return halt; +} + +static int ch_match_handler( + unsigned int id, + long long unsigned int from, + long long unsigned int to, + unsigned int flags, + unsigned int size, + const ch_capture_t *captured, + void *context) +{ + py_scan_callback_ctx *cctx = context; + PyGILState_STATE gstate; + + gstate = PyGILState_Ensure(); + PyObject *ocapture = NULL; + PyObject *ocaptured = PyList_New((Py_ssize_t)size); + for (unsigned int i = 0; i < size; i++) { + ocapture = Py_BuildValue( + "(I, K, K)", captured[i].flags, captured[i].from, captured[i].to); + PyList_SetItem(ocaptured, i, ocapture); + } + PyObject *rv = PyObject_CallFunction( + cctx->callback, + "IIIIOO", + id, + from, + to, + flags, + (PyObject *)ocaptured, + cctx->ctx); + int halt = 1; + if (rv == NULL) { + cctx->success = 0; + } else { + halt = rv == Py_None ? 0 : PyObject_IsTrue(rv); + cctx->success = 1; + } + Py_XDECREF(rv); + Py_XDECREF(ocaptured); + PyGILState_Release(gstate); + return halt; +} + +static void Database_dealloc(Database *self) +{ + if (self->chimera) { + ch_free_database(self->ch_db); + if (self->scratch != Py_None && self->scratch != NULL) { + ch_scratch_t *scratch = ((Scratch *)self->scratch)->ch_scratch; + if (scratch) + ch_free_scratch(scratch); + } + } else { + hs_free_database(self->hs_db); + if (self->scratch != Py_None && self->scratch != NULL) { + hs_scratch_t *scratch = ((Scratch *)self->scratch)->hs_scratch; + if (scratch) + hs_free_scratch(scratch); + } + } + + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *Database_new( + PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + Database *self; + + self = (Database *)type->tp_alloc(type, 0); + if (self != NULL) { + self->mode = HS_MODE_BLOCK; + self->chimera = 0; + } + + return (PyObject *)self; +} + +static int Database_init(Database *self, PyObject *args, PyObject *kwds) +{ + static char *kwlist[] = {"scratch", 
"mode", "chimera", NULL}; + self->scratch = Py_None; + if (!PyArg_ParseTupleAndKeywords( + args, + kwds, + "|OIp", + kwlist, + &self->scratch, + &self->mode, + &self->chimera)) + return -1; + return 0; +} + +static PyObject *Database_compile( + Database *self, PyObject *args, PyObject *kwds) +{ + PyObject *oexpressions; + PyObject *oflags = Py_None; + PyObject *oflag = Py_None; + PyObject *oids = Py_None; + PyObject *oext = Py_None; + uint32_t literal = 0; + uint64_t elements = 0; + + static char *kwlist[] = { + "expressions", + "ids", + "elements", + "flags", + "literal", + "ext", + NULL, + }; + if (!PyArg_ParseTupleAndKeywords( + args, + kwds, + "O|OKOpO", + kwlist, + &oexpressions, + &oids, + &elements, + &oflags, + &literal, + &oext)) + return NULL; + + if (elements == 0) { + Py_ssize_t expressions_size = PySequence_Size(oexpressions); + if (expressions_size == -1) { + PyErr_SetString(PyExc_TypeError, "expressions must be a sequence"); + return NULL; + } else { + elements = (uint64_t)expressions_size; + } + } + + PyObject *oexpr = NULL; + PyObject *oid = NULL; + const char **expressions; + uint32_t *flags; + uint32_t *ids; + size_t *lens; + uint32_t globalflag; + + if (self->chimera && self->ch_db != NULL) + ch_free_database(self->ch_db); + else if (!self->chimera && self->hs_db != NULL) { + hs_free_database(self->hs_db); + } + + expressions = malloc(elements * sizeof(char *)); + if (expressions == NULL) + goto memory_error; + + flags = malloc(elements * sizeof(uint32_t)); + if (flags == NULL) + goto memory_error; + + ids = malloc(elements * sizeof(uint32_t)); + if (ids == NULL) + goto memory_error; + + globalflag = (oflags == Py_None ? 
0 : PyLong_AsUnsignedLong(oflags)); + + PyErr_Clear(); + + for (uint64_t i = 0; i < elements; i++) { + const char *expression; + uint32_t expr_flags; + uint32_t expr_id; + + oexpr = PySequence_ITEM(oexpressions, i); + + // Handle both bytes and unicode strings + if (PyBytes_Check(oexpr)) { + expression = PyBytes_AsString(oexpr); + } else if (PyUnicode_Check(oexpr)) { + // Convert unicode to UTF-8 bytes + PyObject *temp_bytes = PyUnicode_AsUTF8String(oexpr); + if (temp_bytes == NULL) { + break; + } + expression = PyBytes_AsString(temp_bytes); + // Replace the original object with the encoded version + Py_DECREF(oexpr); + oexpr = temp_bytes; + } else { + PyErr_SetString(PyExc_TypeError, "expressions must be bytes or str"); + break; + } + + if (PyErr_Occurred()) + break; + + if (PyObject_IsTrue(oids)) { + oid = PySequence_ITEM(oids, i); + expr_id = PyLong_AsUnsignedLong(oid); + if (PyErr_Occurred()) + break; + } else { + expr_id = i; + } + + if (PySequence_Check(oflags)) { + oflag = PySequence_ITEM(oflags, i); + if (PyErr_Occurred()) + break; + expr_flags = PyLong_AsUnsignedLong(oflag); + if (PyErr_Occurred()) + break; + } else { + expr_flags = globalflag; + } + + expressions[i] = expression; + ids[i] = expr_id; + flags[i] = expr_flags; + + Py_XDECREF(oexpr); + } + + if (oflag != Py_None) + Py_XDECREF(oflag); + if (oid != Py_None) + Py_XDECREF(oid); + + if (PyErr_Occurred()) { + goto python_error; + } + + struct hs_expr_ext **ext = NULL; + hs_compile_error_t *hs_compile_err; + ch_compile_error_t *ch_compile_err; + + if (literal) { + if (self->chimera) { + PyErr_Format( + PyExc_RuntimeError, + "chimera does not support pure literal expressions"); + goto python_error; + } + lens = malloc(elements * sizeof(size_t)); + if (lens == NULL) + goto memory_error; + for (uint64_t i = 0; i < elements; i++) { + lens[i] = strlen(expressions[i]); + } + hs_error_t hs_err; + Py_BEGIN_ALLOW_THREADS; + hs_err = hs_compile_lit_multi( + expressions, + flags, + ids, + lens, + elements, + 
self->mode, + NULL, + &self->hs_db, + &hs_compile_err); + free(lens); + Py_END_ALLOW_THREADS; + free(expressions); + free(flags); + free(ids); + + if (hs_err != HS_SUCCESS) { + PyErr_Format( + HyperscanError, + "%s (id:%d)", + hs_compile_err->message, + hs_compile_err->expression); + hs_free_compile_error(hs_compile_err); + return NULL; + } + } else { + if (self->chimera) { + ch_error_t ch_err; + Py_BEGIN_ALLOW_THREADS; + ch_err = ch_compile_ext_multi( + expressions, + flags, + ids, + elements, + self->mode, + 0, + 0, + NULL, + &self->ch_db, + &ch_compile_err); + Py_END_ALLOW_THREADS; + free(expressions); + free(flags); + free(ids); + HANDLE_CHIMERA_ERR(ch_err, NULL); + } else { + if (oext != Py_None) { + ext = malloc(elements * sizeof(struct hs_expr_ext *)); + for (uint64_t i = 0; i < elements; i++) { + ext[i] = malloc(sizeof(struct hs_expr_ext)); + PyObject *oext_item = PySequence_GetItem(oext, i); + if (oext_item == NULL) { + PyErr_Format( + PyExc_RuntimeError, "failed to get ext item at index: %d", i); + goto python_error; + } + if (!PyArg_ParseTuple( + oext_item, + "KKKKII", + &(ext[i]->flags), + &(ext[i]->min_offset), + &(ext[i]->max_offset), + &(ext[i]->min_length), + &(ext[i]->edit_distance), + &(ext[i]->hamming_distance))) { + PyErr_SetString(PyExc_TypeError, "invalid ext info"); + Py_XDECREF(oext_item); + goto python_error; + } + Py_XDECREF(oext_item); + } + } + hs_error_t hs_err; + Py_BEGIN_ALLOW_THREADS; + hs_err = hs_compile_ext_multi( + expressions, + flags, + ids, + (const struct hs_expr_ext *const *)ext, + elements, + self->mode, + NULL, + &self->hs_db, + &hs_compile_err); + Py_END_ALLOW_THREADS; + + free(expressions); + free(flags); + free(ids); + if (hs_err != HS_SUCCESS) { + PyErr_SetString(HyperscanError, hs_compile_err->message); + hs_free_compile_error(hs_compile_err); + return NULL; + } + } + free(ext); + } + + if (self->scratch == Py_None) { + self->scratch = + PyObject_CallFunction((PyObject *)&ScratchType, "O", (PyObject *)self, 0); + } + 
+ Scratch *scratch = ((Scratch *)self->scratch); + + if (self->chimera) { + ch_error_t ch_err; + if (scratch->ch_scratch != NULL) { + ch_err = ch_free_scratch(scratch->ch_scratch); + HANDLE_CHIMERA_ERR(ch_err, NULL); + scratch->ch_scratch = NULL; + } + ch_err = ch_alloc_scratch(self->ch_db, &scratch->ch_scratch); + HANDLE_CHIMERA_ERR(ch_err, NULL); + } else { + hs_error_t hs_err; + if (scratch->hs_scratch != NULL) { + hs_err = hs_free_scratch(scratch->hs_scratch); + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + scratch->hs_scratch = NULL; + } + hs_err = hs_alloc_scratch(self->hs_db, &scratch->hs_scratch); + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + } + + Py_RETURN_NONE; + +memory_error: + return PyErr_NoMemory(); + +python_error: + free(expressions); + free(flags); + free(ids); + return NULL; +} + +static PyObject *Database_info(Database *self, PyObject *args) +{ + char *info; + if (self->chimera) { + ch_error_t ch_err = ch_database_info(self->ch_db, &info); + HANDLE_CHIMERA_ERR(ch_err, NULL); + } else { + hs_error_t hs_err = hs_database_info(self->hs_db, &info); + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + } + + PyObject *oinfo = PyBytes_FromString(info); + Py_INCREF(oinfo); + free(info); + return oinfo; +} + +static PyObject *Database_size(Database *self, PyObject *args) +{ + size_t database_size; + if (self->chimera) { + ch_error_t ch_err = ch_database_size(self->ch_db, &database_size); + HANDLE_CHIMERA_ERR(ch_err, NULL); + } else { + hs_error_t hs_err = hs_database_size(self->hs_db, &database_size); + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + } + + PyObject *odatabase_size = PyLong_FromSize_t(database_size); + Py_INCREF(odatabase_size); + return odatabase_size; +} + +static PyObject *Database_scan(Database *self, PyObject *args, PyObject *kwds) +{ + uint32_t flags = 0; + + PyObject *odata; + PyObject *ocallback = Py_None; + PyObject *oscratch = Py_None; + PyObject *octx = Py_None; + + static char *kwlist[] = { + "data", + "match_event_handler", + "flags", + "context", + "scratch", 
+ NULL, + }; + if (!PyArg_ParseTupleAndKeywords( + args, + kwds, + "O|OIOO", + kwlist, + &odata, + &ocallback, + &flags, + &octx, + &oscratch)) + return NULL; + py_scan_callback_ctx cctx = {ocallback, octx, 1}; + + if (self->mode == HS_MODE_VECTORED) { + char **data; + PyObject *fast_seq; + Py_ssize_t num_buffers; + Py_ssize_t *lengths; + + fast_seq = PySequence_Fast(odata, "expected a sequence of buffers"); + num_buffers = PySequence_Fast_GET_SIZE(fast_seq); + data = PyMem_RawMalloc(num_buffers * sizeof(char *)); + lengths = PyMem_RawMalloc(num_buffers * sizeof(Py_ssize_t)); + + for (uint32_t i = 0; i < num_buffers; i++) { + PyObject *o = PySequence_Fast_GET_ITEM(fast_seq, i); + if (!PyObject_CheckBuffer(o)) { + PyErr_SetString( + PyExc_TypeError, "obj doesn't support buffer interface"); + break; + } + + Py_buffer view; + if (PyObject_GetBuffer(o, &view, PyBUF_SIMPLE) != -1) { + data[i] = (char *)view.buf; + lengths[i] = view.len; + } else { + PyErr_SetString(PyExc_BufferError, "failed to get buffer"); + break; + } + PyBuffer_Release(&view); + } + + if (PyErr_Occurred()) { + PyMem_RawFree(data); + PyMem_RawFree(lengths); + Py_XDECREF(fast_seq); + return NULL; + } + + if (self->chimera) { + PyErr_SetString( + PyExc_RuntimeError, "chimera does not support vectored scanning"); + return NULL; + } + + hs_error_t hs_err; + Py_BEGIN_ALLOW_THREADS; + hs_err = hs_scan_vector( + self->hs_db, + (const char *const *)data, + (const uint32_t *)lengths, + num_buffers, + flags, + oscratch == Py_None ? ((Scratch *)self->scratch)->hs_scratch + : ((Scratch *)oscratch)->hs_scratch, + ocallback == Py_None ? NULL : hs_match_handler, + ocallback == Py_None ? 
NULL : (void *)&cctx); + Py_END_ALLOW_THREADS; + PyMem_RawFree(data); + PyMem_RawFree(lengths); + Py_XDECREF(fast_seq); + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + } else { + if (!PyBytes_CheckExact(odata)) { + PyErr_SetString(PyExc_TypeError, "a bytes-like object is required"); + return NULL; + } + + char *data = PyBytes_AsString(odata); + if (data == NULL) + return NULL; + Py_ssize_t length = PyBytes_Size(odata); + + if (self->chimera) { + ch_error_t ch_err; + Py_BEGIN_ALLOW_THREADS; + ch_err = ch_scan( + self->ch_db, + data, + length, + flags, + oscratch == Py_None ? ((Scratch *)self->scratch)->ch_scratch + : ((Scratch *)oscratch)->ch_scratch, + ocallback == Py_None ? NULL : ch_match_handler, + NULL, + ocallback == Py_None ? NULL : (void *)&cctx); + Py_END_ALLOW_THREADS; + if (PyErr_Occurred()) { + return NULL; + } + HANDLE_CHIMERA_ERR(ch_err, NULL); + } else { + hs_error_t hs_err; + Py_BEGIN_ALLOW_THREADS; + hs_err = hs_scan( + self->hs_db, + data, + length, + flags, + oscratch == Py_None ? ((Scratch *)self->scratch)->hs_scratch + : ((Scratch *)oscratch)->hs_scratch, + ocallback == Py_None ? NULL : hs_match_handler, + ocallback == Py_None ? 
NULL : (void *)&cctx); + Py_END_ALLOW_THREADS; + if (PyErr_Occurred()) { + return NULL; + } + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + } + } + if (!cctx.success) { + return NULL; + } + Py_RETURN_NONE; +} + +static PyObject *Database_stream(Database *self, PyObject *args, PyObject *kwds) +{ + uint32_t flags = 0; + PyObject *ocallback = Py_None; + PyObject *octx = Py_None; + static char *kwlist[] = { + "match_event_handler", + "flags", + "context", + NULL, + }; + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "O|IO", kwlist, &ocallback, &flags, &octx)) + return NULL; + PyObject *stream = PyObject_CallFunction( + (PyObject *)&StreamType, "OIOO", (PyObject *)self, flags, ocallback, octx); + if (PyErr_Occurred()) + return NULL; + Py_INCREF(stream); + return stream; +} + +static PyMemberDef Database_members[] = { + {"mode", T_INT, offsetof(Database, mode), 0, "int: Scanning mode."}, + {"scratch", + T_OBJECT_EX, + offsetof(Database, scratch), + 0, + ":class:`Scratch`: Scratch space object."}, + {NULL}}; + +static PyMethodDef Database_methods[] = { + {"compile", + (PyCFunction)Database_compile, + METH_VARARGS | METH_KEYWORDS, + "compile(expressions, ids=None, elements=None, flags=0, literal=False)\n\n" + " Compiles regular expressions.\n\n" + " Args:\n" + " expressions (sequence of str): A sequence of regular\n" + " expressions.\n" + " ids (sequence of int, optional): A sequence of\n" + " expression identifiers.\n" + " elements (int, optional): Length of the expressions\n" + " sequence.\n" + " flags (sequence of int or int, optional):\n" + " Sequence of flags associated with each expression, or a\n" + " single value which is applied to all expressions.\n" + " literal (bool, optional): If True, uses the pure literal\n" + " expression compiler introduced in Hyperscan 5.2.0.\n\n" + " ext (sequence of tuple, optional): A list of tuples used to " + " define extended behavior for each pattern. 
Tuples must \n" + " contain **flags**, **min_offset**, **max_offset**, \n" + " **min_length**, **edit_distance**, and **hamming_distance**.\n" + " See hyperscan documentation for more information.\n\n"}, + {"info", + (PyCFunction)Database_info, + METH_VARARGS, + "info()\n\n" + " Returns database information.\n\n" + " Returns:\n" + " str: Provides version and platform information for the\n" + " database.\n\n"}, + {"size", + (PyCFunction)Database_size, + METH_VARARGS, + "size()\n\n" + " Returns the size of the database in bytes.\n\n" + " Returns:\n" + " int: The size of the database in bytes.\n\n"}, + {"scan", + (PyCFunction)Database_scan, + METH_VARARGS | METH_KEYWORDS, + "scan(data, match_event_handler, flags=0, context=None, scratch=None)\n\n" + " Scans a block of text.\n\n" + " Args:\n" + " data (str): The block of text to scan, if the database\n" + " was opened with streaming or block mode, or a list of\n" + " buffers (i.e. :obj:`bytearray`) if the database was\n" + " opened with vectored mode.\n" + " match_event_handler (callable): The match callback, which is\n" + " invoked for each match result, and passed the expression\n" + " id, start offset, end offset, flags, and a context object.\n" + " flags (int): Currently unused.\n" + " context (:obj:`object`): A context object passed as the last\n" + " arg to **match_event_handler**.\n" + " scratch (:class:`Scratch`): A scratch object.\n\n"}, + {"stream", + (PyCFunction)Database_stream, + METH_VARARGS | METH_KEYWORDS, + "stream(match_event_handler=None, flags=0, context=None)\n\n" + " Returns a new stream context manager.\n\n" + " Args:\n" + " match_event_handler (callable, optional): The match callback,\n" + " which is invoked for each match result, and passed the\n" + " expression id, start offset, end offset, flags, and a\n" + " context object. 
Note that this callback will override\n" + " the match event handler defined in the\n" + " :class:`Database` instance.\n" + " flags (int): Currently unused.\n" + " context (:obj:`object`): A context object passed as the last\n" + " arg to **match_event_handler**.\n\n"}, + {NULL}}; + +static PyTypeObject DatabaseType = { + PyVarObject_HEAD_INIT(NULL, 0) "hyperscan.Database", /* tp_name */ + sizeof(Database), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Database_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_reserved */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Database(scratch=None, mode=0)\n\n" + " Represents a Hyperscan database.\n\n" + " Args:\n" + " scratch (:class:`Scratch`, optional): Thread-specific\n" + " scratch space.\n" + " mode (int, optional): One of :const:`HS_MODE_BLOCK`,\n" + " :const:`HS_MODE_STREAM`, or :const:`HS_MODE_VECTORED`.\n" + " chimera (bool): Enable Chimera support." 
+ "\n\n", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Database_methods, /* tp_methods */ + Database_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Database_init, /* tp_init */ + 0, /* tp_alloc */ + Database_new, /* tp_new */ +}; + +static void Stream_dealloc(Stream *self) +{ + if (self->cctx != NULL) + free(self->cctx); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *Stream_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + Stream *self; + + self = (Stream *)type->tp_alloc(type, 0); + if (self != NULL) { + self->flags = 0; + self->database = Py_None; + self->scratch = Py_None; + } + + return (PyObject *)self; +} + +static int Stream_init(Stream *self, PyObject *args, PyObject *kwds) +{ + static char *kwlist[] = { + "database", + "flags", + "match_event_handler", + "context", + "scratch", + NULL, + }; + self->cctx = malloc(sizeof(py_scan_callback_ctx)); + if (!PyArg_ParseTupleAndKeywords( + args, + kwds, + "O|IOOO!", + kwlist, + &self->database, + &self->flags, + &self->cctx->callback, + &self->cctx->ctx, + &self->scratch, + &ScratchType)) + return -1; + if (!PyObject_IsInstance(self->database, (PyObject *)&DatabaseType)) { + PyErr_SetString( + PyExc_TypeError, "database must be a hyperscan.Database instance"); + return -1; + } + return 0; +} + +static PyObject *Stream_close(Stream *self, PyObject *args, PyObject *kwds) +{ + py_scan_callback_ctx cctx; + PyObject *oscratch = Py_None, *ocallback = Py_None, *octx = Py_None; + static char *kwlist[] = {"scratch", "match_event_handler", "context", NULL}; + if (!PyArg_ParseTupleAndKeywords( + args, + kwds, + "|O!OO", + kwlist, + &oscratch, + &ScratchType, + &ocallback, + &octx)) + return NULL; + Database *db = (Database *)self->database; + Scratch *scratch; + if 
(PyObject_Not(oscratch)) + oscratch = ((Database *)self->database)->scratch; + cctx.callback = PyObject_IsTrue(ocallback) ? ocallback : self->cctx->callback; + cctx.ctx = PyObject_IsTrue(octx) ? octx : self->cctx->ctx; + if (PyObject_IsTrue(oscratch) && cctx.callback != NULL) + scratch = (Scratch *)oscratch; + else + scratch = (Scratch *)db->scratch; + + hs_scratch_t *hs_scratch = scratch->hs_scratch; + hs_error_t hs_err = hs_close_stream( + self->identifier, hs_scratch, hs_match_handler, (void *)&cctx); + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + + Py_RETURN_NONE; +} + +static long Stream_len(PyObject *self) +{ + size_t stream_size; + Stream *stream = (Stream *)self; + Database *db = (Database *)stream->database; + if (db->chimera) { + PyErr_SetString(PyExc_RuntimeError, "chimera does not support streams"); + return 0; + } + hs_error_t err = hs_stream_size(db->hs_db, &stream_size); + HANDLE_HYPERSCAN_ERR(err, 0); + return stream_size; +} + +static PyObject *Stream_enter(Stream *self) +{ + Stream *stream = (Stream *)self; + Database *db = (Database *)stream->database; + if (db->chimera) { + PyErr_SetString(PyExc_RuntimeError, "chimera does not support streams"); + return NULL; + } + hs_error_t err = hs_open_stream(db->hs_db, 0, &self->identifier); + HANDLE_HYPERSCAN_ERR(err, NULL); + return (PyObject *)self; +} + +static PyObject *Stream_exit(Stream *self) +{ + PyObject_CallMethod((PyObject *)self, "close", NULL); + if (PyErr_Occurred()) + return NULL; + Py_RETURN_NONE; +} + +static PyObject *Stream_scan(Stream *self, PyObject *args, PyObject *kwds) +{ + char *data; + Py_ssize_t length; + uint32_t flags; + PyObject *ocallback = Py_None, *octx = Py_None, *oscratch = Py_None; + + static char *kwlist[] = { + "data", "flags", "scratch", "match_event_handler", "context", NULL}; + if (!PyArg_ParseTupleAndKeywords( + args, + kwds, + "s#|IOOO", + kwlist, + &data, + &length, + &flags, + &oscratch, + &ocallback, + &octx)) + return NULL; + + if (PyObject_Not(ocallback)) + 
ocallback = self->cctx->callback; + if (PyObject_Not(octx)) + octx = self->cctx->ctx; + + Database *db = (Database *)self->database; + Scratch *scratch; + + if (PyObject_Not(oscratch)) + scratch = (Scratch *)db->scratch; + else { + if (!PyObject_IsInstance(oscratch, (PyObject *)&ScratchType)) { + PyErr_SetString( + PyExc_TypeError, "scratch must be a hyperscan.Scratch instance"); + return NULL; + } + scratch = (Scratch *)oscratch; + } + + py_scan_callback_ctx cctx = {ocallback, octx}; + + if (db->chimera) { + PyErr_SetString(PyExc_RuntimeError, "chimera does not support streams"); + return NULL; + } else { + hs_error_t hs_err; + Py_BEGIN_ALLOW_THREADS; + hs_err = hs_scan_stream( + self->identifier, + data, + length, + flags, + scratch->hs_scratch, + ocallback == Py_None ? NULL : hs_match_handler, + ocallback == Py_None ? NULL : (void *)&cctx); + Py_END_ALLOW_THREADS; + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + } + + Py_RETURN_NONE; +} + +static PyMemberDef Stream_members[] = { + {"database", + T_OBJECT_EX, + offsetof(Stream, database), + 0, + ":class:`Database`: Database instance."}, + {"flags", T_INT, offsetof(Stream, flags), 0, "int: Stream flags."}, + {NULL}}; + +static PyMethodDef Stream_methods[] = { + {"__enter__", (PyCFunction)Stream_enter, METH_NOARGS}, + {"__exit__", (PyCFunction)Stream_exit, METH_VARARGS}, + {"close", + (PyCFunction)Stream_close, + METH_VARARGS | METH_KEYWORDS, + "close(scratch=None, match_event_handler=None, context=None)\n\n" + " Closes the stream.\n\n" + " Args:\n" + " scratch (:class:`Scratch`, optional): Scratch space.\n" + " If a **match_event_handler** is provided and a scratch\n" + " space is not, the scratch space of the associated\n" + " database will be used.\n" + " match_event_handler (callable, optional): The match \n" + " callback, which is invoked for each match result, and\n" + " passed the expression id, start offset, end offset,\n" + " flags, and a context object.\n" + " context (:obj:`object`, optional): A context object 
passed\n" + " as the last arg to **match_event_handler**.\n\n"}, + {"scan", + (PyCFunction)Stream_scan, + METH_VARARGS | METH_KEYWORDS, + "scan(data, flags=0, scratch=None, match_event_handler=None, " + "context=None)\n\n" + " Scans streaming text.\n\n" + " Args:\n" + " data (str): The block of text to scan.\n" + " flags (int, optional): Currently unused.\n" + " scratch (:obj:`Scratch`, optional): Scratch space.\n" + " match_event_handler (callable, optional): The match \n" + " callback, which is invoked for each match result, and\n" + " passed the expression id, start offset, end offset,\n" + " flags, and a context object.\n" + " context (:obj:`object`, optional): A context object passed\n" + " as the last arg to **match_event_handler**.\n\n"}, + {"size", + (PyCFunction)Stream_len, + METH_NOARGS, + "Return the size of the stream state in bytes."}, + {NULL}}; + +static PySequenceMethods Stream_sequence_methods = { + Stream_len, /* sq_length */ +}; + +static PyTypeObject StreamType = { + PyVarObject_HEAD_INIT(NULL, 0) "hyperscan.Stream", /* tp_name */ + sizeof(Stream), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)Stream_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_reserved */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + &Stream_sequence_methods, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + "Stream(database=None, flags=0, match_event_handler=None)\n\n" + " Provides a context manager for scanning streams of text.\n\n" + " Args:\n" + " database (:class:`Database`): A database initialized with\n" + " :const:`HS_MODE_STREAM`.\n" + " flags (int, optional): Currently unused.\n" + " match_event_handler (callable, optional): The match callback,\n" + " which is invoked for each match result, and passed the\n" + " expression 
id, start offset, end offset, flags, and a\n" + " context object." + "\n\n", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + Stream_methods, /* tp_methods */ + Stream_members, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)Stream_init, /* tp_init */ + 0, /* tp_alloc */ + Stream_new, /* tp_new */ +}; + +static void Scratch_dealloc(Scratch *self) +{ + if (self->hs_scratch != NULL) + hs_free_scratch(self->hs_scratch); + if (self->ch_scratch != NULL) + ch_free_scratch(self->ch_scratch); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *Scratch_set_database( + Scratch *self, PyObject *args, PyObject *kwds) +{ + static char *kwlist[] = {"database", NULL}; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O", kwlist, &self->database)) + return NULL; + if (self->hs_scratch != NULL || self->ch_scratch != NULL) { + PyErr_SetString(HyperscanError, "scratch objects cannot be re-allocated"); + return NULL; + } + Database *db = (Database *)self->database; + if (db->chimera) { + ch_database_t *ch_db = db->ch_db; + ch_error_t ch_err = ch_alloc_scratch(ch_db, &self->ch_scratch); + HANDLE_CHIMERA_ERR(ch_err, NULL); + } else { + hs_database_t *hs_db = db->hs_db; + hs_error_t hs_err = hs_alloc_scratch(hs_db, &self->hs_scratch); + HANDLE_HYPERSCAN_ERR(hs_err, NULL); + } + Py_RETURN_NONE; +} + +static int Scratch_init(Scratch *self, PyObject *args, PyObject *kwds) +{ + static char *kwlist[] = {"database", NULL}; + self->database = Py_None; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &self->database)) + return -1; + if (self->database != Py_None && !Scratch_set_database(self, args, kwds)) + return -1; + return 0; +} + +static PyObject *Scratch_clone(Scratch *self) +{ + PyObject *odest = PyObject_CallFunction((PyObject *)&ScratchType, 
NULL);
+  Scratch *dest = (Scratch *)odest;
+  bool chimera;
+  if (self->database == Py_None) {
+    // XXX: Assume chimera mode is false if no db
+    chimera = false;
+  } else {
+    chimera = ((Database *)self->database)->chimera;
+  }
+
+  if (chimera) {
+    ch_error_t ch_err = ch_clone_scratch(self->ch_scratch, &dest->ch_scratch);
+    HANDLE_CHIMERA_ERR(ch_err, NULL);
+  } else {
+    hs_error_t hs_err = hs_clone_scratch(self->hs_scratch, &dest->hs_scratch);
+    HANDLE_HYPERSCAN_ERR(hs_err, NULL);
+  }
+
+  return odest;
+}
+
+/* Attributes exposed directly from the Scratch struct: only `database`,
+ * read through T_OBJECT_EX with no access-restricting flags. */
+static PyMemberDef Scratch_members[] = {
+  {"database",
+   T_OBJECT_EX,
+   offsetof(Scratch, database),
+   0,
+   ":class:`Database`: The database associated with this scratch space."},
+  {NULL}};
+
+/* Method table for Scratch: `clone()` takes no arguments, while
+ * `set_database()` accepts positional and keyword arguments. */
+static PyMethodDef Scratch_methods[] = {
+  {"clone",
+   (PyCFunction)Scratch_clone,
+   METH_NOARGS,
+   "clone()\n\n"
+   " Clones a scratch space.\n\n"
+   " Returns:\n"
+   " :class:`Scratch`: The cloned scratch space.\n\n"},
+  {"set_database",
+   (PyCFunction)Scratch_set_database,
+   METH_VARARGS | METH_KEYWORDS,
+   "set_database(database)\n\n"
+   " Allocates a scratch with the given database.\n\n"},
+  {NULL}};
+
+/* Type object for `hyperscan.Scratch`. The type is subclassable
+ * (Py_TPFLAGS_BASETYPE). The initializer stops at tp_init, so tp_new is
+ * left NULL here and is assigned in PyInit__hs_ext(). */
+static PyTypeObject ScratchType = {
+  PyVarObject_HEAD_INIT(NULL, 0) "hyperscan.Scratch", /* tp_name */
+  sizeof(Scratch),                           /* tp_basicsize */
+  0,                                         /* tp_itemsize */
+  (destructor)Scratch_dealloc,               /* tp_dealloc */
+  0,                                         /* tp_print */
+  0,                                         /* tp_getattr */
+  0,                                         /* tp_setattr */
+  0,                                         /* tp_reserved */
+  0,                                         /* tp_repr */
+  0,                                         /* tp_as_number */
+  0,                                         /* tp_as_sequence */
+  0,                                         /* tp_as_mapping */
+  0,                                         /* tp_hash */
+  0,                                         /* tp_call */
+  0,                                         /* tp_str */
+  0,                                         /* tp_getattro */
+  0,                                         /* tp_setattro */
+  0,                                         /* tp_as_buffer */
+  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,  /* tp_flags */
+  "Scratch(database=None)\n\n"
+  " Represents Hyperscan 'scratch space.'\n\n"
+  " Args:\n"
+  " database (:class:`Database`, optional): A database instance."
+  "\n\n",                                    /* tp_doc */
+  0,                                         /* tp_traverse */
+  0,                                         /* tp_clear */
+  0,                                         /* tp_richcompare */
+  0,                                         /* tp_weaklistoffset */
+  0,                                         /* tp_iter */
+  0,                                         /* tp_iternext */
+  Scratch_methods,                           /* tp_methods */
+  Scratch_members,                           /* tp_members */
+  0,                                         /* tp_getset */
+  0,                                         /* tp_base */
+  0,                                         /* tp_dict */
+  0,                                         /* tp_descr_get */
+  0,                                         /* tp_descr_set */
+  0,                                         /* tp_dictoffset */
+  (initproc)Scratch_init,                    /* tp_init */
+};
+
+/*
+ * dumpb(database) -> bytes
+ *
+ * Serializes a non-chimera Database with hs_serialize_database() and returns
+ * the result as a new bytes object.
+ *
+ * Raises RuntimeError for chimera databases, whatever HANDLE_HYPERSCAN_ERR
+ * raises when serialization fails, and HyperscanError when the bytes object
+ * cannot be created. Returns NULL with an exception set in all those cases.
+ */
+static PyObject *dumpb(PyObject *self, PyObject *args, PyObject *kwds)
+{
+  Database *db;
+  char *buf;
+  size_t length;
+  static char *kwlist[] = {"database", NULL};
+
+  if (!PyArg_ParseTupleAndKeywords(
+        args, kwds, "O!", kwlist, &DatabaseType, &db))
+    return NULL;
+  if (db->chimera) {
+    PyErr_SetString(
+      PyExc_RuntimeError, "chimera does not support serialization");
+    return NULL;
+  }
+  hs_error_t err = hs_serialize_database(db->hs_db, &buf, &length);
+  // Used throughout this file as an early exit on error; nothing has been
+  // allocated yet if it fires here.
+  HANDLE_HYPERSCAN_ERR(err, NULL);
+  PyObject *bytes = PyBytes_FromStringAndSize(buf, length);
+  // The bytes object holds its own copy, so the serialized buffer is released
+  // unconditionally. Previously it leaked when the allocation above failed.
+  // NOTE(review): the buffer comes from Hyperscan's misc allocator; confirm
+  // that PyMem_Free is the matching deallocator for how this file configures
+  // it.
+  PyMem_Free(buf);
+  if (!bytes) {
+    PyErr_SetString(HyperscanError, "failed to serialize database");
+    return NULL;
+  }
+  return bytes;
+}
+
+/*
+ * loadb(buf, mode) -> Database
+ *
+ * Creates a new Database, records `mode` on it and fills it by deserializing
+ * the bytes object `buf` with hs_deserialize_database().
+ *
+ * Raises TypeError when `buf` is not a bytes object and whatever
+ * HANDLE_HYPERSCAN_ERR raises when deserialization fails. Returns a new
+ * reference on success, NULL with an exception set otherwise.
+ */
+static PyObject *loadb(PyObject *self, PyObject *args, PyObject *kwds)
+{
+  PyObject *obuf = Py_None;
+  unsigned int mode = 0;
+  static char *kwlist[] = {"buf", "mode", NULL};
+
+  // Validate the arguments before allocating anything, so the failure paths
+  // below have nothing to release.
+  if (!PyArg_ParseTupleAndKeywords(args, kwds, "OI", kwlist, &obuf, &mode))
+    return NULL;
+  if (!PyBytes_Check(obuf)) {
+    PyErr_SetString(PyExc_TypeError, "buf must be a bytestring");
+    return NULL;
+  }
+
+  PyObject *odb = PyObject_CallFunctionObjArgs((PyObject *)&DatabaseType, NULL);
+  if (odb == NULL)
+    return NULL;  // constructor failed; it has already set the exception
+  Database *db = (Database *)odb;
+  // NOTE(review): assumes Database.mode is an unsigned-int-compatible field,
+  // as implied by the original "I" format target; confirm.
+  db->mode = mode;
+
+  Py_ssize_t length = PyBytes_Size(obuf);
+  char *buf = PyBytes_AsString(obuf);
+  hs_error_t err = hs_deserialize_database(buf, length, &db->hs_db);
+  if (err != HS_SUCCESS) {
+    // Drop the half-initialised Database before the macro below leaves the
+    // function; it was leaked on this path before.
+    Py_CLEAR(odb);
+  }
+  HANDLE_HYPERSCAN_ERR(err, NULL);
+  if (PyErr_Occurred()) {
+    Py_XDECREF(odb);
+    return NULL;
+  }
+  return odb;
+}
+
+/* Module-level functions exported by the extension. */
+static PyMethodDef HyperscanMethods[] = {
+  {"dumpb",
+   (PyCFunction)dumpb,
+   METH_VARARGS | METH_KEYWORDS,
+   "dumpb(database)\n"
+   " Serializes a Hyperscan database.\n\n"
+   " Args:\n"
+   " database (:class:`Database`): A Hyperscan database.\n\n"
+   " Returns:\n"
+   " bytes: A serialized representation of the database.\n\n"},
+  {"loadb",
+   (PyCFunction)loadb,
+   METH_VARARGS | METH_KEYWORDS,
+   "loadb(buf, mode)\n"
+   " Deserializes a Hyperscan database.\n\n"
+   " Args:\n"
+   " buf (:obj:`bytes`): A serialized Hyperscan database.\n"
+   " mode (int): The expected mode of the database.\n\n"
+   " Returns:\n"
+   " :class:`Database`: The deserialized database instance.\n\n"},
+  {NULL}};
+
+/* Definition of the `_hs_ext` module; m_size is -1, so the module keeps its
+ * state in globals. */
+static struct PyModuleDef hyperscanmodule = {
+  PyModuleDef_HEAD_INIT,
+  "_hs_ext",
+  "Hyperscan bindings for CPython.",
+  -1,
+  HyperscanMethods,
+};
+
+/*
+ * Module initialisation: creates `_hs_ext`, registers the integer constants,
+ * the base exception (under both `error` and `HyperscanError`), the
+ * per-error-code exception classes, the Database/Scratch/Stream types and
+ * `__version__`. Returns the module, or NULL after releasing it on failure.
+ */
+PyMODINIT_FUNC PyInit__hs_ext(void)
+{
+  PyObject *m;
+
+  m = PyModule_Create(&hyperscanmodule);
+  if (m == NULL)
+    return NULL;
+
+  ADD_INT_CONSTANT(m, CH_BAD_ALIGN);
+  ADD_INT_CONSTANT(m, CH_BAD_ALLOC);
+  ADD_INT_CONSTANT(m, CH_COMPILER_ERROR);
+  ADD_INT_CONSTANT(m, CH_DB_MODE_ERROR);
+  ADD_INT_CONSTANT(m, CH_DB_PLATFORM_ERROR);
+  ADD_INT_CONSTANT(m, CH_DB_VERSION_ERROR);
+  ADD_INT_CONSTANT(m, CH_FAIL_INTERNAL);
+  ADD_INT_CONSTANT(m, CH_FLAG_CASELESS);
+  ADD_INT_CONSTANT(m, CH_FLAG_DOTALL);
+  ADD_INT_CONSTANT(m, CH_FLAG_MULTILINE);
+  ADD_INT_CONSTANT(m, CH_FLAG_SINGLEMATCH);
+  ADD_INT_CONSTANT(m, CH_FLAG_UCP);
+  ADD_INT_CONSTANT(m, CH_FLAG_UTF8);
+  ADD_INT_CONSTANT(m, CH_INVALID);
+  ADD_INT_CONSTANT(m, CH_MODE_GROUPS);
+  ADD_INT_CONSTANT(m, CH_MODE_NOGROUPS);
+  ADD_INT_CONSTANT(m, CH_NOMEM);
+  ADD_INT_CONSTANT(m, CH_SCAN_TERMINATED);
+  ADD_INT_CONSTANT(m, CH_SCRATCH_IN_USE);
+  ADD_INT_CONSTANT(m, CH_SUCCESS);
+  ADD_INT_CONSTANT(m, HS_CPU_FEATURES_AVX2);
+  ADD_INT_CONSTANT(m, HS_EXT_FLAG_EDIT_DISTANCE);
+  ADD_INT_CONSTANT(m, HS_EXT_FLAG_HAMMING_DISTANCE);
+  ADD_INT_CONSTANT(m, HS_EXT_FLAG_MAX_OFFSET);
+  ADD_INT_CONSTANT(m, HS_EXT_FLAG_MIN_LENGTH);
+  ADD_INT_CONSTANT(m, HS_EXT_FLAG_MIN_OFFSET);
+  ADD_INT_CONSTANT(m, HS_FLAG_ALLOWEMPTY);
+  ADD_INT_CONSTANT(m, HS_FLAG_CASELESS);
+  ADD_INT_CONSTANT(m, HS_FLAG_COMBINATION);
+  ADD_INT_CONSTANT(m, HS_FLAG_DOTALL);
+  ADD_INT_CONSTANT(m, HS_FLAG_MULTILINE);
+  ADD_INT_CONSTANT(m, HS_FLAG_PREFILTER);
+  ADD_INT_CONSTANT(m, HS_FLAG_QUIET);
+  ADD_INT_CONSTANT(m, HS_FLAG_SINGLEMATCH);
+  ADD_INT_CONSTANT(m, HS_FLAG_SOM_LEFTMOST);
+  ADD_INT_CONSTANT(m, HS_FLAG_UCP);
+  ADD_INT_CONSTANT(m, HS_FLAG_UTF8);
+  ADD_INT_CONSTANT(m, HS_MODE_BLOCK);
+  ADD_INT_CONSTANT(m, HS_MODE_NOSTREAM);
+  ADD_INT_CONSTANT(m, HS_MODE_SOM_HORIZON_LARGE);
+  ADD_INT_CONSTANT(m, HS_MODE_SOM_HORIZON_MEDIUM);
+  ADD_INT_CONSTANT(m, HS_MODE_SOM_HORIZON_SMALL);
+  ADD_INT_CONSTANT(m, HS_MODE_STREAM);
+  ADD_INT_CONSTANT(m, HS_MODE_VECTORED);
+  ADD_INT_CONSTANT(m, HS_OFFSET_PAST_HORIZON);
+  ADD_INT_CONSTANT(m, HS_SCRATCH_IN_USE);
+  ADD_INT_CONSTANT(m, HS_SUCCESS);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_BDW);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_GENERIC);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_GLM);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_HSW);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_IVB);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_SKL);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_SKX);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_SLM);
+  ADD_INT_CONSTANT(m, HS_TUNE_FAMILY_SNB);
+
+  HyperscanError = PyErr_NewExceptionWithDoc(
+    "hyperscan.error",
+    "Base exception class for Hyperscan errors.",
+    NULL,
+    NULL);
+  if (!HyperscanError) {
+    goto cleanup_module;
+  }
+  // PyModule_AddObject steals a reference on success, so one extra reference
+  // is taken for each name the exception is published under; the global
+  // keeps the original one.
+  Py_XINCREF(HyperscanError);
+  if (PyModule_AddObject(m, "error", HyperscanError) < 0) {
+    Py_XDECREF(HyperscanError);
+    Py_CLEAR(HyperscanError);
+    goto cleanup_module;
+  }
+  // The second registration needs its own reference. Without it the module
+  // ends up holding two entries backed by a single reference.
+  Py_XINCREF(HyperscanError);
+  if (PyModule_AddObject(m, "HyperscanError", HyperscanError) < 0) {
+    Py_XDECREF(HyperscanError);
+    Py_CLEAR(HyperscanError);
+    goto cleanup_module;
+  }
+
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    InvalidError,
+    HS_INVALID,
+    "Parameter passed to this function was invalid.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    NoMemoryError,
+    HS_NOMEM,
+    "Memory allocation failed.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    CompilerError,
+    HS_COMPILER_ERROR,
+    "Pattern compilation failed.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    ScanTerminated,
+    HS_SCAN_TERMINATED,
+    "The engine was terminated by callback.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    DatabaseVersionError,
+    HS_DB_VERSION_ERROR,
+    "The given database was built for a different version of Hyperscan.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    DatabasePlatformError,
+    HS_DB_PLATFORM_ERROR,
+    "The given database was built for a different platform (i.e., CPU type).");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    DatabaseModeError,
+    HS_DB_MODE_ERROR,
+    "The given database was built for a different mode of operation.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    BadAlignError,
+    HS_BAD_ALIGN,
+    "A parameter passed to this function was not correctly aligned.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    BadAllocationError,
+    HS_BAD_ALLOC,
+    "The memory allocator failed.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    ScratchInUseError,
+    HS_SCRATCH_IN_USE,
+    "The scratch region was already in use.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    ArchitectureError,
+    HS_ARCH_ERROR,
+    "Unsupported CPU architecture.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    InsufficientSpaceError,
+    HS_INSUFFICIENT_SPACE,
+    "Provided buffer was too small.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    UnknownError,
+    HS_UNKNOWN_ERROR,
+    "Unexpected internal error.");
+  ADD_HYPERSCAN_ERROR(
+    m,
+    HyperscanErrors,
+    HyperscanError,
+    InternalPCREError,
+    CH_FAIL_INTERNAL,
+    "Unexpected internal error.");
+
+  // tp_new has to be in place before PyType_Ready() so the type is finalised
+  // with its constructor slot already filled in.
+  ScratchType.tp_new = PyType_GenericNew;
+  if (
+    (PyType_Ready(&DatabaseType) < 0) || (PyType_Ready(&ScratchType) < 0) ||
+    (PyType_Ready(&StreamType) < 0)) {
+    goto cleanup_module;
+  }
+
+  Py_XINCREF(&DatabaseType);
+  if (PyModule_AddObject(m, "Database", (PyObject *)&DatabaseType) < 0) {
+    Py_XDECREF(&DatabaseType);
+    goto cleanup_module;
+  }
+
+  Py_XINCREF(&ScratchType);
+  if (PyModule_AddObject(m, "Scratch", (PyObject *)&ScratchType) < 0) {
+    Py_XDECREF(&ScratchType);
+    goto cleanup_module;
+  }
+
+  Py_XINCREF(&StreamType);
+  if (PyModule_AddObject(m, "Stream", (PyObject *)&StreamType) < 0) {
+    Py_XDECREF(&StreamType);
+    goto cleanup_module;
+  }
+
+  if (PyModule_AddStringConstant(m, "__version__", hs_version()) < 0) {
+    goto cleanup_module;
+  }
+
+  return m;
+
+cleanup_module:
+  Py_DECREF(m);
+  return NULL;
+}
diff --git a/test_wheels/hyperscan/py.typed b/test_wheels/hyperscan/py.typed
new file mode 100644
index 0000000..e69de29
diff --git a/test_without_flags.py b/test_without_flags.py
new file mode 100644
index 0000000..bdba0dd
--- /dev/null
+++ b/test_without_flags.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+
+import hyperscan
+
+print(f'hyperscan version: {hyperscan.__version__}')
+
+# Same patterns from GitHub issue #207 but WITHOUT the problematic flags
+bla = [r'السلام عليكم\s<\/span>'.encode('utf8'),
+       r'ועליכום הסلאם\s<\/span>'.encode('utf8')]
+
+print(f'Testing patterns: {bla}')
+
+print('\n=== Testing WITH problematic flags (should fail) ===')
+try:
+    rules_db = hyperscan.Database()
+    rules_db.compile(expressions=bla,
+                     flags=hyperscan.HS_FLAG_UTF8 | hyperscan.HS_FLAG_UCP)
+    print('SUCCESS: Patterns compiled with HS_FLAG_UTF8 | HS_FLAG_UCP!')
+except Exception as e:
+    print(f'FAILED: {e}')
+
+print('\n=== Testing WITHOUT flags (should work) ===')
+try:
+    rules_db = hyperscan.Database()
+    rules_db.compile(expressions=bla)
+    print('SUCCESS: Patterns compiled without flags!')
+except Exception as e:
+    print(f'FAILED: {e}')
+
+print('\n=== Testing with unicode strings (should work) ===')
+try:
+    unicode_patterns = [r'السلام عليكم\s<\/span>',
+                        r'ועליכום הסلאם\s<\/span>']
+    rules_db = hyperscan.Database()
+    rules_db.compile(expressions=unicode_patterns)
+    print('SUCCESS: Unicode patterns compiled without flags!')
+except Exception as e:
+    print(f'FAILED: {e}')
\ No newline at end of file