From ab9159387cdd28f216799b76f43fa2b6ae372b69 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 25 Aug 2025 13:34:39 -0700 Subject: [PATCH 1/7] Copy pre-commit-hooks "Standard hooks" from pybind11/.pre-commit-config.yaml --- .pre-commit-config.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db3968dbd6..29c6f999d8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,6 +39,23 @@ repos: pass_filenames: false always_run: true + # Standard hooks + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: "v5.0.0" + hooks: + - id: check-added-large-files + - id: check-case-conflict + - id: check-docstring-first + - id: check-merge-conflict + - id: check-symlinks + - id: check-toml + - id: check-yaml + - id: debug-statements + - id: end-of-file-fixer + - id: mixed-line-ending + - id: requirements-txt-fixer + - id: trailing-whitespace + # Checking for common mistakes - repo: https://github.com/pre-commit/pygrep-hooks rev: "v1.10.0" From f955667855c43e6a48192262ac7439d704c8b6c3 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 25 Aug 2025 13:37:00 -0700 Subject: [PATCH 2/7] =?UTF-8?q?Automatic=20whitespace=20fixes=20=E2=80=94?= =?UTF-8?q?=20NO=20manual=20changes.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/ISSUE_TEMPLATE/bug_report.yml | 1 - .github/ISSUE_TEMPLATE/doc_request.yml | 1 - .github/PULL_REQUEST_TEMPLATE.md | 1 - .github/actions/doc_preview/action.yml | 10 ++--- .github/actions/install_unix_deps/action.yml | 2 +- .github/workflows/release-upload.yml | 2 +- SECURITY.md | 1 - ci/tools/download-wheels | 8 ++-- cuda_bindings/LICENSE | 36 +++++++-------- .../cuda/bindings/_bindings/cydriver.pxd.in | 1 - .../cuda/bindings/_bindings/cydriver.pyx.in | 8 ++-- .../cuda/bindings/_bindings/cynvrtc.pxd.in | 1 - .../cuda/bindings/_lib/cyruntime/utils.pyx.in | 2 +- cuda_bindings/cuda/bindings/_lib/utils.pxi.in | 2 +- cuda_bindings/cuda/bindings/cydriver.pxd.in | 2 +- cuda_bindings/cuda/bindings/cynvrtc.pxd.in | 1 - cuda_bindings/cuda/bindings/cyruntime.pxd.in | 2 +- .../cuda/bindings/cyruntime_functions.pxi.in | 1 - cuda_bindings/cuda/bindings/driver.pyx.in | 3 +- cuda_bindings/cuda/bindings/runtime.pyx.in | 3 +- .../docs/source/environment_variables.rst | 1 - cuda_bindings/docs/source/install.rst | 2 +- cuda_bindings/docs/source/module/driver.rst | 26 +++++------ .../docs/source/module/nvjitlink.rst | 2 +- cuda_bindings/docs/source/module/nvrtc.rst | 7 ++- cuda_bindings/docs/source/module/runtime.rst | 44 +++++++++---------- cuda_bindings/docs/source/motivation.rst | 14 +++--- cuda_bindings/docs/source/overview.rst | 39 ++++++++-------- .../docs/source/release/11.6.0-notes.rst | 1 - .../docs/source/release/12.9.X-notes.rst | 2 +- cuda_bindings/docs/source/tips_and_tricks.rst | 6 +-- cuda_bindings/pyproject.toml | 10 ++--- .../core/experimental/_kernel_arg_handler.pyx | 2 +- cuda_core/cuda/core/experimental/_memory.pyx | 10 ++--- .../cuda/core/experimental/_memoryview.pyx | 36 +++++++-------- .../_templates/autosummary/dataclass.rst | 1 - .../_templates/autosummary/namedtuple.rst | 2 +- cuda_core/docs/source/getting-started.rst | 6 +-- cuda_core/docs/source/install.rst | 4 +- cuda_core/docs/source/release/0.1.1-notes.rst | 2 +- cuda_core/docs/source/release/0.3.2-notes.rst | 2 +- cuda_core/pyproject.toml | 6 +-- .../cython/test_get_cuda_native_handle.pyx | 2 +- cuda_python/LICENSE | 36 +++++++-------- cuda_python/docs/source/release.rst | 1 - 45 files changed, 168 insertions(+), 184 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 09752a7293..4574e04bf3 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -113,4 +113,3 @@ body: +-------------------------------+----------------------+----------------------+ validations: required: false - diff --git a/.github/ISSUE_TEMPLATE/doc_request.yml b/.github/ISSUE_TEMPLATE/doc_request.yml index 26a7faeace..7804a6c85a 100644 --- a/.github/ISSUE_TEMPLATE/doc_request.yml +++ b/.github/ISSUE_TEMPLATE/doc_request.yml @@ -41,4 +41,3 @@ body: label: If this is a correction, please provide a link to the incorrect documentation. If this is a new documentation request, please link to where you have looked. placeholder: | https://nvidia.github.io/cuda-python/latest/ - diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index afc43be22f..aa51259a90 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -11,4 +11,3 @@ closes - [ ] New or existing tests cover these changes. - [ ] The documentation is up to date with these changes. - diff --git a/.github/actions/doc_preview/action.yml b/.github/actions/doc_preview/action.yml index 61531428a7..6948522daa 100644 --- a/.github/actions/doc_preview/action.yml +++ b/.github/actions/doc_preview/action.yml @@ -18,7 +18,7 @@ runs: using: composite steps: # The steps below are executed only when testing in a PR. - # Note: the PR previews will be removed once merged to main (see below) + # Note: the PR previews will be removed once merged to main (see below) - name: Deploy doc preview if: ${{ github.ref_name != 'main' }} uses: JamesIves/github-pages-deploy-action@6c2d9db40f9296374acc17b90404b6e8864128c8 # v4.7.3 @@ -28,7 +28,7 @@ runs: folder: ${{ inputs.source-folder }} target-folder: docs/pr-preview/pr-${{ inputs.pr-number }}/ commit-message: "Deploy doc preview for PR ${{ inputs.pr-number }} (${{ github.sha }})" - + - name: Leave a comment after deployment if: ${{ github.ref_name != 'main' }} uses: marocchino/sticky-pull-request-comment@67d0dec7b07ed060a405f9b2a64b8ab319fdd7db # v2.9.2 @@ -43,8 +43,8 @@ runs: |
https://nvidia.github.io/cuda-python/pr-preview/pr-${{ inputs.pr-number }}/cuda-core/
|
https://nvidia.github.io/cuda-python/pr-preview/pr-${{ inputs.pr-number }}/cuda-bindings/

|

Preview will be ready when the GitHub Pages deployment is complete.

- - # The steps below are executed only when building on main. + + # The steps below are executed only when building on main. - name: Remove doc preview if: ${{ github.ref_name == 'main' }} uses: JamesIves/github-pages-deploy-action@6c2d9db40f9296374acc17b90404b6e8864128c8 # v4.7.3 @@ -54,7 +54,7 @@ runs: folder: ${{ inputs.source-folder }} target-folder: docs/pr-preview/pr-${{ inputs.pr-number }}/ commit-message: "Clean up doc preview for PR ${{ inputs.pr-number }} (${{ github.sha }})" - + - name: Leave a comment after removal if: ${{ github.ref_name == 'main' }} uses: marocchino/sticky-pull-request-comment@67d0dec7b07ed060a405f9b2a64b8ab319fdd7db # v2.9.2 diff --git a/.github/actions/install_unix_deps/action.yml b/.github/actions/install_unix_deps/action.yml index 645b761cf8..6289541c90 100644 --- a/.github/actions/install_unix_deps/action.yml +++ b/.github/actions/install_unix_deps/action.yml @@ -10,7 +10,7 @@ inputs: dependencies: required: true type: string - dependent_exes: + dependent_exes: required: true type: string diff --git a/.github/workflows/release-upload.yml b/.github/workflows/release-upload.yml index f7d6306fc2..8ae08c5026 100644 --- a/.github/workflows/release-upload.yml +++ b/.github/workflows/release-upload.yml @@ -78,7 +78,7 @@ jobs: run: | # Use the shared script to download wheels ./ci/tools/download-wheels "${{ inputs.run-id }}" "${{ inputs.component }}" "${{ github.repository }}" "release/wheels" - + # Upload wheels to the release if [[ -d "release/wheels" && $(ls -A release/wheels 2>/dev/null | wc -l) -gt 0 ]]; then echo "Uploading wheels to release ${{ inputs.git-tag }}" diff --git a/SECURITY.md b/SECURITY.md index b72f0d67ad..4283541556 100755 --- a/SECURITY.md +++ b/SECURITY.md @@ -33,4 +33,3 @@ information. For all security-related concerns, please visit NVIDIA's Product Security portal at . - diff --git a/ci/tools/download-wheels b/ci/tools/download-wheels index 05509bfc0a..a3141afb33 100755 --- a/ci/tools/download-wheels +++ b/ci/tools/download-wheels @@ -49,18 +49,18 @@ do if [[ ! -d "$p" ]]; then continue fi - + # exclude cython test artifacts if [[ "${p}" == *-tests ]]; then echo "Skipping test artifact: $p" continue fi - + # If we're not downloading "all", only process matching component if [[ "$COMPONENT" != "all" && "$p" != ${COMPONENT}* ]]; then continue fi - + echo "Processing artifact: $p" # Move wheel files to output directory if [[ -d "$p" ]]; then @@ -72,4 +72,4 @@ done rm -rf cuda-* echo "Downloaded wheels to: $OUTPUT_DIR" -ls -la "$OUTPUT_DIR" \ No newline at end of file +ls -la "$OUTPUT_DIR" diff --git a/cuda_bindings/LICENSE b/cuda_bindings/LICENSE index b7d042fcee..a5a65097cd 100644 --- a/cuda_bindings/LICENSE +++ b/cuda_bindings/LICENSE @@ -2,46 +2,46 @@ NVIDIA SOFTWARE LICENSE This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA CUDA Python software and materials provided hereunder ("SOFTWARE"). -This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. +This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. -2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: -a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. -b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. +2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: +a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. +b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. -b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. -c. You may not modify or create derivative works of any portion of the SOFTWARE. +b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. +c. You may not modify or create derivative works of any portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. -f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. -g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. +f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. +g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. -4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. +4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. - -6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + +6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. 7. FEEDBACK. You may, but don't have to, provide to NVIDIA any Feedback. "Feedback" means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. -8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. +8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + +9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. -9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. +10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. -10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. +11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. -11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. -12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. - 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury's Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. -14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. +14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in index 50701f70d1..b8a5ee5abf 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in @@ -2348,4 +2348,3 @@ cdef CUresult _cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource* pCudaReso cdef CUresult _cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource* pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} - diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in index 7fc86b565e..efbc678bd9 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in @@ -512,7 +512,7 @@ cdef int cuPythonInit() except -1 nogil: {{else}} path = 'libcuda.so.1' {{endif}} - + {{if 'Windows' == platform.system()}} LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 try: @@ -524,7 +524,7 @@ cdef int cuPythonInit() except -1 nogil: if (handle == NULL): raise RuntimeError('Failed to dlopen ' + path) {{endif}} - + # Get latest __cuGetProcAddress_v2 global __cuGetProcAddress_v2 {{if 'Windows' == platform.system()}} @@ -535,7 +535,7 @@ cdef int cuPythonInit() except -1 nogil: {{else}} __cuGetProcAddress_v2 = dlfcn.dlsym(handle, 'cuGetProcAddress_v2') {{endif}} - + # Load using cuGetProcAddress if available if __cuGetProcAddress_v2 != NULL: _F_cuGetProcAddress_v2 = <__cuGetProcAddress_v2_T>__cuGetProcAddress_v2 @@ -2765,7 +2765,7 @@ cdef int cuPythonInit() except -1 nogil: __cuPythonInit = True return 0 - + {{if 'Windows' == platform.system()}} # Load using win32GetAddr if usePTDS: diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in index 148530a86a..355b19f48e 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in @@ -123,4 +123,3 @@ cdef nvrtcResult _nvrtcGetPCHHeapSizeRequired(nvrtcProgram prog, size_t* size) e cdef nvrtcResult _nvrtcSetFlowCallback(nvrtcProgram prog, void* callback, void* payload) except ?NVRTC_ERROR_INVALID_INPUT nogil {{endif}} - diff --git a/cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pyx.in b/cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pyx.in index 292d19cb9c..82c508a912 100644 --- a/cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pyx.in +++ b/cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pyx.in @@ -675,7 +675,7 @@ cdef cudaError_t getRuntimeEglFrame(cudaEglFrame *eglFrame, cydriver.CUeglFrame cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_2020 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_2020 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_709 or - cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709 or + cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_2020 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_ER or diff --git a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in index 0a9f2e4e30..7f7e0666c7 100644 --- a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in +++ b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in @@ -628,7 +628,7 @@ cdef class _HelperCUcoredumpSettings: {{if 'CU_COREDUMP_ENABLE_USER_TRIGGER' in found_values}}cydriver.CUcoredumpSettings_enum.CU_COREDUMP_ENABLE_USER_TRIGGER,{{endif}}): if self._is_getter == False: self._bool = init_value - + self._cptr = &self._bool self._size = 1 else: diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in index e3fe2f8810..8696e8df47 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in @@ -5163,4 +5163,4 @@ cdef enum: RESOURCE_ABI_EXTERNAL_BYTES = 48 cdef enum: MAX_PLANES = 3 -cdef enum: CUDA_EGL_INFINITE_TIMEOUT = 4294967295 \ No newline at end of file +cdef enum: CUDA_EGL_INFINITE_TIMEOUT = 4294967295 diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in index 7a392687dd..85a5e23663 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in @@ -150,4 +150,3 @@ cdef nvrtcResult nvrtcGetPCHHeapSizeRequired(nvrtcProgram prog, size_t* size) ex cdef nvrtcResult nvrtcSetFlowCallback(nvrtcProgram prog, void* callback, void* payload) except ?NVRTC_ERROR_INVALID_INPUT nogil {{endif}} - diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in index bd0bc3d5f3..17ac2318dd 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in @@ -1956,4 +1956,4 @@ cdef enum: CUDART_VERSION = 13000 cdef enum: __CUDART_API_VERSION = 13000 -cdef enum: CUDA_EGL_MAX_PLANES = 3 \ No newline at end of file +cdef enum: CUDA_EGL_MAX_PLANES = 3 diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in index 14230f1a26..4c2c5c0598 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in @@ -1480,4 +1480,3 @@ cdef extern from "cuda_profiler_api.h": cudaError_t cudaProfilerStop() nogil {{endif}} - diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index 975153c58e..15f5ba6497 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -50737,7 +50737,7 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value): @cython.embedsignature(True) def cuGetExportTable(pExportTableId : Optional[CUuuid]): - """ + """ Parameters ---------- @@ -54071,4 +54071,3 @@ cdef int _add_native_handle_getters() except?-1: {{endif}} return 0 _add_native_handle_getters() - diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index f17436058d..0729a4eeca 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -19177,7 +19177,7 @@ def cudaIpcOpenMemHandle(handle not None : cudaIpcMemHandle_t, unsigned int flag Notes ----- - No guarantees are made about the address returned in `*devPtr`. + No guarantees are made about the address returned in `*devPtr`. In particular, multiple processes may not receive the same address for the same `handle`. """ cdef void_ptr devPtr = 0 @@ -38015,4 +38015,3 @@ cdef int _add_native_handle_getters() except?-1: {{endif}} return 0 _add_native_handle_getters() - diff --git a/cuda_bindings/docs/source/environment_variables.rst b/cuda_bindings/docs/source/environment_variables.rst index c582fe57b3..a212bfe764 100644 --- a/cuda_bindings/docs/source/environment_variables.rst +++ b/cuda_bindings/docs/source/environment_variables.rst @@ -18,4 +18,3 @@ Build-Time Environment Variables - ``CUDA_PYTHON_PARSER_CACHING`` : bool, toggles the caching of parsed header files during the cuda-bindings build process. If caching is enabled (``CUDA_PYTHON_PARSER_CACHING`` is True), the cache path is set to ./cache_, where is derived from the cuda toolkit libraries used to build cuda-bindings. - ``CUDA_PYTHON_PARALLEL_LEVEL`` (previously ``PARALLEL_LEVEL``) : int, sets the number of threads used in the compilation of extension modules. Not setting it or setting it to 0 would disable parallel builds. - diff --git a/cuda_bindings/docs/source/install.rst b/cuda_bindings/docs/source/install.rst index b9335b4876..b5181c6a35 100644 --- a/cuda_bindings/docs/source/install.rst +++ b/cuda_bindings/docs/source/install.rst @@ -35,7 +35,7 @@ Install all optional dependencies with: Where the optional dependencies include: -* ``nvidia-cuda-nvrtc`` (NVRTC runtime compilation library) +* ``nvidia-cuda-nvrtc`` (NVRTC runtime compilation library) * ``nvidia-nvjitlink`` (nvJitLink library) * ``nvidia-nvvm`` (NVVM library) * ``nvidia-cufile`` (cuFile library, Linux only) diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index 04e0390d12..8a4ba15ff8 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -2318,7 +2318,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_CACHE_MODE - Specifies whether to enable caching explicitly (-dlcm) + Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied :py:obj:`~.CUjit_cacheMode_enum`. @@ -3256,7 +3256,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT - Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. + Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set). @@ -3282,9 +3282,9 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION - Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. @@ -3292,11 +3292,11 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT - Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. + Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set). @@ -3304,11 +3304,11 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. @@ -3322,13 +3322,13 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. Valid values for :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled). diff --git a/cuda_bindings/docs/source/module/nvjitlink.rst b/cuda_bindings/docs/source/module/nvjitlink.rst index ff9bb1ea52..6eda063074 100644 --- a/cuda_bindings/docs/source/module/nvjitlink.rst +++ b/cuda_bindings/docs/source/module/nvjitlink.rst @@ -9,7 +9,7 @@ nvjitlink Note ---- -The nvjitlink bindings are not supported on nvJitLink installations <12.3. Ensure the installed CUDA toolkit's nvJitLink version is >=12.3. +The nvjitlink bindings are not supported on nvJitLink installations <12.3. Ensure the installed CUDA toolkit's nvJitLink version is >=12.3. Functions --------- diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst index 079cd39aad..d8bd6df229 100644 --- a/cuda_bindings/docs/source/module/nvrtc.rst +++ b/cuda_bindings/docs/source/module/nvrtc.rst @@ -232,7 +232,7 @@ Generate line-number information. - - ``--dopt=on``\ + - ``--dopt=on``\ Enable device code optimization. When specified along with ``-G``\ , enables limited debug information generation for optimized device code (currently, only line number information). When ``-G``\ is not specified, ``-dopt=on``\ is implicit. @@ -260,7 +260,7 @@ Specify the fast-compile level for device code, which controls the tradeoff betw - - ``--ptxas-options=``\ + - ``--ptxas-options=``\ Specify options directly to ptxas, the PTX optimizing assembler. @@ -308,7 +308,7 @@ For single-precision floating-point square root, use IEEE round-to-nearest mode - - Default: ``true``\ + - Default: ``true``\ @@ -783,4 +783,3 @@ Enable stack canaries in device code. Stack canaries make it more difficult to e - ``--fdevice-time-trace=``\ (``-fdevice-time-trace=``\ ) Enables the time profiler, outputting a JSON file based on given . Results can be analyzed on chrome://tracing for a flamegraph visualization. - diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index d155f85ebc..9ac22f660b 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -281,7 +281,7 @@ This section describes the unified addressing functions of the CUDA runtime appl -CUDA devices can share a unified address space with the host. +CUDA devices can share a unified address space with the host. For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below). @@ -307,7 +307,7 @@ Unified addressing is automatically enabled in 64-bit processes . It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cudaPointerGetAttributes() -Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. +Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value. @@ -321,7 +321,7 @@ Since pointers are unique, it is not necessary to specify information about the All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified. -The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. +The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. @@ -345,7 +345,7 @@ Upon enabling direct access from a device that supports unified addressing to an -Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. +Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. @@ -671,7 +671,7 @@ Note that the use of multiple ::CUcontext s per device within a single process w If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections. -The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. +The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. To use the pointer query and peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features. @@ -4697,7 +4697,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitCacheMode - Specifies whether to enable caching explicitly (-dlcm) + Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied :py:obj:`~.cudaJit_CacheMode`. @@ -5199,9 +5199,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUpload - Automatically upload the graph after instantiation. Only supported by + Automatically upload the graph after instantiation. Only supported by - :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the + :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the stream provided in `instantiateParams`. @@ -5209,9 +5209,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch - Instantiate the graph to be launchable from the device. This flag can only + Instantiate the graph to be launchable from the device. This flag can only - be used on platforms which support unified addressing. This flag cannot be + be used on platforms which support unified addressing. This flag cannot be used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch. @@ -5281,7 +5281,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). @@ -5307,9 +5307,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension - Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. @@ -5317,11 +5317,11 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). @@ -5329,11 +5329,11 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. @@ -5347,13 +5347,13 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. Valid values for :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled). diff --git a/cuda_bindings/docs/source/motivation.rst b/cuda_bindings/docs/source/motivation.rst index afbd3412d4..433cc16619 100644 --- a/cuda_bindings/docs/source/motivation.rst +++ b/cuda_bindings/docs/source/motivation.rst @@ -9,7 +9,7 @@ What is CUDA Python? NVIDIA’s CUDA Python provides `Cython `_ bindings and Python wrappers for the driver and runtime API for existing toolkits and libraries to simplify GPU-based accelerated processing. Python is one of the most popular -programming languages for science, engineering, data analytics, and deep +programming languages for science, engineering, data analytics, and deep learning applications. The goal of CUDA Python is to unify the Python ecosystem with a single set of interfaces that provide full coverage of and access to the CUDA host APIs from Python. @@ -25,18 +25,18 @@ science, and AI. `Anaconda `_ that can compile Python code for execution on CUDA-capable GPUs, provides Python developers with an easy entry into GPU-accelerated computing and a path for using increasingly sophisticated CUDA -code with a minimum of new syntax and jargon. Numba has its own CUDA driver API -bindings that can now be replaced with CUDA Python. With CUDA Python and Numba, +code with a minimum of new syntax and jargon. Numba has its own CUDA driver API +bindings that can now be replaced with CUDA Python. With CUDA Python and Numba, you get the best of both worlds: rapid iterative development with Python and the speed of a compiled language targeting both CPUs and NVIDIA GPUs. `CuPy `_ is a `NumPy `_/`SciPy `_ compatible Array library, from `Preferred Networks `_, for -GPU-accelerated computing with Python. CUDA Python simplifies the CuPy build -and allows for a faster and smaller memory footprint when importing the CuPy -Python module. In the future, when more CUDA Toolkit libraries are supported, -CuPy will have a lighter maintenance overhead and have fewer wheels to +GPU-accelerated computing with Python. CUDA Python simplifies the CuPy build +and allows for a faster and smaller memory footprint when importing the CuPy +Python module. In the future, when more CUDA Toolkit libraries are supported, +CuPy will have a lighter maintenance overhead and have fewer wheels to release. Users benefit from a faster CUDA runtime! Our goal is to help unify the Python CUDA ecosystem with a single standard set diff --git a/cuda_bindings/docs/source/overview.rst b/cuda_bindings/docs/source/overview.rst index 0f32032528..fdef83639b 100644 --- a/cuda_bindings/docs/source/overview.rst +++ b/cuda_bindings/docs/source/overview.rst @@ -60,7 +60,7 @@ The following code snippet lets us validate each API call and raise exceptions i return nvrtc.nvrtcGetErrorString(error)[1] else: raise RuntimeError('Unknown error type: {}'.format(error)) - + def checkCudaErrors(result): if result[0].value: raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) @@ -105,22 +105,22 @@ the program is compiled to target our local compute capability architecture with # Initialize CUDA Driver API checkCudaErrors(driver.cuInit(0)) - + # Retrieve handle for device 0 cuDevice = checkCudaErrors(driver.cuDeviceGet(0)) - + # Derive target architecture for device 0 major = checkCudaErrors(driver.cuDeviceGetAttribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)) minor = checkCudaErrors(driver.cuDeviceGetAttribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)) arch_arg = bytes(f'--gpu-architecture=compute_{major}{minor}', 'ascii') - + # Create program prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], [])) - + # Compile program opts = [b"--fmad=false", arch_arg] checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, 2, opts)) - + # Get PTX from compilation ptxSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog)) ptx = b" " * ptxSize @@ -158,11 +158,11 @@ and from the device: NUM_THREADS = 512 # Threads per block NUM_BLOCKS = 32768 # Blocks per grid - + a = np.array([2.0], dtype=np.float32) n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32) bufferSize = n * a.itemsize - + hX = np.random.rand(n).astype(dtype=np.float32) hY = np.random.rand(n).astype(dtype=np.float32) hOut = np.zeros(n).astype(dtype=np.float32) @@ -182,9 +182,9 @@ by calling ``XX.ctypes.data`` for the associated XX: dXclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) dYclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) dOutclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) - + stream = checkCudaErrors(driver.cuStreamCreate(0)) - + checkCudaErrors(driver.cuMemcpyHtoDAsync( dXclass, hX.ctypes.data, bufferSize, stream )) @@ -233,7 +233,7 @@ Now the kernel can be launched: args.ctypes.data, # kernel arguments 0, # extra (ignore) )) - + checkCudaErrors(driver.cuMemcpyDtoHAsync( hOut.ctypes.data, dOutclass, bufferSize, stream )) @@ -304,7 +304,7 @@ maximize performance ({numref}``Figure 1``). .. figure:: _static/images/Nsight-Compute-CLI-625x473.png :name: Figure 1 - + Screenshot of Nsight Compute CLI output of ``cuda.bindings`` example. Preparing kernel arguments @@ -331,7 +331,7 @@ Let's use the following kernel definition as an example: typedef struct { int value; } testStruct; - + extern "C" __global__ void testkernel(int i, int *pi, float f, float *pf, @@ -347,7 +347,7 @@ The first step is to create array objects with types corresponding to your kerne .. list-table:: Correspondence between NumPy types and kernel types. :header-rows: 1 - + * - NumPy type - Corresponding kernel types - itemsize (bytes) @@ -410,12 +410,12 @@ Putting it all together: # Define a custom type testStruct = np.dtype([("value", np.int32)], align=True) - + # Allocate device memory pInt = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.int32).itemsize)) pFloat = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.float32).itemsize)) pStruct = checkCudaErrors(cudart.cudaMalloc(testStruct.itemsize)) - + # Collect all input kernel arguments into a single tuple for further processing kernelValues = ( np.array(1, dtype=np.uint32), @@ -470,12 +470,12 @@ For this example the result becomes: # Define a custom type class testStruct(ctypes.Structure): _fields_ = [("value", ctypes.c_int)] - + # Allocate device memory pInt = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(ctypes.c_int))) pFloat = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(ctypes.c_float))) pStruct = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(testStruct))) - + # Collect all input kernel arguments into a single tuple for further processing kernelValues = ( 1, @@ -531,7 +531,7 @@ For this example, lets use the ``transformKernel`` from `examples/0_Introduction ... } """ - + def main(): ... d_data = checkCudaErrors(cudart.cudaMalloc(size)) @@ -565,4 +565,3 @@ For ctypes, we leverage the special handling of ``None`` type since each Python None, ) kernelArgs = (kernelValues, kernelTypes) - diff --git a/cuda_bindings/docs/source/release/11.6.0-notes.rst b/cuda_bindings/docs/source/release/11.6.0-notes.rst index d7907df847..bcc8944e1f 100644 --- a/cuda_bindings/docs/source/release/11.6.0-notes.rst +++ b/cuda_bindings/docs/source/release/11.6.0-notes.rst @@ -79,4 +79,3 @@ CUDA Functions Not Supported in this Release - cudaVDPAUSetVDPAUDevice .. note:: Deprecated APIs are removed from tracking - diff --git a/cuda_bindings/docs/source/release/12.9.X-notes.rst b/cuda_bindings/docs/source/release/12.9.X-notes.rst index 3be22a695a..a3a640f531 100644 --- a/cuda_bindings/docs/source/release/12.9.X-notes.rst +++ b/cuda_bindings/docs/source/release/12.9.X-notes.rst @@ -18,4 +18,4 @@ Highlights Known issues ------------ -* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. \ No newline at end of file +* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. diff --git a/cuda_bindings/docs/source/tips_and_tricks.rst b/cuda_bindings/docs/source/tips_and_tricks.rst index cc666ca275..1a77eb53fd 100644 --- a/cuda_bindings/docs/source/tips_and_tricks.rst +++ b/cuda_bindings/docs/source/tips_and_tricks.rst @@ -27,9 +27,9 @@ All of the Python classes do not manage the lifetime of the underlying CUDA C ob Getting and setting attributes of extension types ================================================= -While the bindings outwardly present the attributes of extension types in a pythonic way, they can't always be interacted with in a Pythonic style. Often the getters/setters (__getitem__(), __setitem__()) are actually a translation step to convert values between Python and C. For example, in some cases, attempting to modify an attribute in place, will lead to unexpected behavior due to the design of the underlying implementation. For this reason, users should use the getters and setters directly when interacting with extension types. +While the bindings outwardly present the attributes of extension types in a pythonic way, they can't always be interacted with in a Pythonic style. Often the getters/setters (__getitem__(), __setitem__()) are actually a translation step to convert values between Python and C. For example, in some cases, attempting to modify an attribute in place, will lead to unexpected behavior due to the design of the underlying implementation. For this reason, users should use the getters and setters directly when interacting with extension types. -An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. +An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. .. code-block:: python @@ -37,7 +37,7 @@ An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. cfg.numAttrs += 1 attr = cuda.CUlaunchAttribute() - + ... # This works. We are passing the new attribute to the setter diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 1a91b44b83..2637f0b0e7 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -84,10 +84,10 @@ select = [ ] ignore = [ - "UP006", - "UP007", + "UP006", + "UP007", "E741", # ambiguous variable name such as I - "B007", # rename unsued loop variable to _name + "B007", # rename unsued loop variable to _name "UP035" # UP006, UP007, UP035 complain about deprecated Typing. use, but disregard backward compatibility of python version ] @@ -103,13 +103,13 @@ exclude = ["cuda/bindings/_version.py"] ] "tests/**/*" = [ - "E722", + "E722", "UP022", "E402", # module level import not at top of file "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example) "benchmarks/**/*" = [ - "E722", + "E722", "UP022", "E402", # module level import not at top of file "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example) diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx index e735630386..0bb40bf404 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx @@ -215,7 +215,7 @@ cdef class ParamHolder: if isinstance(arg.handle, int): # see note below on handling int arguments prepare_arg[intptr_t](self.data, self.data_addresses, arg.handle, i) - continue + continue else: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index ddcf7665e1..3eb80875e8 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -137,14 +137,14 @@ cdef class Buffer: """ if stream is None: raise ValueError("stream must be provided") - + cdef size_t src_size = self._size - + if dst is None: if self._mr is None: raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") dst = self._mr.allocate(src_size, stream) - + cdef size_t dst_size = dst._size if dst_size != src_size: raise ValueError( @@ -168,10 +168,10 @@ cdef class Buffer: """ if stream is None: raise ValueError("stream must be provided") - + cdef size_t dst_size = self._size cdef size_t src_size = src._size - + if src_size != dst_size: raise ValueError( f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 9d24133052..ea8fb01b67 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -29,20 +29,20 @@ cdef class StridedMemoryView: This object supports both DLPack (up to v1.0) and CUDA Array Interface (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol first, then the CAI protocol. A :obj:`BufferError` is raised if neither is - supported. - + supported. + Since either way would take a consumer stream, for DLPack it is passed to - ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a + ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a stream order will be established between the consumer stream and the - producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if - ``cudaStreamWaitEvent`` is called by this method. - - To opt-out of the stream ordering operation in either DLPack or CAI, - please pass ``stream_ptr=-1``. Note that this deviates (on purpose) + producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if + ``cudaStreamWaitEvent`` is called by this method. + + To opt-out of the stream ordering operation in either DLPack or CAI, + please pass ``stream_ptr=-1``. Note that this deviates (on purpose) from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core`` - does not encourage using the (legacy) default/null stream, but is + does not encourage using the (legacy) default/null stream, but is consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be - internally passed to ``obj.__dlpack__()`` instead. + internally passed to ``obj.__dlpack__()`` instead. Attributes ---------- @@ -79,16 +79,16 @@ cdef class StridedMemoryView: bint is_device_accessible bint readonly object exporting_obj - - # If using dlpack, this is a strong reference to the result of - # obj.__dlpack__() so we can lazily create shape and strides from - # it later. If using CAI, this is a reference to the source + + # If using dlpack, this is a strong reference to the result of + # obj.__dlpack__() so we can lazily create shape and strides from + # it later. If using CAI, this is a reference to the source # `__cuda_array_interface__` object. cdef object metadata # The tensor object if has obj has __dlpack__, otherwise must be NULL cdef DLTensor *dl_tensor - + # Memoized properties cdef tuple _shape cdef tuple _strides @@ -110,7 +110,7 @@ cdef class StridedMemoryView: if self._shape is None and self.exporting_obj is not None: if self.dl_tensor != NULL: self._shape = cuda_utils.carray_int64_t_to_tuple( - self.dl_tensor.shape, + self.dl_tensor.shape, self.dl_tensor.ndim ) else: @@ -127,7 +127,7 @@ cdef class StridedMemoryView: if self.dl_tensor != NULL: if self.dl_tensor.strides: self._strides = cuda_utils.carray_int64_t_to_tuple( - self.dl_tensor.strides, + self.dl_tensor.strides, self.dl_tensor.ndim ) else: @@ -140,7 +140,7 @@ cdef class StridedMemoryView: self._strides, i, strides[i] // itemsize ) self._strides_init = True - return self._strides + return self._strides @property def dtype(self) -> Optional[numpy.dtype]: diff --git a/cuda_core/docs/source/_templates/autosummary/dataclass.rst b/cuda_core/docs/source/_templates/autosummary/dataclass.rst index 5931265888..efb115c831 100644 --- a/cuda_core/docs/source/_templates/autosummary/dataclass.rst +++ b/cuda_core/docs/source/_templates/autosummary/dataclass.rst @@ -10,4 +10,3 @@ {% block methods %} .. automethod:: __init__ {% endblock %} - diff --git a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst index 1695beef0a..7ee8a09a1c 100644 --- a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst +++ b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst @@ -8,4 +8,4 @@ .. autoclass:: {{ objname }} :members: __new__ :special-members: __new__ - :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ \ No newline at end of file + :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index 502ea66375..47f8d193a9 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -60,7 +60,7 @@ Don't forget to use :meth:`Device.set_current`! import cupy as cp from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch - + dev = Device() dev.set_current() s = dev.create_stream() @@ -82,14 +82,14 @@ We're using `CuPy `_ arrays as inputs for this example, but y .. code-block:: python ker = mod.get_kernel("vector_add") - + # Prepare input/output arrays (using CuPy) size = 50000 rng = cp.random.default_rng() a = rng.random(size, dtype=cp.float32) b = rng.random(size, dtype=cp.float32) c = cp.empty_like(a) - + # Configure launch parameters block = 256 grid = (size + block - 1) // block diff --git a/cuda_core/docs/source/install.rst b/cuda_core/docs/source/install.rst index 8bc1faa0e1..3241f563f7 100644 --- a/cuda_core/docs/source/install.rst +++ b/cuda_core/docs/source/install.rst @@ -10,10 +10,10 @@ Runtime Requirements ``cuda.core`` is supported on all platforms that CUDA is supported. Specific dependencies are as follows: -.. list-table:: +.. list-table:: :header-rows: 1 - * - + * - - CUDA 11 - CUDA 12 * - CUDA Toolkit\ [#f1]_ diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index 5434d726ea..f9ac2b5ccd 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -2,7 +2,7 @@ .. SPDX-License-Identifier: Apache-2.0 .. currentmodule:: cuda.core.experimental - + ``cuda.core`` 0.1.1 Release Notes ================================= diff --git a/cuda_core/docs/source/release/0.3.2-notes.rst b/cuda_core/docs/source/release/0.3.2-notes.rst index 8b4763ed31..b1b087dbb7 100644 --- a/cuda_core/docs/source/release/0.3.2-notes.rst +++ b/cuda_core/docs/source/release/0.3.2-notes.rst @@ -25,7 +25,7 @@ None. New features ------------ -- :class:`Stream` and :class:`Event` can be subclassed now. +- :class:`Stream` and :class:`Event` can be subclassed now. New examples diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 3506ce0255..76e312b0d7 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -97,10 +97,10 @@ select = [ ] ignore = [ - "UP006", - "UP007", + "UP006", + "UP007", "E741", # ambiguous variable name such as I - "B007", # rename unsued loop variable to _name + "B007", # rename unsued loop variable to _name "UP035" # UP006, UP007, UP035 complain about deprecated Typing. use, but disregard backward compatibility of python version ] diff --git a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx index d1764d3bba..0c3921e925 100644 --- a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx +++ b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx @@ -23,7 +23,7 @@ cdef extern from "utility.hpp": def test_get_cuda_native_handle(): dev = Device(0) dev.set_current() - + s = dev.create_stream() cdef pyCUstream s_py = s.handle cdef CUstream s_c = get_cuda_native_handle(s_py) diff --git a/cuda_python/LICENSE b/cuda_python/LICENSE index b7d042fcee..a5a65097cd 100644 --- a/cuda_python/LICENSE +++ b/cuda_python/LICENSE @@ -2,46 +2,46 @@ NVIDIA SOFTWARE LICENSE This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA CUDA Python software and materials provided hereunder ("SOFTWARE"). -This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. +This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. -2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: -a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. -b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. +2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: +a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. +b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. -b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. -c. You may not modify or create derivative works of any portion of the SOFTWARE. +b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. +c. You may not modify or create derivative works of any portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. -f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. -g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. +f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. +g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. -4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. +4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. - -6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + +6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. 7. FEEDBACK. You may, but don't have to, provide to NVIDIA any Feedback. "Feedback" means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. -8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. +8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + +9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. -9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. +10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. -10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. +11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. -11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. -12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. - 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury's Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. -14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. +14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. diff --git a/cuda_python/docs/source/release.rst b/cuda_python/docs/source/release.rst index c97e508c45..9e7a66a522 100644 --- a/cuda_python/docs/source/release.rst +++ b/cuda_python/docs/source/release.rst @@ -17,4 +17,3 @@ Release Notes 12.6.1 11.8.7 11.8.6 - From e582ec8d429adef4838784e1650c2f081ea442ef Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 25 Aug 2025 14:07:50 -0700 Subject: [PATCH 3/7] =?UTF-8?q?Revert=20"Automatic=20whitespace=20fixes=20?= =?UTF-8?q?=E2=80=94=20NO=20manual=20changes."?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f955667855c43e6a48192262ac7439d704c8b6c3. --- .github/ISSUE_TEMPLATE/bug_report.yml | 1 + .github/ISSUE_TEMPLATE/doc_request.yml | 1 + .github/PULL_REQUEST_TEMPLATE.md | 1 + .github/actions/doc_preview/action.yml | 10 ++--- .github/actions/install_unix_deps/action.yml | 2 +- .github/workflows/release-upload.yml | 2 +- SECURITY.md | 1 + ci/tools/download-wheels | 8 ++-- cuda_bindings/LICENSE | 36 +++++++-------- .../cuda/bindings/_bindings/cydriver.pxd.in | 1 + .../cuda/bindings/_bindings/cydriver.pyx.in | 8 ++-- .../cuda/bindings/_bindings/cynvrtc.pxd.in | 1 + .../cuda/bindings/_lib/cyruntime/utils.pyx.in | 2 +- cuda_bindings/cuda/bindings/_lib/utils.pxi.in | 2 +- cuda_bindings/cuda/bindings/cydriver.pxd.in | 2 +- cuda_bindings/cuda/bindings/cynvrtc.pxd.in | 1 + cuda_bindings/cuda/bindings/cyruntime.pxd.in | 2 +- .../cuda/bindings/cyruntime_functions.pxi.in | 1 + cuda_bindings/cuda/bindings/driver.pyx.in | 3 +- cuda_bindings/cuda/bindings/runtime.pyx.in | 3 +- .../docs/source/environment_variables.rst | 1 + cuda_bindings/docs/source/install.rst | 2 +- cuda_bindings/docs/source/module/driver.rst | 26 +++++------ .../docs/source/module/nvjitlink.rst | 2 +- cuda_bindings/docs/source/module/nvrtc.rst | 7 +-- cuda_bindings/docs/source/module/runtime.rst | 44 +++++++++---------- cuda_bindings/docs/source/motivation.rst | 14 +++--- cuda_bindings/docs/source/overview.rst | 39 ++++++++-------- .../docs/source/release/11.6.0-notes.rst | 1 + .../docs/source/release/12.9.X-notes.rst | 2 +- cuda_bindings/docs/source/tips_and_tricks.rst | 6 +-- cuda_bindings/pyproject.toml | 10 ++--- .../core/experimental/_kernel_arg_handler.pyx | 2 +- cuda_core/cuda/core/experimental/_memory.pyx | 10 ++--- .../cuda/core/experimental/_memoryview.pyx | 36 +++++++-------- .../_templates/autosummary/dataclass.rst | 1 + .../_templates/autosummary/namedtuple.rst | 2 +- cuda_core/docs/source/getting-started.rst | 6 +-- cuda_core/docs/source/install.rst | 4 +- cuda_core/docs/source/release/0.1.1-notes.rst | 2 +- cuda_core/docs/source/release/0.3.2-notes.rst | 2 +- cuda_core/pyproject.toml | 6 +-- .../cython/test_get_cuda_native_handle.pyx | 2 +- cuda_python/LICENSE | 36 +++++++-------- cuda_python/docs/source/release.rst | 1 + 45 files changed, 184 insertions(+), 168 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 4574e04bf3..09752a7293 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -113,3 +113,4 @@ body: +-------------------------------+----------------------+----------------------+ validations: required: false + diff --git a/.github/ISSUE_TEMPLATE/doc_request.yml b/.github/ISSUE_TEMPLATE/doc_request.yml index 7804a6c85a..26a7faeace 100644 --- a/.github/ISSUE_TEMPLATE/doc_request.yml +++ b/.github/ISSUE_TEMPLATE/doc_request.yml @@ -41,3 +41,4 @@ body: label: If this is a correction, please provide a link to the incorrect documentation. If this is a new documentation request, please link to where you have looked. placeholder: | https://nvidia.github.io/cuda-python/latest/ + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index aa51259a90..afc43be22f 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -11,3 +11,4 @@ closes - [ ] New or existing tests cover these changes. - [ ] The documentation is up to date with these changes. + diff --git a/.github/actions/doc_preview/action.yml b/.github/actions/doc_preview/action.yml index 6948522daa..61531428a7 100644 --- a/.github/actions/doc_preview/action.yml +++ b/.github/actions/doc_preview/action.yml @@ -18,7 +18,7 @@ runs: using: composite steps: # The steps below are executed only when testing in a PR. - # Note: the PR previews will be removed once merged to main (see below) + # Note: the PR previews will be removed once merged to main (see below) - name: Deploy doc preview if: ${{ github.ref_name != 'main' }} uses: JamesIves/github-pages-deploy-action@6c2d9db40f9296374acc17b90404b6e8864128c8 # v4.7.3 @@ -28,7 +28,7 @@ runs: folder: ${{ inputs.source-folder }} target-folder: docs/pr-preview/pr-${{ inputs.pr-number }}/ commit-message: "Deploy doc preview for PR ${{ inputs.pr-number }} (${{ github.sha }})" - + - name: Leave a comment after deployment if: ${{ github.ref_name != 'main' }} uses: marocchino/sticky-pull-request-comment@67d0dec7b07ed060a405f9b2a64b8ab319fdd7db # v2.9.2 @@ -43,8 +43,8 @@ runs: |
https://nvidia.github.io/cuda-python/pr-preview/pr-${{ inputs.pr-number }}/cuda-core/
|
https://nvidia.github.io/cuda-python/pr-preview/pr-${{ inputs.pr-number }}/cuda-bindings/

|

Preview will be ready when the GitHub Pages deployment is complete.

- - # The steps below are executed only when building on main. + + # The steps below are executed only when building on main. - name: Remove doc preview if: ${{ github.ref_name == 'main' }} uses: JamesIves/github-pages-deploy-action@6c2d9db40f9296374acc17b90404b6e8864128c8 # v4.7.3 @@ -54,7 +54,7 @@ runs: folder: ${{ inputs.source-folder }} target-folder: docs/pr-preview/pr-${{ inputs.pr-number }}/ commit-message: "Clean up doc preview for PR ${{ inputs.pr-number }} (${{ github.sha }})" - + - name: Leave a comment after removal if: ${{ github.ref_name == 'main' }} uses: marocchino/sticky-pull-request-comment@67d0dec7b07ed060a405f9b2a64b8ab319fdd7db # v2.9.2 diff --git a/.github/actions/install_unix_deps/action.yml b/.github/actions/install_unix_deps/action.yml index 6289541c90..645b761cf8 100644 --- a/.github/actions/install_unix_deps/action.yml +++ b/.github/actions/install_unix_deps/action.yml @@ -10,7 +10,7 @@ inputs: dependencies: required: true type: string - dependent_exes: + dependent_exes: required: true type: string diff --git a/.github/workflows/release-upload.yml b/.github/workflows/release-upload.yml index 8ae08c5026..f7d6306fc2 100644 --- a/.github/workflows/release-upload.yml +++ b/.github/workflows/release-upload.yml @@ -78,7 +78,7 @@ jobs: run: | # Use the shared script to download wheels ./ci/tools/download-wheels "${{ inputs.run-id }}" "${{ inputs.component }}" "${{ github.repository }}" "release/wheels" - + # Upload wheels to the release if [[ -d "release/wheels" && $(ls -A release/wheels 2>/dev/null | wc -l) -gt 0 ]]; then echo "Uploading wheels to release ${{ inputs.git-tag }}" diff --git a/SECURITY.md b/SECURITY.md index 4283541556..b72f0d67ad 100755 --- a/SECURITY.md +++ b/SECURITY.md @@ -33,3 +33,4 @@ information. For all security-related concerns, please visit NVIDIA's Product Security portal at . + diff --git a/ci/tools/download-wheels b/ci/tools/download-wheels index a3141afb33..05509bfc0a 100755 --- a/ci/tools/download-wheels +++ b/ci/tools/download-wheels @@ -49,18 +49,18 @@ do if [[ ! -d "$p" ]]; then continue fi - + # exclude cython test artifacts if [[ "${p}" == *-tests ]]; then echo "Skipping test artifact: $p" continue fi - + # If we're not downloading "all", only process matching component if [[ "$COMPONENT" != "all" && "$p" != ${COMPONENT}* ]]; then continue fi - + echo "Processing artifact: $p" # Move wheel files to output directory if [[ -d "$p" ]]; then @@ -72,4 +72,4 @@ done rm -rf cuda-* echo "Downloaded wheels to: $OUTPUT_DIR" -ls -la "$OUTPUT_DIR" +ls -la "$OUTPUT_DIR" \ No newline at end of file diff --git a/cuda_bindings/LICENSE b/cuda_bindings/LICENSE index a5a65097cd..b7d042fcee 100644 --- a/cuda_bindings/LICENSE +++ b/cuda_bindings/LICENSE @@ -2,46 +2,46 @@ NVIDIA SOFTWARE LICENSE This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA CUDA Python software and materials provided hereunder ("SOFTWARE"). -This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. +This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. -2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: -a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. -b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. +2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: +a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. +b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. -b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. -c. You may not modify or create derivative works of any portion of the SOFTWARE. +b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. +c. You may not modify or create derivative works of any portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. -f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. -g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. +f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. +g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. -4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. +4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. - -6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + +6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. 7. FEEDBACK. You may, but don't have to, provide to NVIDIA any Feedback. "Feedback" means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. -8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. - -9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. +8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. -10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. +9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. -11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. -12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. +11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury's Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. -14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. +14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in index b8a5ee5abf..50701f70d1 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in @@ -2348,3 +2348,4 @@ cdef CUresult _cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource* pCudaReso cdef CUresult _cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource* pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil {{endif}} + diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in index efbc678bd9..7fc86b565e 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in +++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in @@ -512,7 +512,7 @@ cdef int cuPythonInit() except -1 nogil: {{else}} path = 'libcuda.so.1' {{endif}} - + {{if 'Windows' == platform.system()}} LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 try: @@ -524,7 +524,7 @@ cdef int cuPythonInit() except -1 nogil: if (handle == NULL): raise RuntimeError('Failed to dlopen ' + path) {{endif}} - + # Get latest __cuGetProcAddress_v2 global __cuGetProcAddress_v2 {{if 'Windows' == platform.system()}} @@ -535,7 +535,7 @@ cdef int cuPythonInit() except -1 nogil: {{else}} __cuGetProcAddress_v2 = dlfcn.dlsym(handle, 'cuGetProcAddress_v2') {{endif}} - + # Load using cuGetProcAddress if available if __cuGetProcAddress_v2 != NULL: _F_cuGetProcAddress_v2 = <__cuGetProcAddress_v2_T>__cuGetProcAddress_v2 @@ -2765,7 +2765,7 @@ cdef int cuPythonInit() except -1 nogil: __cuPythonInit = True return 0 - + {{if 'Windows' == platform.system()}} # Load using win32GetAddr if usePTDS: diff --git a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in index 355b19f48e..148530a86a 100644 --- a/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/_bindings/cynvrtc.pxd.in @@ -123,3 +123,4 @@ cdef nvrtcResult _nvrtcGetPCHHeapSizeRequired(nvrtcProgram prog, size_t* size) e cdef nvrtcResult _nvrtcSetFlowCallback(nvrtcProgram prog, void* callback, void* payload) except ?NVRTC_ERROR_INVALID_INPUT nogil {{endif}} + diff --git a/cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pyx.in b/cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pyx.in index 82c508a912..292d19cb9c 100644 --- a/cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pyx.in +++ b/cuda_bindings/cuda/bindings/_lib/cyruntime/utils.pyx.in @@ -675,7 +675,7 @@ cdef cudaError_t getRuntimeEglFrame(cudaEglFrame *eglFrame, cydriver.CUeglFrame cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_2020 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_2020 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YUV420_SEMIPLANAR_709 or - cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709 or + cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_YVU420_SEMIPLANAR_709 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_709 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_2020 or cueglFrame.eglColorFormat == cydriver.CUeglColorFormat_enum.CU_EGL_COLOR_FORMAT_Y10V10U10_420_SEMIPLANAR_ER or diff --git a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in index 7f7e0666c7..0a9f2e4e30 100644 --- a/cuda_bindings/cuda/bindings/_lib/utils.pxi.in +++ b/cuda_bindings/cuda/bindings/_lib/utils.pxi.in @@ -628,7 +628,7 @@ cdef class _HelperCUcoredumpSettings: {{if 'CU_COREDUMP_ENABLE_USER_TRIGGER' in found_values}}cydriver.CUcoredumpSettings_enum.CU_COREDUMP_ENABLE_USER_TRIGGER,{{endif}}): if self._is_getter == False: self._bool = init_value - + self._cptr = &self._bool self._size = 1 else: diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in index 8696e8df47..e3fe2f8810 100644 --- a/cuda_bindings/cuda/bindings/cydriver.pxd.in +++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in @@ -5163,4 +5163,4 @@ cdef enum: RESOURCE_ABI_EXTERNAL_BYTES = 48 cdef enum: MAX_PLANES = 3 -cdef enum: CUDA_EGL_INFINITE_TIMEOUT = 4294967295 +cdef enum: CUDA_EGL_INFINITE_TIMEOUT = 4294967295 \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in index 85a5e23663..7a392687dd 100644 --- a/cuda_bindings/cuda/bindings/cynvrtc.pxd.in +++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd.in @@ -150,3 +150,4 @@ cdef nvrtcResult nvrtcGetPCHHeapSizeRequired(nvrtcProgram prog, size_t* size) ex cdef nvrtcResult nvrtcSetFlowCallback(nvrtcProgram prog, void* callback, void* payload) except ?NVRTC_ERROR_INVALID_INPUT nogil {{endif}} + diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in index 17ac2318dd..bd0bc3d5f3 100644 --- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in +++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in @@ -1956,4 +1956,4 @@ cdef enum: CUDART_VERSION = 13000 cdef enum: __CUDART_API_VERSION = 13000 -cdef enum: CUDA_EGL_MAX_PLANES = 3 +cdef enum: CUDA_EGL_MAX_PLANES = 3 \ No newline at end of file diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in index 4c2c5c0598..14230f1a26 100644 --- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in +++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in @@ -1480,3 +1480,4 @@ cdef extern from "cuda_profiler_api.h": cudaError_t cudaProfilerStop() nogil {{endif}} + diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index 15f5ba6497..975153c58e 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -50737,7 +50737,7 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value): @cython.embedsignature(True) def cuGetExportTable(pExportTableId : Optional[CUuuid]): - """ + """ Parameters ---------- @@ -54071,3 +54071,4 @@ cdef int _add_native_handle_getters() except?-1: {{endif}} return 0 _add_native_handle_getters() + diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in index 0729a4eeca..f17436058d 100644 --- a/cuda_bindings/cuda/bindings/runtime.pyx.in +++ b/cuda_bindings/cuda/bindings/runtime.pyx.in @@ -19177,7 +19177,7 @@ def cudaIpcOpenMemHandle(handle not None : cudaIpcMemHandle_t, unsigned int flag Notes ----- - No guarantees are made about the address returned in `*devPtr`. + No guarantees are made about the address returned in `*devPtr`. In particular, multiple processes may not receive the same address for the same `handle`. """ cdef void_ptr devPtr = 0 @@ -38015,3 +38015,4 @@ cdef int _add_native_handle_getters() except?-1: {{endif}} return 0 _add_native_handle_getters() + diff --git a/cuda_bindings/docs/source/environment_variables.rst b/cuda_bindings/docs/source/environment_variables.rst index a212bfe764..c582fe57b3 100644 --- a/cuda_bindings/docs/source/environment_variables.rst +++ b/cuda_bindings/docs/source/environment_variables.rst @@ -18,3 +18,4 @@ Build-Time Environment Variables - ``CUDA_PYTHON_PARSER_CACHING`` : bool, toggles the caching of parsed header files during the cuda-bindings build process. If caching is enabled (``CUDA_PYTHON_PARSER_CACHING`` is True), the cache path is set to ./cache_, where is derived from the cuda toolkit libraries used to build cuda-bindings. - ``CUDA_PYTHON_PARALLEL_LEVEL`` (previously ``PARALLEL_LEVEL``) : int, sets the number of threads used in the compilation of extension modules. Not setting it or setting it to 0 would disable parallel builds. + diff --git a/cuda_bindings/docs/source/install.rst b/cuda_bindings/docs/source/install.rst index b5181c6a35..b9335b4876 100644 --- a/cuda_bindings/docs/source/install.rst +++ b/cuda_bindings/docs/source/install.rst @@ -35,7 +35,7 @@ Install all optional dependencies with: Where the optional dependencies include: -* ``nvidia-cuda-nvrtc`` (NVRTC runtime compilation library) +* ``nvidia-cuda-nvrtc`` (NVRTC runtime compilation library) * ``nvidia-nvjitlink`` (nvJitLink library) * ``nvidia-nvvm`` (NVVM library) * ``nvidia-cufile`` (cuFile library, Linux only) diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index 8a4ba15ff8..04e0390d12 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -2318,7 +2318,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_CACHE_MODE - Specifies whether to enable caching explicitly (-dlcm) + Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied :py:obj:`~.CUjit_cacheMode_enum`. @@ -3256,7 +3256,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT - Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. + Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set). @@ -3282,9 +3282,9 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION - Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. @@ -3292,11 +3292,11 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT - Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. + Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set). @@ -3304,11 +3304,11 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. @@ -3322,13 +3322,13 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. Valid values for :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled). diff --git a/cuda_bindings/docs/source/module/nvjitlink.rst b/cuda_bindings/docs/source/module/nvjitlink.rst index 6eda063074..ff9bb1ea52 100644 --- a/cuda_bindings/docs/source/module/nvjitlink.rst +++ b/cuda_bindings/docs/source/module/nvjitlink.rst @@ -9,7 +9,7 @@ nvjitlink Note ---- -The nvjitlink bindings are not supported on nvJitLink installations <12.3. Ensure the installed CUDA toolkit's nvJitLink version is >=12.3. +The nvjitlink bindings are not supported on nvJitLink installations <12.3. Ensure the installed CUDA toolkit's nvJitLink version is >=12.3. Functions --------- diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst index d8bd6df229..079cd39aad 100644 --- a/cuda_bindings/docs/source/module/nvrtc.rst +++ b/cuda_bindings/docs/source/module/nvrtc.rst @@ -232,7 +232,7 @@ Generate line-number information. - - ``--dopt=on``\ + - ``--dopt=on``\ Enable device code optimization. When specified along with ``-G``\ , enables limited debug information generation for optimized device code (currently, only line number information). When ``-G``\ is not specified, ``-dopt=on``\ is implicit. @@ -260,7 +260,7 @@ Specify the fast-compile level for device code, which controls the tradeoff betw - - ``--ptxas-options=``\ + - ``--ptxas-options=``\ Specify options directly to ptxas, the PTX optimizing assembler. @@ -308,7 +308,7 @@ For single-precision floating-point square root, use IEEE round-to-nearest mode - - Default: ``true``\ + - Default: ``true``\ @@ -783,3 +783,4 @@ Enable stack canaries in device code. Stack canaries make it more difficult to e - ``--fdevice-time-trace=``\ (``-fdevice-time-trace=``\ ) Enables the time profiler, outputting a JSON file based on given . Results can be analyzed on chrome://tracing for a flamegraph visualization. + diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index 9ac22f660b..d155f85ebc 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -281,7 +281,7 @@ This section describes the unified addressing functions of the CUDA runtime appl -CUDA devices can share a unified address space with the host. +CUDA devices can share a unified address space with the host. For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below). @@ -307,7 +307,7 @@ Unified addressing is automatically enabled in 64-bit processes . It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cudaPointerGetAttributes() -Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. +Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value. @@ -321,7 +321,7 @@ Since pointers are unique, it is not necessary to specify information about the All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified. -The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. +The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. @@ -345,7 +345,7 @@ Upon enabling direct access from a device that supports unified addressing to an -Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. +Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. @@ -671,7 +671,7 @@ Note that the use of multiple ::CUcontext s per device within a single process w If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections. -The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. +The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. To use the pointer query and peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features. @@ -4697,7 +4697,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitCacheMode - Specifies whether to enable caching explicitly (-dlcm) + Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied :py:obj:`~.cudaJit_CacheMode`. @@ -5199,9 +5199,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUpload - Automatically upload the graph after instantiation. Only supported by + Automatically upload the graph after instantiation. Only supported by - :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the + :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the stream provided in `instantiateParams`. @@ -5209,9 +5209,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch - Instantiate the graph to be launchable from the device. This flag can only + Instantiate the graph to be launchable from the device. This flag can only - be used on platforms which support unified addressing. This flag cannot be + be used on platforms which support unified addressing. This flag cannot be used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch. @@ -5281,7 +5281,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). @@ -5307,9 +5307,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension - Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. @@ -5317,11 +5317,11 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). @@ -5329,11 +5329,11 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. @@ -5347,13 +5347,13 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. Valid values for :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled). diff --git a/cuda_bindings/docs/source/motivation.rst b/cuda_bindings/docs/source/motivation.rst index 433cc16619..afbd3412d4 100644 --- a/cuda_bindings/docs/source/motivation.rst +++ b/cuda_bindings/docs/source/motivation.rst @@ -9,7 +9,7 @@ What is CUDA Python? NVIDIA’s CUDA Python provides `Cython `_ bindings and Python wrappers for the driver and runtime API for existing toolkits and libraries to simplify GPU-based accelerated processing. Python is one of the most popular -programming languages for science, engineering, data analytics, and deep +programming languages for science, engineering, data analytics, and deep learning applications. The goal of CUDA Python is to unify the Python ecosystem with a single set of interfaces that provide full coverage of and access to the CUDA host APIs from Python. @@ -25,18 +25,18 @@ science, and AI. `Anaconda `_ that can compile Python code for execution on CUDA-capable GPUs, provides Python developers with an easy entry into GPU-accelerated computing and a path for using increasingly sophisticated CUDA -code with a minimum of new syntax and jargon. Numba has its own CUDA driver API -bindings that can now be replaced with CUDA Python. With CUDA Python and Numba, +code with a minimum of new syntax and jargon. Numba has its own CUDA driver API +bindings that can now be replaced with CUDA Python. With CUDA Python and Numba, you get the best of both worlds: rapid iterative development with Python and the speed of a compiled language targeting both CPUs and NVIDIA GPUs. `CuPy `_ is a `NumPy `_/`SciPy `_ compatible Array library, from `Preferred Networks `_, for -GPU-accelerated computing with Python. CUDA Python simplifies the CuPy build -and allows for a faster and smaller memory footprint when importing the CuPy -Python module. In the future, when more CUDA Toolkit libraries are supported, -CuPy will have a lighter maintenance overhead and have fewer wheels to +GPU-accelerated computing with Python. CUDA Python simplifies the CuPy build +and allows for a faster and smaller memory footprint when importing the CuPy +Python module. In the future, when more CUDA Toolkit libraries are supported, +CuPy will have a lighter maintenance overhead and have fewer wheels to release. Users benefit from a faster CUDA runtime! Our goal is to help unify the Python CUDA ecosystem with a single standard set diff --git a/cuda_bindings/docs/source/overview.rst b/cuda_bindings/docs/source/overview.rst index fdef83639b..0f32032528 100644 --- a/cuda_bindings/docs/source/overview.rst +++ b/cuda_bindings/docs/source/overview.rst @@ -60,7 +60,7 @@ The following code snippet lets us validate each API call and raise exceptions i return nvrtc.nvrtcGetErrorString(error)[1] else: raise RuntimeError('Unknown error type: {}'.format(error)) - + def checkCudaErrors(result): if result[0].value: raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) @@ -105,22 +105,22 @@ the program is compiled to target our local compute capability architecture with # Initialize CUDA Driver API checkCudaErrors(driver.cuInit(0)) - + # Retrieve handle for device 0 cuDevice = checkCudaErrors(driver.cuDeviceGet(0)) - + # Derive target architecture for device 0 major = checkCudaErrors(driver.cuDeviceGetAttribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)) minor = checkCudaErrors(driver.cuDeviceGetAttribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)) arch_arg = bytes(f'--gpu-architecture=compute_{major}{minor}', 'ascii') - + # Create program prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], [])) - + # Compile program opts = [b"--fmad=false", arch_arg] checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, 2, opts)) - + # Get PTX from compilation ptxSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog)) ptx = b" " * ptxSize @@ -158,11 +158,11 @@ and from the device: NUM_THREADS = 512 # Threads per block NUM_BLOCKS = 32768 # Blocks per grid - + a = np.array([2.0], dtype=np.float32) n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32) bufferSize = n * a.itemsize - + hX = np.random.rand(n).astype(dtype=np.float32) hY = np.random.rand(n).astype(dtype=np.float32) hOut = np.zeros(n).astype(dtype=np.float32) @@ -182,9 +182,9 @@ by calling ``XX.ctypes.data`` for the associated XX: dXclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) dYclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) dOutclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) - + stream = checkCudaErrors(driver.cuStreamCreate(0)) - + checkCudaErrors(driver.cuMemcpyHtoDAsync( dXclass, hX.ctypes.data, bufferSize, stream )) @@ -233,7 +233,7 @@ Now the kernel can be launched: args.ctypes.data, # kernel arguments 0, # extra (ignore) )) - + checkCudaErrors(driver.cuMemcpyDtoHAsync( hOut.ctypes.data, dOutclass, bufferSize, stream )) @@ -304,7 +304,7 @@ maximize performance ({numref}``Figure 1``). .. figure:: _static/images/Nsight-Compute-CLI-625x473.png :name: Figure 1 - + Screenshot of Nsight Compute CLI output of ``cuda.bindings`` example. Preparing kernel arguments @@ -331,7 +331,7 @@ Let's use the following kernel definition as an example: typedef struct { int value; } testStruct; - + extern "C" __global__ void testkernel(int i, int *pi, float f, float *pf, @@ -347,7 +347,7 @@ The first step is to create array objects with types corresponding to your kerne .. list-table:: Correspondence between NumPy types and kernel types. :header-rows: 1 - + * - NumPy type - Corresponding kernel types - itemsize (bytes) @@ -410,12 +410,12 @@ Putting it all together: # Define a custom type testStruct = np.dtype([("value", np.int32)], align=True) - + # Allocate device memory pInt = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.int32).itemsize)) pFloat = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.float32).itemsize)) pStruct = checkCudaErrors(cudart.cudaMalloc(testStruct.itemsize)) - + # Collect all input kernel arguments into a single tuple for further processing kernelValues = ( np.array(1, dtype=np.uint32), @@ -470,12 +470,12 @@ For this example the result becomes: # Define a custom type class testStruct(ctypes.Structure): _fields_ = [("value", ctypes.c_int)] - + # Allocate device memory pInt = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(ctypes.c_int))) pFloat = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(ctypes.c_float))) pStruct = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(testStruct))) - + # Collect all input kernel arguments into a single tuple for further processing kernelValues = ( 1, @@ -531,7 +531,7 @@ For this example, lets use the ``transformKernel`` from `examples/0_Introduction ... } """ - + def main(): ... d_data = checkCudaErrors(cudart.cudaMalloc(size)) @@ -565,3 +565,4 @@ For ctypes, we leverage the special handling of ``None`` type since each Python None, ) kernelArgs = (kernelValues, kernelTypes) + diff --git a/cuda_bindings/docs/source/release/11.6.0-notes.rst b/cuda_bindings/docs/source/release/11.6.0-notes.rst index bcc8944e1f..d7907df847 100644 --- a/cuda_bindings/docs/source/release/11.6.0-notes.rst +++ b/cuda_bindings/docs/source/release/11.6.0-notes.rst @@ -79,3 +79,4 @@ CUDA Functions Not Supported in this Release - cudaVDPAUSetVDPAUDevice .. note:: Deprecated APIs are removed from tracking + diff --git a/cuda_bindings/docs/source/release/12.9.X-notes.rst b/cuda_bindings/docs/source/release/12.9.X-notes.rst index a3a640f531..3be22a695a 100644 --- a/cuda_bindings/docs/source/release/12.9.X-notes.rst +++ b/cuda_bindings/docs/source/release/12.9.X-notes.rst @@ -18,4 +18,4 @@ Highlights Known issues ------------ -* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. +* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. \ No newline at end of file diff --git a/cuda_bindings/docs/source/tips_and_tricks.rst b/cuda_bindings/docs/source/tips_and_tricks.rst index 1a77eb53fd..cc666ca275 100644 --- a/cuda_bindings/docs/source/tips_and_tricks.rst +++ b/cuda_bindings/docs/source/tips_and_tricks.rst @@ -27,9 +27,9 @@ All of the Python classes do not manage the lifetime of the underlying CUDA C ob Getting and setting attributes of extension types ================================================= -While the bindings outwardly present the attributes of extension types in a pythonic way, they can't always be interacted with in a Pythonic style. Often the getters/setters (__getitem__(), __setitem__()) are actually a translation step to convert values between Python and C. For example, in some cases, attempting to modify an attribute in place, will lead to unexpected behavior due to the design of the underlying implementation. For this reason, users should use the getters and setters directly when interacting with extension types. +While the bindings outwardly present the attributes of extension types in a pythonic way, they can't always be interacted with in a Pythonic style. Often the getters/setters (__getitem__(), __setitem__()) are actually a translation step to convert values between Python and C. For example, in some cases, attempting to modify an attribute in place, will lead to unexpected behavior due to the design of the underlying implementation. For this reason, users should use the getters and setters directly when interacting with extension types. -An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. +An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. .. code-block:: python @@ -37,7 +37,7 @@ An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. cfg.numAttrs += 1 attr = cuda.CUlaunchAttribute() - + ... # This works. We are passing the new attribute to the setter diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 2637f0b0e7..1a91b44b83 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -84,10 +84,10 @@ select = [ ] ignore = [ - "UP006", - "UP007", + "UP006", + "UP007", "E741", # ambiguous variable name such as I - "B007", # rename unsued loop variable to _name + "B007", # rename unsued loop variable to _name "UP035" # UP006, UP007, UP035 complain about deprecated Typing. use, but disregard backward compatibility of python version ] @@ -103,13 +103,13 @@ exclude = ["cuda/bindings/_version.py"] ] "tests/**/*" = [ - "E722", + "E722", "UP022", "E402", # module level import not at top of file "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example) "benchmarks/**/*" = [ - "E722", + "E722", "UP022", "E402", # module level import not at top of file "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example) diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx index 0bb40bf404..e735630386 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx @@ -215,7 +215,7 @@ cdef class ParamHolder: if isinstance(arg.handle, int): # see note below on handling int arguments prepare_arg[intptr_t](self.data, self.data_addresses, arg.handle, i) - continue + continue else: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index 3eb80875e8..ddcf7665e1 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -137,14 +137,14 @@ cdef class Buffer: """ if stream is None: raise ValueError("stream must be provided") - + cdef size_t src_size = self._size - + if dst is None: if self._mr is None: raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") dst = self._mr.allocate(src_size, stream) - + cdef size_t dst_size = dst._size if dst_size != src_size: raise ValueError( @@ -168,10 +168,10 @@ cdef class Buffer: """ if stream is None: raise ValueError("stream must be provided") - + cdef size_t dst_size = self._size cdef size_t src_size = src._size - + if src_size != dst_size: raise ValueError( f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index ea8fb01b67..9d24133052 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -29,20 +29,20 @@ cdef class StridedMemoryView: This object supports both DLPack (up to v1.0) and CUDA Array Interface (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol first, then the CAI protocol. A :obj:`BufferError` is raised if neither is - supported. - + supported. + Since either way would take a consumer stream, for DLPack it is passed to - ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a + ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a stream order will be established between the consumer stream and the - producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if - ``cudaStreamWaitEvent`` is called by this method. - - To opt-out of the stream ordering operation in either DLPack or CAI, - please pass ``stream_ptr=-1``. Note that this deviates (on purpose) + producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if + ``cudaStreamWaitEvent`` is called by this method. + + To opt-out of the stream ordering operation in either DLPack or CAI, + please pass ``stream_ptr=-1``. Note that this deviates (on purpose) from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core`` - does not encourage using the (legacy) default/null stream, but is + does not encourage using the (legacy) default/null stream, but is consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be - internally passed to ``obj.__dlpack__()`` instead. + internally passed to ``obj.__dlpack__()`` instead. Attributes ---------- @@ -79,16 +79,16 @@ cdef class StridedMemoryView: bint is_device_accessible bint readonly object exporting_obj - - # If using dlpack, this is a strong reference to the result of - # obj.__dlpack__() so we can lazily create shape and strides from - # it later. If using CAI, this is a reference to the source + + # If using dlpack, this is a strong reference to the result of + # obj.__dlpack__() so we can lazily create shape and strides from + # it later. If using CAI, this is a reference to the source # `__cuda_array_interface__` object. cdef object metadata # The tensor object if has obj has __dlpack__, otherwise must be NULL cdef DLTensor *dl_tensor - + # Memoized properties cdef tuple _shape cdef tuple _strides @@ -110,7 +110,7 @@ cdef class StridedMemoryView: if self._shape is None and self.exporting_obj is not None: if self.dl_tensor != NULL: self._shape = cuda_utils.carray_int64_t_to_tuple( - self.dl_tensor.shape, + self.dl_tensor.shape, self.dl_tensor.ndim ) else: @@ -127,7 +127,7 @@ cdef class StridedMemoryView: if self.dl_tensor != NULL: if self.dl_tensor.strides: self._strides = cuda_utils.carray_int64_t_to_tuple( - self.dl_tensor.strides, + self.dl_tensor.strides, self.dl_tensor.ndim ) else: @@ -140,7 +140,7 @@ cdef class StridedMemoryView: self._strides, i, strides[i] // itemsize ) self._strides_init = True - return self._strides + return self._strides @property def dtype(self) -> Optional[numpy.dtype]: diff --git a/cuda_core/docs/source/_templates/autosummary/dataclass.rst b/cuda_core/docs/source/_templates/autosummary/dataclass.rst index efb115c831..5931265888 100644 --- a/cuda_core/docs/source/_templates/autosummary/dataclass.rst +++ b/cuda_core/docs/source/_templates/autosummary/dataclass.rst @@ -10,3 +10,4 @@ {% block methods %} .. automethod:: __init__ {% endblock %} + diff --git a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst index 7ee8a09a1c..1695beef0a 100644 --- a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst +++ b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst @@ -8,4 +8,4 @@ .. autoclass:: {{ objname }} :members: __new__ :special-members: __new__ - :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ + :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ \ No newline at end of file diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index 47f8d193a9..502ea66375 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -60,7 +60,7 @@ Don't forget to use :meth:`Device.set_current`! import cupy as cp from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch - + dev = Device() dev.set_current() s = dev.create_stream() @@ -82,14 +82,14 @@ We're using `CuPy `_ arrays as inputs for this example, but y .. code-block:: python ker = mod.get_kernel("vector_add") - + # Prepare input/output arrays (using CuPy) size = 50000 rng = cp.random.default_rng() a = rng.random(size, dtype=cp.float32) b = rng.random(size, dtype=cp.float32) c = cp.empty_like(a) - + # Configure launch parameters block = 256 grid = (size + block - 1) // block diff --git a/cuda_core/docs/source/install.rst b/cuda_core/docs/source/install.rst index 3241f563f7..8bc1faa0e1 100644 --- a/cuda_core/docs/source/install.rst +++ b/cuda_core/docs/source/install.rst @@ -10,10 +10,10 @@ Runtime Requirements ``cuda.core`` is supported on all platforms that CUDA is supported. Specific dependencies are as follows: -.. list-table:: +.. list-table:: :header-rows: 1 - * - + * - - CUDA 11 - CUDA 12 * - CUDA Toolkit\ [#f1]_ diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index f9ac2b5ccd..5434d726ea 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -2,7 +2,7 @@ .. SPDX-License-Identifier: Apache-2.0 .. currentmodule:: cuda.core.experimental - + ``cuda.core`` 0.1.1 Release Notes ================================= diff --git a/cuda_core/docs/source/release/0.3.2-notes.rst b/cuda_core/docs/source/release/0.3.2-notes.rst index b1b087dbb7..8b4763ed31 100644 --- a/cuda_core/docs/source/release/0.3.2-notes.rst +++ b/cuda_core/docs/source/release/0.3.2-notes.rst @@ -25,7 +25,7 @@ None. New features ------------ -- :class:`Stream` and :class:`Event` can be subclassed now. +- :class:`Stream` and :class:`Event` can be subclassed now. New examples diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 76e312b0d7..3506ce0255 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -97,10 +97,10 @@ select = [ ] ignore = [ - "UP006", - "UP007", + "UP006", + "UP007", "E741", # ambiguous variable name such as I - "B007", # rename unsued loop variable to _name + "B007", # rename unsued loop variable to _name "UP035" # UP006, UP007, UP035 complain about deprecated Typing. use, but disregard backward compatibility of python version ] diff --git a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx index 0c3921e925..d1764d3bba 100644 --- a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx +++ b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx @@ -23,7 +23,7 @@ cdef extern from "utility.hpp": def test_get_cuda_native_handle(): dev = Device(0) dev.set_current() - + s = dev.create_stream() cdef pyCUstream s_py = s.handle cdef CUstream s_c = get_cuda_native_handle(s_py) diff --git a/cuda_python/LICENSE b/cuda_python/LICENSE index a5a65097cd..b7d042fcee 100644 --- a/cuda_python/LICENSE +++ b/cuda_python/LICENSE @@ -2,46 +2,46 @@ NVIDIA SOFTWARE LICENSE This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA CUDA Python software and materials provided hereunder ("SOFTWARE"). -This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. +This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. -2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: -a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. -b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. +2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: +a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. +b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. -b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. -c. You may not modify or create derivative works of any portion of the SOFTWARE. +b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. +c. You may not modify or create derivative works of any portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. -f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. -g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. +f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. +g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. -4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. +4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. - -6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + +6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. 7. FEEDBACK. You may, but don't have to, provide to NVIDIA any Feedback. "Feedback" means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. -8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. - -9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. +8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. -10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. +9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. -11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. -12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. +11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. + 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury's Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. -14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. +14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. diff --git a/cuda_python/docs/source/release.rst b/cuda_python/docs/source/release.rst index 9e7a66a522..c97e508c45 100644 --- a/cuda_python/docs/source/release.rst +++ b/cuda_python/docs/source/release.rst @@ -17,3 +17,4 @@ Release Notes 12.6.1 11.8.7 11.8.6 + From 06589dc01e93efe3e558559d13c94a3fd161b2cd Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 25 Aug 2025 14:17:39 -0700 Subject: [PATCH 4/7] gen_exclude at top level --- .pre-commit-config.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 29c6f999d8..56823c88e2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,6 +12,8 @@ ci: skip: [bandit] submodules: false +gen_exclude: &gen_exclude '^cuda_bindings/cuda/bindings/.*\.in?$' + # Please update the rev: SHAs below with this command: # pre-commit autoupdate --freeze repos: @@ -52,9 +54,11 @@ repos: - id: check-yaml - id: debug-statements - id: end-of-file-fixer + exclude: *gen_exclude - id: mixed-line-ending - id: requirements-txt-fixer - id: trailing-whitespace + exclude: *gen_exclude # Checking for common mistakes - repo: https://github.com/pre-commit/pygrep-hooks From fd798ca16503be53d113ebd2e3471816e1d6a9e6 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 25 Aug 2025 14:19:07 -0700 Subject: [PATCH 5/7] =?UTF-8?q?Automatic=20whitespace=20fixes=20=E2=80=94?= =?UTF-8?q?=20NO=20manual=20changes.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/ISSUE_TEMPLATE/bug_report.yml | 1 - .github/ISSUE_TEMPLATE/doc_request.yml | 1 - .github/PULL_REQUEST_TEMPLATE.md | 1 - .github/actions/doc_preview/action.yml | 10 ++--- .github/actions/install_unix_deps/action.yml | 2 +- .github/workflows/release-upload.yml | 2 +- SECURITY.md | 1 - ci/tools/download-wheels | 8 ++-- cuda_bindings/LICENSE | 36 +++++++-------- .../docs/source/environment_variables.rst | 1 - cuda_bindings/docs/source/install.rst | 2 +- cuda_bindings/docs/source/module/driver.rst | 26 +++++------ .../docs/source/module/nvjitlink.rst | 2 +- cuda_bindings/docs/source/module/nvrtc.rst | 7 ++- cuda_bindings/docs/source/module/runtime.rst | 44 +++++++++---------- cuda_bindings/docs/source/motivation.rst | 14 +++--- cuda_bindings/docs/source/overview.rst | 39 ++++++++-------- .../docs/source/release/11.6.0-notes.rst | 1 - .../docs/source/release/12.9.X-notes.rst | 2 +- cuda_bindings/docs/source/tips_and_tricks.rst | 6 +-- cuda_bindings/pyproject.toml | 10 ++--- .../core/experimental/_kernel_arg_handler.pyx | 2 +- cuda_core/cuda/core/experimental/_memory.pyx | 10 ++--- .../cuda/core/experimental/_memoryview.pyx | 36 +++++++-------- .../_templates/autosummary/dataclass.rst | 1 - .../_templates/autosummary/namedtuple.rst | 2 +- cuda_core/docs/source/getting-started.rst | 6 +-- cuda_core/docs/source/install.rst | 4 +- cuda_core/docs/source/release/0.1.1-notes.rst | 2 +- cuda_core/docs/source/release/0.3.2-notes.rst | 2 +- cuda_core/pyproject.toml | 6 +-- .../cython/test_get_cuda_native_handle.pyx | 2 +- cuda_python/LICENSE | 36 +++++++-------- cuda_python/docs/source/release.rst | 1 - 34 files changed, 158 insertions(+), 168 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 09752a7293..4574e04bf3 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -113,4 +113,3 @@ body: +-------------------------------+----------------------+----------------------+ validations: required: false - diff --git a/.github/ISSUE_TEMPLATE/doc_request.yml b/.github/ISSUE_TEMPLATE/doc_request.yml index 26a7faeace..7804a6c85a 100644 --- a/.github/ISSUE_TEMPLATE/doc_request.yml +++ b/.github/ISSUE_TEMPLATE/doc_request.yml @@ -41,4 +41,3 @@ body: label: If this is a correction, please provide a link to the incorrect documentation. If this is a new documentation request, please link to where you have looked. placeholder: | https://nvidia.github.io/cuda-python/latest/ - diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index afc43be22f..aa51259a90 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -11,4 +11,3 @@ closes - [ ] New or existing tests cover these changes. - [ ] The documentation is up to date with these changes. - diff --git a/.github/actions/doc_preview/action.yml b/.github/actions/doc_preview/action.yml index 61531428a7..6948522daa 100644 --- a/.github/actions/doc_preview/action.yml +++ b/.github/actions/doc_preview/action.yml @@ -18,7 +18,7 @@ runs: using: composite steps: # The steps below are executed only when testing in a PR. - # Note: the PR previews will be removed once merged to main (see below) + # Note: the PR previews will be removed once merged to main (see below) - name: Deploy doc preview if: ${{ github.ref_name != 'main' }} uses: JamesIves/github-pages-deploy-action@6c2d9db40f9296374acc17b90404b6e8864128c8 # v4.7.3 @@ -28,7 +28,7 @@ runs: folder: ${{ inputs.source-folder }} target-folder: docs/pr-preview/pr-${{ inputs.pr-number }}/ commit-message: "Deploy doc preview for PR ${{ inputs.pr-number }} (${{ github.sha }})" - + - name: Leave a comment after deployment if: ${{ github.ref_name != 'main' }} uses: marocchino/sticky-pull-request-comment@67d0dec7b07ed060a405f9b2a64b8ab319fdd7db # v2.9.2 @@ -43,8 +43,8 @@ runs: |
https://nvidia.github.io/cuda-python/pr-preview/pr-${{ inputs.pr-number }}/cuda-core/
|
https://nvidia.github.io/cuda-python/pr-preview/pr-${{ inputs.pr-number }}/cuda-bindings/

|

Preview will be ready when the GitHub Pages deployment is complete.

- - # The steps below are executed only when building on main. + + # The steps below are executed only when building on main. - name: Remove doc preview if: ${{ github.ref_name == 'main' }} uses: JamesIves/github-pages-deploy-action@6c2d9db40f9296374acc17b90404b6e8864128c8 # v4.7.3 @@ -54,7 +54,7 @@ runs: folder: ${{ inputs.source-folder }} target-folder: docs/pr-preview/pr-${{ inputs.pr-number }}/ commit-message: "Clean up doc preview for PR ${{ inputs.pr-number }} (${{ github.sha }})" - + - name: Leave a comment after removal if: ${{ github.ref_name == 'main' }} uses: marocchino/sticky-pull-request-comment@67d0dec7b07ed060a405f9b2a64b8ab319fdd7db # v2.9.2 diff --git a/.github/actions/install_unix_deps/action.yml b/.github/actions/install_unix_deps/action.yml index 645b761cf8..6289541c90 100644 --- a/.github/actions/install_unix_deps/action.yml +++ b/.github/actions/install_unix_deps/action.yml @@ -10,7 +10,7 @@ inputs: dependencies: required: true type: string - dependent_exes: + dependent_exes: required: true type: string diff --git a/.github/workflows/release-upload.yml b/.github/workflows/release-upload.yml index f7d6306fc2..8ae08c5026 100644 --- a/.github/workflows/release-upload.yml +++ b/.github/workflows/release-upload.yml @@ -78,7 +78,7 @@ jobs: run: | # Use the shared script to download wheels ./ci/tools/download-wheels "${{ inputs.run-id }}" "${{ inputs.component }}" "${{ github.repository }}" "release/wheels" - + # Upload wheels to the release if [[ -d "release/wheels" && $(ls -A release/wheels 2>/dev/null | wc -l) -gt 0 ]]; then echo "Uploading wheels to release ${{ inputs.git-tag }}" diff --git a/SECURITY.md b/SECURITY.md index b72f0d67ad..4283541556 100755 --- a/SECURITY.md +++ b/SECURITY.md @@ -33,4 +33,3 @@ information. For all security-related concerns, please visit NVIDIA's Product Security portal at . - diff --git a/ci/tools/download-wheels b/ci/tools/download-wheels index 05509bfc0a..a3141afb33 100755 --- a/ci/tools/download-wheels +++ b/ci/tools/download-wheels @@ -49,18 +49,18 @@ do if [[ ! -d "$p" ]]; then continue fi - + # exclude cython test artifacts if [[ "${p}" == *-tests ]]; then echo "Skipping test artifact: $p" continue fi - + # If we're not downloading "all", only process matching component if [[ "$COMPONENT" != "all" && "$p" != ${COMPONENT}* ]]; then continue fi - + echo "Processing artifact: $p" # Move wheel files to output directory if [[ -d "$p" ]]; then @@ -72,4 +72,4 @@ done rm -rf cuda-* echo "Downloaded wheels to: $OUTPUT_DIR" -ls -la "$OUTPUT_DIR" \ No newline at end of file +ls -la "$OUTPUT_DIR" diff --git a/cuda_bindings/LICENSE b/cuda_bindings/LICENSE index b7d042fcee..a5a65097cd 100644 --- a/cuda_bindings/LICENSE +++ b/cuda_bindings/LICENSE @@ -2,46 +2,46 @@ NVIDIA SOFTWARE LICENSE This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA CUDA Python software and materials provided hereunder ("SOFTWARE"). -This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. +This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. -2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: -a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. -b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. +2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: +a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. +b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. -b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. -c. You may not modify or create derivative works of any portion of the SOFTWARE. +b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. +c. You may not modify or create derivative works of any portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. -f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. -g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. +f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. +g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. -4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. +4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. - -6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + +6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. 7. FEEDBACK. You may, but don't have to, provide to NVIDIA any Feedback. "Feedback" means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. -8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. +8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + +9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. -9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. +10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. -10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. +11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. -11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. -12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. - 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury's Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. -14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. +14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. diff --git a/cuda_bindings/docs/source/environment_variables.rst b/cuda_bindings/docs/source/environment_variables.rst index c582fe57b3..a212bfe764 100644 --- a/cuda_bindings/docs/source/environment_variables.rst +++ b/cuda_bindings/docs/source/environment_variables.rst @@ -18,4 +18,3 @@ Build-Time Environment Variables - ``CUDA_PYTHON_PARSER_CACHING`` : bool, toggles the caching of parsed header files during the cuda-bindings build process. If caching is enabled (``CUDA_PYTHON_PARSER_CACHING`` is True), the cache path is set to ./cache_, where is derived from the cuda toolkit libraries used to build cuda-bindings. - ``CUDA_PYTHON_PARALLEL_LEVEL`` (previously ``PARALLEL_LEVEL``) : int, sets the number of threads used in the compilation of extension modules. Not setting it or setting it to 0 would disable parallel builds. - diff --git a/cuda_bindings/docs/source/install.rst b/cuda_bindings/docs/source/install.rst index b9335b4876..b5181c6a35 100644 --- a/cuda_bindings/docs/source/install.rst +++ b/cuda_bindings/docs/source/install.rst @@ -35,7 +35,7 @@ Install all optional dependencies with: Where the optional dependencies include: -* ``nvidia-cuda-nvrtc`` (NVRTC runtime compilation library) +* ``nvidia-cuda-nvrtc`` (NVRTC runtime compilation library) * ``nvidia-nvjitlink`` (nvJitLink library) * ``nvidia-nvvm`` (NVVM library) * ``nvidia-cufile`` (cuFile library, Linux only) diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index 04e0390d12..8a4ba15ff8 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -2318,7 +2318,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_CACHE_MODE - Specifies whether to enable caching explicitly (-dlcm) + Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied :py:obj:`~.CUjit_cacheMode_enum`. @@ -3256,7 +3256,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT - Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. + Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set). @@ -3282,9 +3282,9 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION - Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. @@ -3292,11 +3292,11 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT - Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. + Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set). @@ -3304,11 +3304,11 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. @@ -3322,13 +3322,13 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. Valid values for :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled). diff --git a/cuda_bindings/docs/source/module/nvjitlink.rst b/cuda_bindings/docs/source/module/nvjitlink.rst index ff9bb1ea52..6eda063074 100644 --- a/cuda_bindings/docs/source/module/nvjitlink.rst +++ b/cuda_bindings/docs/source/module/nvjitlink.rst @@ -9,7 +9,7 @@ nvjitlink Note ---- -The nvjitlink bindings are not supported on nvJitLink installations <12.3. Ensure the installed CUDA toolkit's nvJitLink version is >=12.3. +The nvjitlink bindings are not supported on nvJitLink installations <12.3. Ensure the installed CUDA toolkit's nvJitLink version is >=12.3. Functions --------- diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst index 079cd39aad..d8bd6df229 100644 --- a/cuda_bindings/docs/source/module/nvrtc.rst +++ b/cuda_bindings/docs/source/module/nvrtc.rst @@ -232,7 +232,7 @@ Generate line-number information. - - ``--dopt=on``\ + - ``--dopt=on``\ Enable device code optimization. When specified along with ``-G``\ , enables limited debug information generation for optimized device code (currently, only line number information). When ``-G``\ is not specified, ``-dopt=on``\ is implicit. @@ -260,7 +260,7 @@ Specify the fast-compile level for device code, which controls the tradeoff betw - - ``--ptxas-options=``\ + - ``--ptxas-options=``\ Specify options directly to ptxas, the PTX optimizing assembler. @@ -308,7 +308,7 @@ For single-precision floating-point square root, use IEEE round-to-nearest mode - - Default: ``true``\ + - Default: ``true``\ @@ -783,4 +783,3 @@ Enable stack canaries in device code. Stack canaries make it more difficult to e - ``--fdevice-time-trace=``\ (``-fdevice-time-trace=``\ ) Enables the time profiler, outputting a JSON file based on given . Results can be analyzed on chrome://tracing for a flamegraph visualization. - diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index d155f85ebc..9ac22f660b 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -281,7 +281,7 @@ This section describes the unified addressing functions of the CUDA runtime appl -CUDA devices can share a unified address space with the host. +CUDA devices can share a unified address space with the host. For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below). @@ -307,7 +307,7 @@ Unified addressing is automatically enabled in 64-bit processes . It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cudaPointerGetAttributes() -Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. +Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value. @@ -321,7 +321,7 @@ Since pointers are unique, it is not necessary to specify information about the All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified. -The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. +The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. @@ -345,7 +345,7 @@ Upon enabling direct access from a device that supports unified addressing to an -Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. +Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. @@ -671,7 +671,7 @@ Note that the use of multiple ::CUcontext s per device within a single process w If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections. -The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. +The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. To use the pointer query and peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features. @@ -4697,7 +4697,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitCacheMode - Specifies whether to enable caching explicitly (-dlcm) + Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied :py:obj:`~.cudaJit_CacheMode`. @@ -5199,9 +5199,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUpload - Automatically upload the graph after instantiation. Only supported by + Automatically upload the graph after instantiation. Only supported by - :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the + :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the stream provided in `instantiateParams`. @@ -5209,9 +5209,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch - Instantiate the graph to be launchable from the device. This flag can only + Instantiate the graph to be launchable from the device. This flag can only - be used on platforms which support unified addressing. This flag cannot be + be used on platforms which support unified addressing. This flag cannot be used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch. @@ -5281,7 +5281,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). @@ -5307,9 +5307,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension - Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. @@ -5317,11 +5317,11 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). @@ -5329,11 +5329,11 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. @@ -5347,13 +5347,13 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. Valid values for :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled). diff --git a/cuda_bindings/docs/source/motivation.rst b/cuda_bindings/docs/source/motivation.rst index afbd3412d4..433cc16619 100644 --- a/cuda_bindings/docs/source/motivation.rst +++ b/cuda_bindings/docs/source/motivation.rst @@ -9,7 +9,7 @@ What is CUDA Python? NVIDIA’s CUDA Python provides `Cython `_ bindings and Python wrappers for the driver and runtime API for existing toolkits and libraries to simplify GPU-based accelerated processing. Python is one of the most popular -programming languages for science, engineering, data analytics, and deep +programming languages for science, engineering, data analytics, and deep learning applications. The goal of CUDA Python is to unify the Python ecosystem with a single set of interfaces that provide full coverage of and access to the CUDA host APIs from Python. @@ -25,18 +25,18 @@ science, and AI. `Anaconda `_ that can compile Python code for execution on CUDA-capable GPUs, provides Python developers with an easy entry into GPU-accelerated computing and a path for using increasingly sophisticated CUDA -code with a minimum of new syntax and jargon. Numba has its own CUDA driver API -bindings that can now be replaced with CUDA Python. With CUDA Python and Numba, +code with a minimum of new syntax and jargon. Numba has its own CUDA driver API +bindings that can now be replaced with CUDA Python. With CUDA Python and Numba, you get the best of both worlds: rapid iterative development with Python and the speed of a compiled language targeting both CPUs and NVIDIA GPUs. `CuPy `_ is a `NumPy `_/`SciPy `_ compatible Array library, from `Preferred Networks `_, for -GPU-accelerated computing with Python. CUDA Python simplifies the CuPy build -and allows for a faster and smaller memory footprint when importing the CuPy -Python module. In the future, when more CUDA Toolkit libraries are supported, -CuPy will have a lighter maintenance overhead and have fewer wheels to +GPU-accelerated computing with Python. CUDA Python simplifies the CuPy build +and allows for a faster and smaller memory footprint when importing the CuPy +Python module. In the future, when more CUDA Toolkit libraries are supported, +CuPy will have a lighter maintenance overhead and have fewer wheels to release. Users benefit from a faster CUDA runtime! Our goal is to help unify the Python CUDA ecosystem with a single standard set diff --git a/cuda_bindings/docs/source/overview.rst b/cuda_bindings/docs/source/overview.rst index 0f32032528..fdef83639b 100644 --- a/cuda_bindings/docs/source/overview.rst +++ b/cuda_bindings/docs/source/overview.rst @@ -60,7 +60,7 @@ The following code snippet lets us validate each API call and raise exceptions i return nvrtc.nvrtcGetErrorString(error)[1] else: raise RuntimeError('Unknown error type: {}'.format(error)) - + def checkCudaErrors(result): if result[0].value: raise RuntimeError("CUDA error code={}({})".format(result[0].value, _cudaGetErrorEnum(result[0]))) @@ -105,22 +105,22 @@ the program is compiled to target our local compute capability architecture with # Initialize CUDA Driver API checkCudaErrors(driver.cuInit(0)) - + # Retrieve handle for device 0 cuDevice = checkCudaErrors(driver.cuDeviceGet(0)) - + # Derive target architecture for device 0 major = checkCudaErrors(driver.cuDeviceGetAttribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice)) minor = checkCudaErrors(driver.cuDeviceGetAttribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice)) arch_arg = bytes(f'--gpu-architecture=compute_{major}{minor}', 'ascii') - + # Create program prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], [])) - + # Compile program opts = [b"--fmad=false", arch_arg] checkCudaErrors(nvrtc.nvrtcCompileProgram(prog, 2, opts)) - + # Get PTX from compilation ptxSize = checkCudaErrors(nvrtc.nvrtcGetPTXSize(prog)) ptx = b" " * ptxSize @@ -158,11 +158,11 @@ and from the device: NUM_THREADS = 512 # Threads per block NUM_BLOCKS = 32768 # Blocks per grid - + a = np.array([2.0], dtype=np.float32) n = np.array(NUM_THREADS * NUM_BLOCKS, dtype=np.uint32) bufferSize = n * a.itemsize - + hX = np.random.rand(n).astype(dtype=np.float32) hY = np.random.rand(n).astype(dtype=np.float32) hOut = np.zeros(n).astype(dtype=np.float32) @@ -182,9 +182,9 @@ by calling ``XX.ctypes.data`` for the associated XX: dXclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) dYclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) dOutclass = checkCudaErrors(driver.cuMemAlloc(bufferSize)) - + stream = checkCudaErrors(driver.cuStreamCreate(0)) - + checkCudaErrors(driver.cuMemcpyHtoDAsync( dXclass, hX.ctypes.data, bufferSize, stream )) @@ -233,7 +233,7 @@ Now the kernel can be launched: args.ctypes.data, # kernel arguments 0, # extra (ignore) )) - + checkCudaErrors(driver.cuMemcpyDtoHAsync( hOut.ctypes.data, dOutclass, bufferSize, stream )) @@ -304,7 +304,7 @@ maximize performance ({numref}``Figure 1``). .. figure:: _static/images/Nsight-Compute-CLI-625x473.png :name: Figure 1 - + Screenshot of Nsight Compute CLI output of ``cuda.bindings`` example. Preparing kernel arguments @@ -331,7 +331,7 @@ Let's use the following kernel definition as an example: typedef struct { int value; } testStruct; - + extern "C" __global__ void testkernel(int i, int *pi, float f, float *pf, @@ -347,7 +347,7 @@ The first step is to create array objects with types corresponding to your kerne .. list-table:: Correspondence between NumPy types and kernel types. :header-rows: 1 - + * - NumPy type - Corresponding kernel types - itemsize (bytes) @@ -410,12 +410,12 @@ Putting it all together: # Define a custom type testStruct = np.dtype([("value", np.int32)], align=True) - + # Allocate device memory pInt = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.int32).itemsize)) pFloat = checkCudaErrors(cudart.cudaMalloc(np.dtype(np.float32).itemsize)) pStruct = checkCudaErrors(cudart.cudaMalloc(testStruct.itemsize)) - + # Collect all input kernel arguments into a single tuple for further processing kernelValues = ( np.array(1, dtype=np.uint32), @@ -470,12 +470,12 @@ For this example the result becomes: # Define a custom type class testStruct(ctypes.Structure): _fields_ = [("value", ctypes.c_int)] - + # Allocate device memory pInt = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(ctypes.c_int))) pFloat = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(ctypes.c_float))) pStruct = checkCudaErrors(cudart.cudaMalloc(ctypes.sizeof(testStruct))) - + # Collect all input kernel arguments into a single tuple for further processing kernelValues = ( 1, @@ -531,7 +531,7 @@ For this example, lets use the ``transformKernel`` from `examples/0_Introduction ... } """ - + def main(): ... d_data = checkCudaErrors(cudart.cudaMalloc(size)) @@ -565,4 +565,3 @@ For ctypes, we leverage the special handling of ``None`` type since each Python None, ) kernelArgs = (kernelValues, kernelTypes) - diff --git a/cuda_bindings/docs/source/release/11.6.0-notes.rst b/cuda_bindings/docs/source/release/11.6.0-notes.rst index d7907df847..bcc8944e1f 100644 --- a/cuda_bindings/docs/source/release/11.6.0-notes.rst +++ b/cuda_bindings/docs/source/release/11.6.0-notes.rst @@ -79,4 +79,3 @@ CUDA Functions Not Supported in this Release - cudaVDPAUSetVDPAUDevice .. note:: Deprecated APIs are removed from tracking - diff --git a/cuda_bindings/docs/source/release/12.9.X-notes.rst b/cuda_bindings/docs/source/release/12.9.X-notes.rst index 3be22a695a..a3a640f531 100644 --- a/cuda_bindings/docs/source/release/12.9.X-notes.rst +++ b/cuda_bindings/docs/source/release/12.9.X-notes.rst @@ -18,4 +18,4 @@ Highlights Known issues ------------ -* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. \ No newline at end of file +* Updating from older versions (v12.6.2.post1 and below) via ``pip install -U cuda-python`` might not work. Please do a clean re-installation by uninstalling ``pip uninstall -y cuda-python`` followed by installing ``pip install cuda-python``. diff --git a/cuda_bindings/docs/source/tips_and_tricks.rst b/cuda_bindings/docs/source/tips_and_tricks.rst index cc666ca275..1a77eb53fd 100644 --- a/cuda_bindings/docs/source/tips_and_tricks.rst +++ b/cuda_bindings/docs/source/tips_and_tricks.rst @@ -27,9 +27,9 @@ All of the Python classes do not manage the lifetime of the underlying CUDA C ob Getting and setting attributes of extension types ================================================= -While the bindings outwardly present the attributes of extension types in a pythonic way, they can't always be interacted with in a Pythonic style. Often the getters/setters (__getitem__(), __setitem__()) are actually a translation step to convert values between Python and C. For example, in some cases, attempting to modify an attribute in place, will lead to unexpected behavior due to the design of the underlying implementation. For this reason, users should use the getters and setters directly when interacting with extension types. +While the bindings outwardly present the attributes of extension types in a pythonic way, they can't always be interacted with in a Pythonic style. Often the getters/setters (__getitem__(), __setitem__()) are actually a translation step to convert values between Python and C. For example, in some cases, attempting to modify an attribute in place, will lead to unexpected behavior due to the design of the underlying implementation. For this reason, users should use the getters and setters directly when interacting with extension types. -An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. +An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. .. code-block:: python @@ -37,7 +37,7 @@ An example of this is the :class:`~cuda.bindings.driver.CULaunchConfig` type. cfg.numAttrs += 1 attr = cuda.CUlaunchAttribute() - + ... # This works. We are passing the new attribute to the setter diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml index 1a91b44b83..2637f0b0e7 100644 --- a/cuda_bindings/pyproject.toml +++ b/cuda_bindings/pyproject.toml @@ -84,10 +84,10 @@ select = [ ] ignore = [ - "UP006", - "UP007", + "UP006", + "UP007", "E741", # ambiguous variable name such as I - "B007", # rename unsued loop variable to _name + "B007", # rename unsued loop variable to _name "UP035" # UP006, UP007, UP035 complain about deprecated Typing. use, but disregard backward compatibility of python version ] @@ -103,13 +103,13 @@ exclude = ["cuda/bindings/_version.py"] ] "tests/**/*" = [ - "E722", + "E722", "UP022", "E402", # module level import not at top of file "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example) "benchmarks/**/*" = [ - "E722", + "E722", "UP022", "E402", # module level import not at top of file "F841"] # F841 complains about unused variables, but some assignments have side-effects that could be useful for tests (func calls for example) diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx index e735630386..0bb40bf404 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx @@ -215,7 +215,7 @@ cdef class ParamHolder: if isinstance(arg.handle, int): # see note below on handling int arguments prepare_arg[intptr_t](self.data, self.data_addresses, arg.handle, i) - continue + continue else: # it's a CUdeviceptr: self.data_addresses[i] = (arg.handle.getPtr()) diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory.pyx index ddcf7665e1..3eb80875e8 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory.pyx @@ -137,14 +137,14 @@ cdef class Buffer: """ if stream is None: raise ValueError("stream must be provided") - + cdef size_t src_size = self._size - + if dst is None: if self._mr is None: raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") dst = self._mr.allocate(src_size, stream) - + cdef size_t dst_size = dst._size if dst_size != src_size: raise ValueError( @@ -168,10 +168,10 @@ cdef class Buffer: """ if stream is None: raise ValueError("stream must be provided") - + cdef size_t dst_size = self._size cdef size_t src_size = src._size - + if src_size != dst_size: raise ValueError( f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 9d24133052..ea8fb01b67 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -29,20 +29,20 @@ cdef class StridedMemoryView: This object supports both DLPack (up to v1.0) and CUDA Array Interface (CAI) v3. When wrapping an arbitrary object it will try the DLPack protocol first, then the CAI protocol. A :obj:`BufferError` is raised if neither is - supported. - + supported. + Since either way would take a consumer stream, for DLPack it is passed to - ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a + ``obj.__dlpack__()`` as-is (except for :obj:`None`, see below); for CAI, a stream order will be established between the consumer stream and the - producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if - ``cudaStreamWaitEvent`` is called by this method. - - To opt-out of the stream ordering operation in either DLPack or CAI, - please pass ``stream_ptr=-1``. Note that this deviates (on purpose) + producer stream (from ``obj.__cuda_array_interface__()["stream"]``), as if + ``cudaStreamWaitEvent`` is called by this method. + + To opt-out of the stream ordering operation in either DLPack or CAI, + please pass ``stream_ptr=-1``. Note that this deviates (on purpose) from the semantics of ``obj.__dlpack__(stream=None, ...)`` since ``cuda.core`` - does not encourage using the (legacy) default/null stream, but is + does not encourage using the (legacy) default/null stream, but is consistent with the CAI's semantics. For DLPack, ``stream=-1`` will be - internally passed to ``obj.__dlpack__()`` instead. + internally passed to ``obj.__dlpack__()`` instead. Attributes ---------- @@ -79,16 +79,16 @@ cdef class StridedMemoryView: bint is_device_accessible bint readonly object exporting_obj - - # If using dlpack, this is a strong reference to the result of - # obj.__dlpack__() so we can lazily create shape and strides from - # it later. If using CAI, this is a reference to the source + + # If using dlpack, this is a strong reference to the result of + # obj.__dlpack__() so we can lazily create shape and strides from + # it later. If using CAI, this is a reference to the source # `__cuda_array_interface__` object. cdef object metadata # The tensor object if has obj has __dlpack__, otherwise must be NULL cdef DLTensor *dl_tensor - + # Memoized properties cdef tuple _shape cdef tuple _strides @@ -110,7 +110,7 @@ cdef class StridedMemoryView: if self._shape is None and self.exporting_obj is not None: if self.dl_tensor != NULL: self._shape = cuda_utils.carray_int64_t_to_tuple( - self.dl_tensor.shape, + self.dl_tensor.shape, self.dl_tensor.ndim ) else: @@ -127,7 +127,7 @@ cdef class StridedMemoryView: if self.dl_tensor != NULL: if self.dl_tensor.strides: self._strides = cuda_utils.carray_int64_t_to_tuple( - self.dl_tensor.strides, + self.dl_tensor.strides, self.dl_tensor.ndim ) else: @@ -140,7 +140,7 @@ cdef class StridedMemoryView: self._strides, i, strides[i] // itemsize ) self._strides_init = True - return self._strides + return self._strides @property def dtype(self) -> Optional[numpy.dtype]: diff --git a/cuda_core/docs/source/_templates/autosummary/dataclass.rst b/cuda_core/docs/source/_templates/autosummary/dataclass.rst index 5931265888..efb115c831 100644 --- a/cuda_core/docs/source/_templates/autosummary/dataclass.rst +++ b/cuda_core/docs/source/_templates/autosummary/dataclass.rst @@ -10,4 +10,3 @@ {% block methods %} .. automethod:: __init__ {% endblock %} - diff --git a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst index 1695beef0a..7ee8a09a1c 100644 --- a/cuda_core/docs/source/_templates/autosummary/namedtuple.rst +++ b/cuda_core/docs/source/_templates/autosummary/namedtuple.rst @@ -8,4 +8,4 @@ .. autoclass:: {{ objname }} :members: __new__ :special-members: __new__ - :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ \ No newline at end of file + :exclude-members: count, index, __reduce__, __reduce_ex__, __repr__, __hash__, __str__, __getnewargs__ diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index 502ea66375..47f8d193a9 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -60,7 +60,7 @@ Don't forget to use :meth:`Device.set_current`! import cupy as cp from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch - + dev = Device() dev.set_current() s = dev.create_stream() @@ -82,14 +82,14 @@ We're using `CuPy `_ arrays as inputs for this example, but y .. code-block:: python ker = mod.get_kernel("vector_add") - + # Prepare input/output arrays (using CuPy) size = 50000 rng = cp.random.default_rng() a = rng.random(size, dtype=cp.float32) b = rng.random(size, dtype=cp.float32) c = cp.empty_like(a) - + # Configure launch parameters block = 256 grid = (size + block - 1) // block diff --git a/cuda_core/docs/source/install.rst b/cuda_core/docs/source/install.rst index 8bc1faa0e1..3241f563f7 100644 --- a/cuda_core/docs/source/install.rst +++ b/cuda_core/docs/source/install.rst @@ -10,10 +10,10 @@ Runtime Requirements ``cuda.core`` is supported on all platforms that CUDA is supported. Specific dependencies are as follows: -.. list-table:: +.. list-table:: :header-rows: 1 - * - + * - - CUDA 11 - CUDA 12 * - CUDA Toolkit\ [#f1]_ diff --git a/cuda_core/docs/source/release/0.1.1-notes.rst b/cuda_core/docs/source/release/0.1.1-notes.rst index 5434d726ea..f9ac2b5ccd 100644 --- a/cuda_core/docs/source/release/0.1.1-notes.rst +++ b/cuda_core/docs/source/release/0.1.1-notes.rst @@ -2,7 +2,7 @@ .. SPDX-License-Identifier: Apache-2.0 .. currentmodule:: cuda.core.experimental - + ``cuda.core`` 0.1.1 Release Notes ================================= diff --git a/cuda_core/docs/source/release/0.3.2-notes.rst b/cuda_core/docs/source/release/0.3.2-notes.rst index 8b4763ed31..b1b087dbb7 100644 --- a/cuda_core/docs/source/release/0.3.2-notes.rst +++ b/cuda_core/docs/source/release/0.3.2-notes.rst @@ -25,7 +25,7 @@ None. New features ------------ -- :class:`Stream` and :class:`Event` can be subclassed now. +- :class:`Stream` and :class:`Event` can be subclassed now. New examples diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index 3506ce0255..76e312b0d7 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -97,10 +97,10 @@ select = [ ] ignore = [ - "UP006", - "UP007", + "UP006", + "UP007", "E741", # ambiguous variable name such as I - "B007", # rename unsued loop variable to _name + "B007", # rename unsued loop variable to _name "UP035" # UP006, UP007, UP035 complain about deprecated Typing. use, but disregard backward compatibility of python version ] diff --git a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx index d1764d3bba..0c3921e925 100644 --- a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx +++ b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx @@ -23,7 +23,7 @@ cdef extern from "utility.hpp": def test_get_cuda_native_handle(): dev = Device(0) dev.set_current() - + s = dev.create_stream() cdef pyCUstream s_py = s.handle cdef CUstream s_c = get_cuda_native_handle(s_py) diff --git a/cuda_python/LICENSE b/cuda_python/LICENSE index b7d042fcee..a5a65097cd 100644 --- a/cuda_python/LICENSE +++ b/cuda_python/LICENSE @@ -2,46 +2,46 @@ NVIDIA SOFTWARE LICENSE This license is a legal agreement between you and NVIDIA Corporation ("NVIDIA") and governs your use of the NVIDIA CUDA Python software and materials provided hereunder ("SOFTWARE"). -This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. +This license can be accepted only by an adult of legal age of majority in the country in which the SOFTWARE is used. If you are under the legal age of majority, you must ask your parent or legal guardian to consent to this license. By taking delivery of the SOFTWARE, you affirm that you have reached the legal age of majority, you accept the terms of this license, and you take legal and financial responsibility for the actions of your permitted users. You agree to use the SOFTWARE only for purposes that are permitted by (a) this license, and (b) any applicable law, regulation or generally accepted practices or guidelines in the relevant jurisdictions. 1. LICENSE. Subject to the terms of this license, NVIDIA grants you a non-exclusive limited license to: (a) install and use the SOFTWARE, and (b) distribute the SOFTWARE subject to the distribution requirements described in this license. NVIDIA reserves all rights, title and interest in and to the SOFTWARE not expressly granted to you under this license. -2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: -a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. -b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. +2. DISTRIBUTION REQUIREMENTS. These are the distribution requirements for you to exercise the distribution grant: +a. The terms under which you distribute the SOFTWARE must be consistent with the terms of this license, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA's intellectual property rights. +b. You agree to notify NVIDIA in writing of any known or suspected distribution or use of the SOFTWARE not in compliance with the requirements of this license, and to enforce the terms of your agreements with respect to distributed SOFTWARE. 3. LIMITATIONS. Your license to use the SOFTWARE is restricted as follows: a. The SOFTWARE is licensed for you to develop applications only for use in systems with NVIDIA GPUs. -b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. -c. You may not modify or create derivative works of any portion of the SOFTWARE. +b. You may not reverse engineer, decompile or disassemble, or remove copyright or other proprietary notices from any portion of the SOFTWARE or copies of the SOFTWARE. +c. You may not modify or create derivative works of any portion of the SOFTWARE. d. You may not bypass, disable, or circumvent any technical measure, encryption, security, digital rights management or authentication mechanism in the SOFTWARE. e. You may not use the SOFTWARE in any manner that would cause it to become subject to an open source software license. As examples, licenses that require as a condition of use, modification, and/or distribution that the SOFTWARE be (i) disclosed or distributed in source code form; (ii) licensed for the purpose of making derivative works; or (iii) redistributable at no charge. -f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. -g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. +f. Unless you have an agreement with NVIDIA for this purpose, you may not use the SOFTWARE with any system or application where the use or failure of the system or application can reasonably be expected to threaten or result in personal injury, death, or catastrophic loss. Examples include use in avionics, navigation, military, medical, life support or other life critical applications. NVIDIA does not design, test or manufacture the SOFTWARE for these critical uses and NVIDIA shall not be liable to you or any third party, in whole or in part, for any claims or damages arising from such uses. +g. You agree to defend, indemnify and hold harmless NVIDIA and its affiliates, and their respective employees, contractors, agents, officers and directors, from and against any and all claims, damages, obligations, losses, liabilities, costs or debt, fines, restitutions and expenses (including but not limited to attorney's fees and costs incident to establishing the right of indemnification) arising out of or related to use of the SOFTWARE outside of the scope of this Agreement, or not in compliance with its terms. -4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. +4. PRE-RELEASE. SOFTWARE versions identified as alpha, beta, preview, early access or otherwise as pre-release may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability, and reliability standards relative to commercial versions of NVIDIA software and materials. You may use a pre-release SOFTWARE version at your own risk, understanding that these versions are not intended for use in production or business-critical systems. 5. OWNERSHIP. The SOFTWARE and the related intellectual property rights therein are and will remain the sole and exclusive property of NVIDIA or its licensors. The SOFTWARE is copyrighted and protected by the laws of the United States and other countries, and international treaty provisions. NVIDIA may make changes to the SOFTWARE, at any time without notice, but is not obligated to support or update the SOFTWARE. - -6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. + +6. COMPONENTS UNDER OTHER LICENSES. The SOFTWARE may include NVIDIA or third-party components with separate legal notices or terms as may be described in proprietary notices accompanying the SOFTWARE. If and to the extent there is a conflict between the terms in this license and the license terms associated with a component, the license terms associated with the components control only to the extent necessary to resolve the conflict. 7. FEEDBACK. You may, but don't have to, provide to NVIDIA any Feedback. "Feedback" means any suggestions, bug fixes, enhancements, modifications, feature requests or other feedback regarding the SOFTWARE. For any Feedback that you voluntarily provide, you hereby grant NVIDIA and its affiliates a perpetual, non-exclusive, worldwide, irrevocable license to use, reproduce, modify, license, sublicense (through multiple tiers of sublicensees), and distribute (through multiple tiers of distributors) the Feedback without the payment of any royalties or fees to you. NVIDIA will use Feedback at its choice. -8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. +8. NO WARRANTIES. THE SOFTWARE IS PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING, BUT NOT LIMITED TO, WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS OR THAT THE OPERATION THEREOF WILL BE UNINTERRUPTED OR ERROR-FREE, OR THAT ALL ERRORS WILL BE CORRECTED. + +9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. -9. LIMITATIONS OF LIABILITY. TO THE MAXIMUM EXTENT PERMITTED BY LAW, NVIDIA AND ITS AFFILIATES SHALL NOT BE LIABLE FOR ANY SPECIAL, INCIDENTAL, PUNITIVE OR CONSEQUENTIAL DAMAGES, OR ANY LOST PROFITS, PROJECT DELAYS, LOSS OF USE, LOSS OF DATA OR LOSS OF GOODWILL, OR THE COSTS OF PROCURING SUBSTITUTE PRODUCTS, ARISING OUT OF OR IN CONNECTION WITH THIS LICENSE OR THE USE OR PERFORMANCE OF THE SOFTWARE, WHETHER SUCH LIABILITY ARISES FROM ANY CLAIM BASED UPON BREACH OF CONTRACT, BREACH OF WARRANTY, TORT (INCLUDING NEGLIGENCE), PRODUCT LIABILITY OR ANY OTHER CAUSE OF ACTION OR THEORY OF LIABILITY, EVEN IF NVIDIA HAS PREVIOUSLY BEEN ADVISED OF, OR COULD REASONABLY HAVE FORESEEN, THE POSSIBILITY OF SUCH DAMAGES. IN NO EVENT WILL NVIDIA'S AND ITS AFFILIATES TOTAL CUMULATIVE LIABILITY UNDER OR ARISING OUT OF THIS LICENSE EXCEED US$10.00. THE NATURE OF THE LIABILITY OR THE NUMBER OF CLAIMS OR SUITS SHALL NOT ENLARGE OR EXTEND THIS LIMIT. +10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. -10. TERMINATION. Your rights under this license will terminate automatically without notice from NVIDIA if you fail to comply with any term and condition of this license or if you commence or participate in any legal proceeding against NVIDIA with respect to the SOFTWARE. NVIDIA may terminate this license with advance written notice to you if NVIDIA decides to no longer provide the SOFTWARE in a country or, in NVIDIA's sole discretion, the continued use of it is no longer commercially viable. Upon any termination of this license, you agree to promptly discontinue use of the SOFTWARE and destroy all copies in your possession or control. Your prior distributions in accordance with this license are not affected by the termination of this license. All provisions of this license will survive termination, except for the license granted to you. +11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. -11. APPLICABLE LAW. This license will be governed in all respects by the laws of the United States and of the State of Delaware as those laws are applied to contracts entered into and performed entirely within Delaware by Delaware residents, without regard to the conflicts of laws principles. The United Nations Convention on Contracts for the International Sale of Goods is specifically disclaimed. You agree to all terms of this Agreement in the English language. The state or federal courts residing in Santa Clara County, California shall have exclusive jurisdiction over any dispute or claim arising out of this license. Notwithstanding this, you agree that NVIDIA shall still be allowed to apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. +12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. -12. NO ASSIGNMENT. This license and your rights and obligations thereunder may not be assigned by you by any means or operation of law without NVIDIA's permission. Any attempted assignment not approved by NVIDIA in writing shall be void and of no effect. - 13. EXPORT. The SOFTWARE is subject to United States export laws and regulations. You agree that you will not ship, transfer or export the SOFTWARE into any country, or use the SOFTWARE in any manner, prohibited by the United States Bureau of Industry and Security or economic sanctions regulations administered by the U.S. Department of Treasury's Office of Foreign Assets Control (OFAC), or any applicable export laws, restrictions or regulations. These laws include restrictions on destinations, end users and end use. By accepting this license, you confirm that you are not a resident or citizen of any country currently embargoed by the U.S. and that you are not otherwise prohibited from receiving the SOFTWARE. -14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. +14. GOVERNMENT USE. The SOFTWARE has been developed entirely at private expense and is "commercial items" consisting of "commercial computer software" and "commercial computer software documentation" provided with RESTRICTED RIGHTS. Use, duplication or disclosure by the U.S. Government or a U.S. Government subcontractor is subject to the restrictions in this license pursuant to DFARS 227.7202-3(a) or as set forth in subparagraphs (b)(1) and (2) of the Commercial Computer Software - Restricted Rights clause at FAR 52.227-19, as applicable. Contractor/manufacturer is NVIDIA, 2788 San Tomas Expressway, Santa Clara, CA 95051. 15. ENTIRE AGREEMENT. This license is the final, complete and exclusive agreement between the parties relating to the subject matter of this license and supersedes all prior or contemporaneous understandings and agreements relating to this subject matter, whether oral or written. If any court of competent jurisdiction determines that any provision of this license is illegal, invalid or unenforceable, the remaining provisions will remain in full force and effect. This license may only be modified in a writing signed by an authorized representative of each party. diff --git a/cuda_python/docs/source/release.rst b/cuda_python/docs/source/release.rst index c97e508c45..9e7a66a522 100644 --- a/cuda_python/docs/source/release.rst +++ b/cuda_python/docs/source/release.rst @@ -17,4 +17,3 @@ Release Notes 12.6.1 11.8.7 11.8.6 - From 8001d515abc45aeab5095bc2dc1322a01f0359a5 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 25 Aug 2025 14:29:01 -0700 Subject: [PATCH 6/7] git checkout main cuda_bindings/docs/source/module/*.rst --- cuda_bindings/docs/source/module/driver.rst | 26 +++++------ .../docs/source/module/nvjitlink.rst | 2 +- cuda_bindings/docs/source/module/nvrtc.rst | 7 +-- cuda_bindings/docs/source/module/runtime.rst | 44 +++++++++---------- 4 files changed, 40 insertions(+), 39 deletions(-) diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst index 8a4ba15ff8..04e0390d12 100644 --- a/cuda_bindings/docs/source/module/driver.rst +++ b/cuda_bindings/docs/source/module/driver.rst @@ -2318,7 +2318,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_CACHE_MODE - Specifies whether to enable caching explicitly (-dlcm) + Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied :py:obj:`~.CUjit_cacheMode_enum`. @@ -3256,7 +3256,7 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT - Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. + Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set). @@ -3282,9 +3282,9 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION - Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. @@ -3292,11 +3292,11 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT - Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. + Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set). @@ -3304,11 +3304,11 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. @@ -3322,13 +3322,13 @@ Data types used by CUDA driver .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. Valid values for :py:obj:`~.CUlaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled). diff --git a/cuda_bindings/docs/source/module/nvjitlink.rst b/cuda_bindings/docs/source/module/nvjitlink.rst index 6eda063074..ff9bb1ea52 100644 --- a/cuda_bindings/docs/source/module/nvjitlink.rst +++ b/cuda_bindings/docs/source/module/nvjitlink.rst @@ -9,7 +9,7 @@ nvjitlink Note ---- -The nvjitlink bindings are not supported on nvJitLink installations <12.3. Ensure the installed CUDA toolkit's nvJitLink version is >=12.3. +The nvjitlink bindings are not supported on nvJitLink installations <12.3. Ensure the installed CUDA toolkit's nvJitLink version is >=12.3. Functions --------- diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst index d8bd6df229..079cd39aad 100644 --- a/cuda_bindings/docs/source/module/nvrtc.rst +++ b/cuda_bindings/docs/source/module/nvrtc.rst @@ -232,7 +232,7 @@ Generate line-number information. - - ``--dopt=on``\ + - ``--dopt=on``\ Enable device code optimization. When specified along with ``-G``\ , enables limited debug information generation for optimized device code (currently, only line number information). When ``-G``\ is not specified, ``-dopt=on``\ is implicit. @@ -260,7 +260,7 @@ Specify the fast-compile level for device code, which controls the tradeoff betw - - ``--ptxas-options=``\ + - ``--ptxas-options=``\ Specify options directly to ptxas, the PTX optimizing assembler. @@ -308,7 +308,7 @@ For single-precision floating-point square root, use IEEE round-to-nearest mode - - Default: ``true``\ + - Default: ``true``\ @@ -783,3 +783,4 @@ Enable stack canaries in device code. Stack canaries make it more difficult to e - ``--fdevice-time-trace=``\ (``-fdevice-time-trace=``\ ) Enables the time profiler, outputting a JSON file based on given . Results can be analyzed on chrome://tracing for a flamegraph visualization. + diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst index 9ac22f660b..d155f85ebc 100644 --- a/cuda_bindings/docs/source/module/runtime.rst +++ b/cuda_bindings/docs/source/module/runtime.rst @@ -281,7 +281,7 @@ This section describes the unified addressing functions of the CUDA runtime appl -CUDA devices can share a unified address space with the host. +CUDA devices can share a unified address space with the host. For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below). @@ -307,7 +307,7 @@ Unified addressing is automatically enabled in 64-bit processes . It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cudaPointerGetAttributes() -Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. +Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value. @@ -321,7 +321,7 @@ Since pointers are unique, it is not necessary to specify information about the All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified. -The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. +The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. @@ -345,7 +345,7 @@ Upon enabling direct access from a device that supports unified addressing to an -Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. +Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. @@ -671,7 +671,7 @@ Note that the use of multiple ::CUcontext s per device within a single process w If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections. -The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. +The function cudaPointerGetAttributes() will return the error cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a non-primary context. The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary ::CUcontext is current. To use the pointer query and peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features. @@ -4697,7 +4697,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitCacheMode - Specifies whether to enable caching explicitly (-dlcm) + Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied :py:obj:`~.cudaJit_CacheMode`. @@ -5199,9 +5199,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUpload - Automatically upload the graph after instantiation. Only supported by + Automatically upload the graph after instantiation. Only supported by - :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the + :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the stream provided in `instantiateParams`. @@ -5209,9 +5209,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch - Instantiate the graph to be launchable from the device. This flag can only + Instantiate the graph to be launchable from the device. This flag can only - be used on platforms which support unified addressing. This flag cannot be + be used on platforms which support unified addressing. This flag cannot be used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch. @@ -5281,7 +5281,7 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). @@ -5307,9 +5307,9 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension - Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. + Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. - Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. + Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted. @@ -5317,11 +5317,11 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent - Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. + Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event. - Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. + Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. - A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. + A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set). @@ -5329,11 +5329,11 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode - Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. + Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. - :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. + :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. - Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. + Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again. @@ -5347,13 +5347,13 @@ Data types used by CUDA Runtime .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling - Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. + Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. - When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. + When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. - This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. + This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. Valid values for :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled). From 44792945639a2ba03f378017072ce160f950f95d Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Mon, 25 Aug 2025 14:32:55 -0700 Subject: [PATCH 7/7] Add `$|cuda_bindings/docs/source/module/.*\.rst?$` to `gen_exclude` --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 56823c88e2..0479db6a59 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,7 +12,7 @@ ci: skip: [bandit] submodules: false -gen_exclude: &gen_exclude '^cuda_bindings/cuda/bindings/.*\.in?$' +gen_exclude: &gen_exclude '^cuda_bindings/cuda/bindings/.*\.in?$|cuda_bindings/docs/source/module/.*\.rst?$' # Please update the rev: SHAs below with this command: # pre-commit autoupdate --freeze