Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ llms-full.txt
.vscode/
*.swp
.DS_Store

build/
26 changes: 26 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
FROM python:3.12-slim

# Install uv (fast installer) without keeping a pip wheel cache in the layer.
RUN pip install --no-cache-dir uv

# Create a non-root user to run the tool.
RUN useradd -m -u 1000 codeassureuser

WORKDIR /app

# Copy only what the install needs.
COPY pyproject.toml ./
COPY sast_verify/ ./sast_verify/
# The ENTRYPOINT below hardcodes /app/codeassure.json, so the default config
# must be baked into the image (it can still be overridden at runtime with a
# bind mount over the same path). Leaving this COPY commented out would make
# the container fail to start without a mount.
COPY codeassure.json ./

# Install the package into the system environment (no venv inside the image).
RUN uv pip install --system --no-cache .

# Hand ownership of the install directory to the non-root user.
RUN chown -R codeassureuser:codeassureuser /app

USER codeassureuser

# Scans run against the mounted workspace, not the install directory.
WORKDIR /workspace

ENTRYPOINT ["codeassure", "--config", "/app/codeassure.json"]
40 changes: 40 additions & 0 deletions brev_docker_files/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
services:
  qwen35-server:
    image: vllm/vllm-openai:v0.17.1-x86_64-cu130
    ports:
      - "5000:5000"
    volumes:
      # Reuse the host Hugging Face cache so weights are not re-downloaded.
      - /home/shadeform/.cache/huggingface:/root/.cache/huggingface
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Items are appended to the image's `vllm serve` entrypoint; the first
    # (positional) item is the model to serve. Numeric values are quoted so
    # they are passed through as CLI strings.
    command:
      - Sehyo/Qwen3.5-122B-A10B-NVFP4
      - --served-model-name
      - qwen35-nvfp4
      - --swap-space
      - "16"
      - --max-num-seqs
      - "32"
      - --max-model-len
      - "65536"
      - --gpu-memory-utilization
      - "0.9"
      - --tensor-parallel-size
      - "1"
      - --language-model-only
      - --enable-auto-tool-choice
      - --tool-call-parser
      - qwen3_coder
      - --reasoning-parser
      - qwen3
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      - --port
      - "5000"
53 changes: 53 additions & 0 deletions brev_docker_files/docker-compose_nemotron.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
services:
  nemotron-server:
    image: vllm/vllm-openai:v0.17.1-x86_64-cu130
    ports:
      - "5000:5000"
    volumes:
      # NOTE(review): the HF cache is under /home/ubuntu while the parser
      # plugin is under /home/shadeform — these look copied from hosts with
      # different login users; confirm both paths exist on the target machine.
      - /home/ubuntu/.cache/huggingface:/root/.cache/huggingface
      - /home/shadeform/super_v3_reasoning_parser.py:/app/super_v3_reasoning_parser.py:ro
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Items are appended to the image's `vllm serve` entrypoint.
    command:
      - --model
      - nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4
      - --async-scheduling
      - --served-model-name
      - nvidia/nemotron-3-super
      - --dtype
      - auto
      - --kv-cache-dtype
      - fp8
      - --tensor-parallel-size
      - "1"
      - --pipeline-parallel-size
      - "1"
      - --data-parallel-size
      - "1"
      - --swap-space
      - "0"
      - --trust-remote-code
      - --attention-backend
      - TRITON_ATTN
      - --gpu-memory-utilization
      - "0.9"
      - --enable-chunked-prefill
      - --max-num-seqs
      - "512"
      - --host
      - "0.0.0.0"
      - --port
      - "5000"
      - --enable-auto-tool-choice
      - --tool-call-parser
      - qwen3_coder
      # Custom reasoning parser mounted read-only above.
      - --reasoning-parser-plugin
      - /app/super_v3_reasoning_parser.py
      - --reasoning-parser
      - super_v3
21 changes: 21 additions & 0 deletions brev_docker_files/init.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Host firewall lockdown: allow only SSH inbound via ufw, then restrict the
# Docker forwarding path (the DOCKER-USER chain) so published container
# ports are not reachable from outside, while container-to-container and
# container-to-internet traffic keeps working.

# Reset ufw to a known state: deny inbound, allow outbound, SSH only.
ufw --force reset
ufw default deny incoming
ufw default allow outgoing
ufw allow 22/tcp
ufw allow 2222/tcp
ufw allow in from 0.0.0.0/0 to any port 22
ufw --force enable

# Rebuild the DOCKER-USER chain from scratch.
iptables -F DOCKER-USER
# Allow reply traffic for connections the containers initiated.
iptables -A DOCKER-USER -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT
# Allow container-originated traffic leaving each bridge interface type
# (default docker0 bridge, user bridges, CNI, Calico).
iptables -A DOCKER-USER -i docker0 ! -o docker0 -j ACCEPT
iptables -A DOCKER-USER -i br+ ! -o br+ -j ACCEPT
iptables -A DOCKER-USER -i cni+ ! -o cni+ -j ACCEPT
iptables -A DOCKER-USER -i cali+ ! -o cali+ -j ACCEPT
# Allow traffic that stays on the same bridge (container <-> container).
iptables -A DOCKER-USER -i docker0 -o docker0 -j ACCEPT
iptables -A DOCKER-USER -i br+ -o br+ -j ACCEPT
iptables -A DOCKER-USER -i cni+ -o cni+ -j ACCEPT
iptables -A DOCKER-USER -i cali+ -o cali+ -j ACCEPT
iptables -A DOCKER-USER -i lo -j ACCEPT
# Drop everything else (i.e. external traffic to published ports).
# The original script also appended `-j RETURN` after this rule; DROP is a
# terminal target, so that rule was unreachable and has been removed.
iptables -A DOCKER-USER -j DROP
28 changes: 28 additions & 0 deletions brev_docker_files/super_v3_reasoning_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager
from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser


@ReasoningParserManager.register_module("super_v3")
class SuperV3ReasoningParser(DeepSeekR1ReasoningParser):
    """Reasoning parser for Nemotron Super v3.

    The inherited ``deepseek_r1`` parser puts everything into the reasoning
    content whenever it cannot find a ``</think>`` delimiter — acceptable for
    DeepSeek R1, which was never run with thinking disabled. Nemotron 3
    Nano/Super can run with ``enable_thinking=false`` via chat-template
    kwargs, and there are also rare cases where generation stops before the
    end-think token (e.g. reasoning exceeds max length), leaving empty
    content. In both situations this subclass reclassifies the parsed text
    as final content instead of reasoning.
    """

    def extract_reasoning(self, model_output, request):
        reasoning, content = super().extract_reasoning(model_output, request)

        # Chat-template kwargs may be absent or empty on the request.
        kwargs = getattr(request, "chat_template_kwargs", None) or {}
        thinking_disabled = kwargs.get("enable_thinking") is False
        wants_content = kwargs.get("force_nonempty_content") is True

        if content is None and (thinking_disabled or wants_content):
            # Move the non-empty text into content rather than returning
            # reasoning with an empty content field.
            reasoning, content = None, reasoning

        return reasoning, content
12 changes: 12 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Build the standalone codeassure binary with PyInstaller.
set -euo pipefail

main() {
    # Install the optional build extra (PyInstaller) into the system env.
    echo "==> Installing build deps..."
    uv pip install --system -e ".[build]"

    # Produce dist/codeassure from the spec file.
    echo "==> Building standalone binary..."
    pyinstaller codeassure.spec --clean

    echo ""
    echo "Binary ready: dist/codeassure"
    echo "Test it: ./dist/codeassure --help"
}

main "$@"
4 changes: 4 additions & 0 deletions build_entry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""Standalone entry point analyzed by PyInstaller (see codeassure.spec)."""
from sast_verify import cli

if __name__ == "__main__":
    cli.main()
2 changes: 1 addition & 1 deletion codeassure.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
"name": "qwen35-nvfp4",
"api_base": "http://localhost:5000/v1"
},
"concurrency": 16
"concurrency": 4
}
85 changes: 85 additions & 0 deletions codeassure.spec
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec for the codeassure standalone binary.
#
# Entry point is build_entry.py; `hiddenimports` lists modules PyInstaller's
# static analysis misses (dynamic imports inside pydantic-ai, the
# anthropic/openai SDKs, and the project's own packages), and `datas` bundles
# the package metadata those libraries read at runtime.
from PyInstaller.utils.hooks import copy_metadata

a = Analysis(
    ['build_entry.py'],
    pathex=[],
    binaries=[],
    # Distribution metadata queried via importlib.metadata at runtime.
    datas=[
        *copy_metadata('genai_prices'),
        *copy_metadata('pydantic_ai_slim'),
    ],
    hiddenimports=[
        # genai_prices (used by pydantic-ai messages.py at import time)
        'genai_prices',
        # pydantic / pydantic-ai
        'pydantic',
        'pydantic.v1',
        'pydantic_core',
        'pydantic_ai',
        'pydantic_ai.models',
        'pydantic_ai.models.openai',
        'pydantic_ai_slim',
        # anthropic SDK
        'anthropic',
        'anthropic._client',
        'anthropic.resources',
        # httpx (used by both anthropic and pydantic-ai)
        'httpx',
        'httpcore',
        # async
        'anyio',
        'anyio._backends._asyncio',
        'sniffio',
        # openai client (pydantic-ai-slim[openai])
        'openai',
        'openai._client',
        # project internals
        'sast_verify',
        'sast_verify.cli',
        'sast_verify.config',
        'sast_verify.pipeline',
        'sast_verify.preprocess',
        'sast_verify.retrieval',
        'sast_verify.schema',
        'sast_verify.agents',
        'sast_verify.agents.analyzer',
        'sast_verify.agents.runner',
        'sast_verify.agents.tools',
        'sast_verify.agents.deps',
        'sast_verify.prompts',
        'sast_verify.prompts.analyzer',
        'sast_verify.prompts.rule_policies',
        'sast_verify.eval',
        'sast_verify.eval.evaluate',
    ],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
)

pyz = PYZ(a.pure)

# One-file console executable; runtime_tmpdir=None keeps the default
# self-extraction location.
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.datas,
    [],
    name='codeassure',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
56 changes: 27 additions & 29 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
# Reconstructed post-change state of docker-compose.yml: the diff hunk
# interleaved removed nemotron-server lines with the added qwen35-server
# lines and diff metadata, which is not valid YAML as plain text.
services:
  qwen35-server:
    image: vllm/vllm-openai:v0.17.1-x86_64-cu130
    ports:
      - "5000:5000"
    volumes:
      # Reuse the host Hugging Face cache so weights are not re-downloaded.
      - /home/shadeform/.cache/huggingface:/root/.cache/huggingface
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    # Items are appended to the image's `vllm serve` entrypoint; the first
    # (positional) item is the model to serve.
    command:
      - Sehyo/Qwen3.5-122B-A10B-NVFP4
      - --served-model-name
      - qwen35-nvfp4
      - --swap-space
      - "16"
      - --max-num-seqs
      - "32"
      - --max-model-len
      - "65536"
      - --gpu-memory-utilization
      - "0.9"
      - --tensor-parallel-size
      - "1"
      - --language-model-only
      - --enable-auto-tool-choice
      - --tool-call-parser
      - qwen3_coder
      - --reasoning-parser
      - qwen3
      - --trust-remote-code
      - --host
      - "0.0.0.0"
      - --port
      - "5000"
10 changes: 10 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,13 +1,23 @@
# Build backend: plain setuptools (PEP 517/518).
[build-system]
requires = ["setuptools>=68"]
build-backend = "setuptools.build_meta"

# Package metadata for the codeassure CLI.
[project]
name = "codeassure"
version = "0.1.0"
description = "AI-powered SAST finding verification"
readme = "README.md"
requires-python = ">=3.11"
license = { text = "MIT" }
# Runtime dependencies; model clients plus pydantic for schemas.
dependencies = [
"pydantic-ai-slim[openai]",
"pydantic>=2.0",
"anthropic>=0.40.0",
]

# Extra used by build.sh to produce the standalone binary.
[project.optional-dependencies]
build = ["pyinstaller>=6.0"]

# Ship only the sast_verify package (distribution name differs: codeassure).
[tool.setuptools.packages.find]
include = ["sast_verify*"]

Expand Down
Loading