From 8b598bc9058f249655dcf611bafe6fdee5befdf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=84=E3=83=B3=E3=83=87=E3=83=AC?= Date: Thu, 9 Apr 2026 19:38:57 +0300 Subject: [PATCH] feat(compliance): add pii_masker skill using micro-f1-mask --- docs/skills/README.md | 7 ++ docs/skills/pii_masker.md | 75 +++++++++++++++++++ examples/pii_guardrail_flow.py | 53 ++++++++++++++ pyproject.toml | 2 +- skills/compliance/pii_masker/__init__.py | 3 + skills/compliance/pii_masker/card.json | 19 +++++ skills/compliance/pii_masker/instructions.md | 6 ++ skills/compliance/pii_masker/manifest.yaml | 23 ++++++ skills/compliance/pii_masker/skill.py | 77 ++++++++++++++++++++ tests/skills/compliance/test_pii_masker.py | 41 +++++++++++ 10 files changed, 305 insertions(+), 1 deletion(-) create mode 100644 docs/skills/pii_masker.md create mode 100644 examples/pii_guardrail_flow.py create mode 100644 skills/compliance/pii_masker/__init__.py create mode 100644 skills/compliance/pii_masker/card.json create mode 100644 skills/compliance/pii_masker/instructions.md create mode 100644 skills/compliance/pii_masker/manifest.yaml create mode 100644 skills/compliance/pii_masker/skill.py create mode 100644 tests/skills/compliance/test_pii_masker.py diff --git a/docs/skills/README.md b/docs/skills/README.md index 9c1f962..edf878e 100644 --- a/docs/skills/README.md +++ b/docs/skills/README.md @@ -31,6 +31,13 @@ Skills tailored for generating, parsing, and orchestrating large datasets for ma | :--- | :--- | :--- | | **[Synthetic Data Generator](synthetic_generator.md)** | `data_engineering/synthetic_generator` | Generates high-entropy structured synthetic data for model fine-tuning to avoid mode collapse. | +## Compliance +Enforces privacy, guardrails, and secure handling of sensitive data before it reaches external endpoints. 
+ +| Skill | ID | Description | +| :--- | :--- | :--- | +| **[PII Masker](pii_masker.md)** | `compliance/pii_masker` | High-precision, local PII (Personally Identifiable Information) detection and redaction using the micro-f1-mask model. | + --- ## 📥 Installing Skills diff --git a/docs/skills/pii_masker.md b/docs/skills/pii_masker.md new file mode 100644 index 0000000..4e8f116 --- /dev/null +++ b/docs/skills/pii_masker.md @@ -0,0 +1,75 @@ +# PII Masker + +**ID**: `compliance/pii_masker` +**Category**: Compliance + +High-precision, local PII (Personally Identifiable Information) detection and redaction using the `micro-f1-mask` model. This skill acts as a "Privacy Firewall" at the edge, scrubbing sensitive data before it reaches high-latency cloud models. + +> [!WARNING] +> **Disclaimer**: This skill and the underlying base model are provided for **demonstration and proof-of-concept purposes only**. +> Reaching production-grade 95%+ enterprise accuracy requires architectural optimizations, hard-negative mining, and dataset-specific fine-tuning. Full implementation of the `micro-f1-mask` privacy middleware should only happen after you rigorously fine-tune and test it exclusively with your own proprietary data structures. +> Visit the core project repository for training orchestration and full middleware execution: [github.com/arpahls/micro-f1-mask](https://github.com/arpahls/micro-f1-mask) + +## How It Works + +Agentic workflows inherently risk leaking sensitive user data (names, physical addresses, emails, crypto wallets, etc.) to external LLM providers. This skill solves this by utilizing a local [Ollama](https://ollama.com/) instance hosting the `arpacorp/micro-f1-mask` edge model. + +1. **Contextual Recognition**: Unlike rigid regex patterns, the 270M parameter model is trained to recognize syntactic structure and distinguish between generic information (e.g. "a specific date") and genuine PII (e.g. "a birth date"). +2. 
**Local Execution**: The text is evaluated entirely on your local node, ensuring that raw unencrypted data never touches the external internet. + +## Prerequisites + +- **Local Inference Support**: This skill uses the `requests` library to communicate entirely locally. +- **Ollama**: You must have [Ollama](https://ollama.com/) running. +- **Model**: You must pull the base privacy edge model before utilizing this skill: + ```bash + ollama run arpacorp/micro-f1-mask + ``` +*(Note for full-cycle setups: While Redis is a strict prerequisite for running the full standalone FastAPI bridge of the `micro-f1-mask` repository, it is **not** a prerequisite for invoking this specific `skillware` skill, as this skill performs the stateless scrubbing pass only.)* + +## Integration & Full Cycle Nuances + +Currently, this `pii_masker` skill functions primarily as a **Forward-Pass Scrubber** (Phase A). +When an agent calls this skill on a block of text, the skill returns a sanitized string with identifying markers (e.g., `[PERSON_1]`). + +**Stateless Design**: By default, this specific Skillware component is stateless. It performs the LLM call and tokenizes the output, but it *does not* automatically preserve the mapping in a local vault (like Redis). +For a complete End-to-End Enterprise integration (The "Full Cycle" ➔ Mask ➔ Send to Cloud ➔ Get Response ➔ Unmask), external developers should either: +- **Option A (Full Middleware Proxy):** Stand up the full standalone FastAPI bridge + Redis vault provided at the [micro-f1-mask repo](https://github.com/arpahls/micro-f1-mask) and point the agent's network traffic entirely through it. +- **Option B (Stateful Agent Logic):** Build custom logic within the calling agent that parses the detected entities returned from this skill's `metadata`, preserves them in its own internal session database or memory variables, invokes the cloud API, and strings-replaces the tags back onto the cloud response. 
For understanding how state/vault recovery works conceptually during this reconstruction phase, review the core project's dedicated [API Reference & Lifecycle Architecture](https://github.com/ARPAHLS/micro-f1-mask/blob/main/docs/API.md). + +## Arguments + +| Argument | Type | Required | Default | Description | +| :--- | :--- | :--- | :--- | :--- | +| `text` | string | Yes | - | The raw, sensitive input string. | +| `mode` | string | No | `mask` | Options: `mask` (e.g., `[PERSON]`), `redact` (e.g., `XXXX`), or `remove` (removes the token entirely). | +| `ollama_url` | string | No | `http://localhost:11434` | The URL for your local Ollama instance running the model. | + +## Supported Entity Types +The `micro-f1-mask` model detects a variety of entities, including but not limited to: +- Names (`[PERSON]`) +- Emails (`[EMAIL]`) +- Phone Numbers (`[PHONE]`) +- Physical Addresses (`[ADDRESS]`) +- Crypto Wallets (`[CRYPTO_ADDRESS]`) +- Identification Numbers (SSN, Passports, etc.) + +## Example Usage + +Input text: +```text +Hello John Doe, your wallet 0xabc123 has been verified. +``` + +JSON Return (mask mode): +```json +{ + "sanitized_text": "Hello [PERSON_1], your wallet [CRYPTO_ADDRESS] has been verified.", + "metadata": { + "detected_entities": ["PERSON", "CRYPTO_ADDRESS"], + "entity_count": 2, + "security_level": "local-only", + "model": "arpacorp/micro-f1-mask" + } +} +``` diff --git a/examples/pii_guardrail_flow.py b/examples/pii_guardrail_flow.py new file mode 100644 index 0000000..1fd220e --- /dev/null +++ b/examples/pii_guardrail_flow.py @@ -0,0 +1,53 @@ +""" +Example Usage: Local PII Guardrail Flow +======================================= + +This script demonstrates how to intercept an outgoing LLM prompt containing sensitive +user information, run it through the `compliance/pii_masker` skill locally (which leverages +the micro-f1-mask edge model via Ollama), and then generate a secure payload for a cloud LLM API. 
+""" + +from skillware.core.loader import SkillLoader + + +def simulate_agentic_flow(): + # 1. The user provides a highly sensitive prompt + raw_user_input = ( + "Please schedule a meeting with Satoshi Nakamoto. " + "His email is satoshi@bitcoin.org and his wallet is 0x1234567890ABCDEF." + ) + print("--- 🔴 ORIGINAL RAW INPUT ---") + print(raw_user_input) + print() + + # 2. Load the Privacy Firewall Skill + print("[System] Loading compliance/pii_masker skill...") + pii_skill = SkillLoader.load_skill("compliance/pii_masker")["module"].PIIMaskerSkill() + + # 3. Intercept and Sanitize (Redact mode) + print("[System] Intercepting prompt...") + # NOTE: This requires Ollama running locally with the arpacorp/micro-f1-mask model. + # If Ollama is not running, the skill falls back to returning the original string. + result = pii_skill.execute({ + "text": raw_user_input, + "mode": "redact", # Change to "mask" to see entity tags like [PERSON_1] instead of XXXX + "ollama_url": "http://localhost:11434" + }) + + scrubbed_input = result["sanitized_text"] + metadata = result["metadata"] + + print("\n--- 🟢 SANITIZED PAYLOAD TO CLOUD ---") + print(scrubbed_input) + print("\n[System] Metadata:") + print(metadata) + print() + + # 4. Safe Cloud Invocation (Simulated) + # The external cloud provider (Google, Anthropic, OpenAI) never sees the raw PII. + print("[System] Calling External LLM with sanitized payload...") + print("... Done.") + + +if __name__ == "__main__": + simulate_agentic_flow() diff --git a/pyproject.toml b/pyproject.toml index 24a07f6..f22e42b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "skillware" -version = "0.2.2" +version = "0.2.3" description = "A framework for modular, self-contained AI skills." 
readme = "README.md" authors = [ diff --git a/skills/compliance/pii_masker/__init__.py b/skills/compliance/pii_masker/__init__.py new file mode 100644 index 0000000..64dffbe --- /dev/null +++ b/skills/compliance/pii_masker/__init__.py @@ -0,0 +1,3 @@ +from .skill import PIIMaskerSkill + +__all__ = ["PIIMaskerSkill"] diff --git a/skills/compliance/pii_masker/card.json b/skills/compliance/pii_masker/card.json new file mode 100644 index 0000000..3551a5c --- /dev/null +++ b/skills/compliance/pii_masker/card.json @@ -0,0 +1,19 @@ +{ + "name": "PII Masker", + "description": "Scrub sensitive PII from text before it hits external logic.", + "icon": "shield", + "color": "#2E86C1", + "ui_schema": { + "type": "card", + "fields": [ + { + "key": "mode", + "label": "Redaction Mode" + }, + { + "key": "ollama_url", + "label": "Ollama Endpoint" + } + ] + } +} diff --git a/skills/compliance/pii_masker/instructions.md b/skills/compliance/pii_masker/instructions.md new file mode 100644 index 0000000..68785c2 --- /dev/null +++ b/skills/compliance/pii_masker/instructions.md @@ -0,0 +1,6 @@ +# Privacy & Redaction Pipeline + +You are using the `compliance/pii_masker` skill. +Use this skill whenever you are handling text that contains sensitive user data (Names, Emails, Physical Addresses, Crypto Wallets, etc.) and you need to pass it to external tools, APIs, or less secure environments. + +This acts as a "Privacy Firewall". Depending on your use case, set the `mode` parameter to `mask` if you need to retain contextual semantic tags (e.g., `[PERSON_1]`), or `redact`/`remove` if you need to completely obscure the information before proceeding. 
diff --git a/skills/compliance/pii_masker/manifest.yaml b/skills/compliance/pii_masker/manifest.yaml new file mode 100644 index 0000000..7496de8 --- /dev/null +++ b/skills/compliance/pii_masker/manifest.yaml @@ -0,0 +1,23 @@ +name: compliance/pii_masker +version: 0.1.0 +description: High-precision, local PII detection and redaction using the micro-f1-mask model via Ollama. +parameters: + type: object + properties: + text: + type: string + description: The raw, sensitive input string. + mode: + type: string + description: "Options: mask, redact, or remove. Controls how the identified tokens are handled." + ollama_url: + type: string + description: The Ollama API endpoint (defaults to http://localhost:11434). + required: + - text +requirements: + - requests +constitution: | + 1. USER PRIVACY: Do not store or transmit sensitive PII externally; only use the specified local model. + 2. ACCURACY: In mask mode, perfectly preserve the structural tags (e.g., [PERSON_1]) for downstream replacement. + 3. COMPLIANCE: Adhere strictly to the chosen mode (mask/redact/remove). diff --git a/skills/compliance/pii_masker/skill.py b/skills/compliance/pii_masker/skill.py new file mode 100644 index 0000000..0c57237 --- /dev/null +++ b/skills/compliance/pii_masker/skill.py @@ -0,0 +1,77 @@ +import re +import requests +from typing import Any, Dict, List, Tuple +from skillware.core.base_skill import BaseSkill + + +class PIIMaskerSkill(BaseSkill): + """ + Skill to mask/redact PII from text using the arpacorp/micro-f1-mask model via Ollama. 
+ """ + + @property + def manifest(self) -> Dict[str, Any]: + return { + "name": "compliance/pii_masker", + "version": "0.1.0" + } + + def execute(self, params: Dict[str, Any]) -> Dict[str, Any]: + text = params.get("text", "") + mode = params.get("mode", "mask") + ollama_url = params.get("ollama_url", "http://localhost:11434") + + sanitized_text, detected_entities = self._call_ollama(text, ollama_url) + sanitized_text = self._apply_mode(sanitized_text, mode) + + # Build unique entity types list + entities = list(set([re.sub(r'_[0-9]+$', '', e) for e in detected_entities])) + + return { + "sanitized_text": sanitized_text, + "metadata": { + "detected_entities": entities, + "entity_count": len(detected_entities), + "security_level": "local-only", + "model": "arpacorp/micro-f1-mask" + } + } + + def _call_ollama(self, text: str, endpoint: str) -> Tuple[str, List[str]]: + try: + response = requests.post(f"{endpoint}/api/generate", json={ + "model": "arpacorp/micro-f1-mask", + "prompt": text, + "stream": False + }, timeout=30) + if response.status_code == 200: + result_text = response.json().get("response", text) + else: + # If Ollama is down or model missing, return original text as a fallback + # or we could throw an exception. We'll return the raw text if it fails + result_text = text + except requests.exceptions.RequestException: + # Fall back to returning the text unmasked if Ollama is unreachable. + # In a strict environment, you might want to block here. + result_text = text + + # Detect entities in the response + detected = re.findall(r'\[([A-Z_]+(?:_[0-9]+)?)\]', result_text) + return result_text, detected + + def _apply_mode(self, text: str, mode: str) -> str: + if mode == "mask": + return text + + # Pattern to catch [DOCUMENT], [PERSON_1], etc. 
+ pattern = r'\[[A-Z_]+(?:_[0-9]+)?\]' + if mode == "redact": + return re.sub(pattern, "XXXX", text) + elif mode == "remove": + # Replace token and any immediate preceding/following spaces safely + # A simple sub is sufficient. Cleaning up double spaces. + text = re.sub(pattern, "", text) + text = re.sub(r'\s+', ' ', text).strip() + return text + + return text diff --git a/tests/skills/compliance/test_pii_masker.py b/tests/skills/compliance/test_pii_masker.py new file mode 100644 index 0000000..237d944 --- /dev/null +++ b/tests/skills/compliance/test_pii_masker.py @@ -0,0 +1,41 @@ +from skillware.core.loader import SkillLoader + + +def test_pii_masker_manifest(): + bundle = SkillLoader.load_skill("compliance/pii_masker") + assert bundle["manifest"]["name"] == "compliance/pii_masker" + assert "text" in bundle["manifest"]["parameters"]["properties"] + assert "mode" in bundle["manifest"]["parameters"]["properties"] + + +def test_pii_masker_modes(mocker): + bundle = SkillLoader.load_skill("compliance/pii_masker") + skill_class = bundle["module"].PIIMaskerSkill + skill = skill_class() + + # Mock the Ollama API call + mock_response = "Hello [PERSON_1], your wallet [CRYPTO_ADDRESS] and [EMAIL] have been verified." + # The _call_ollama method returns (sanitized_text, [entities]) + mocker.patch.object(skill, '_call_ollama', return_value=(mock_response, ["PERSON_1", "CRYPTO_ADDRESS", "EMAIL"])) + + # Test Mask mode (default) + result_mask = skill.execute({"text": "Hello John Doe, your wallet 0xabc and john@doe.com have been verified."}) + expected_text = "Hello [PERSON_1], your wallet [CRYPTO_ADDRESS] and [EMAIL] have been verified." 
+    assert result_mask["sanitized_text"] == expected_text
+    assert "PERSON" in result_mask["metadata"]["detected_entities"]
+    assert "CRYPTO_ADDRESS" in result_mask["metadata"]["detected_entities"]
+
+    # Test Redact mode
+    result_redact = skill.execute({
+        "text": "Hello John Doe, your wallet 0xabc and john@doe.com have been verified.",
+        "mode": "redact"
+    })
+    assert result_redact["sanitized_text"] == "Hello XXXX, your wallet XXXX and XXXX have been verified."
+
+    # Test Remove mode
+    result_remove = skill.execute({
+        "text": "Hello John Doe, your wallet 0xabc and john@doe.com have been verified.",
+        "mode": "remove"
+    })
+    # Remove mode strips the tags entirely and collapses the surrounding whitespace.
+    assert result_remove["sanitized_text"] == "Hello , your wallet and have been verified."