diff --git a/README.md b/README.md
index 83856f0..1712580 100644
--- a/README.md
+++ b/README.md
@@ -15,11 +15,12 @@ EvalGuard is **tool-agnostic** but compatible with evaluation outputs from syste
EvalGuard provides:
-- **Schemas** for evaluation reports, tasks, metrics, and guardrails
+- **Schemas** for evaluation reports, tasks, metrics, policies, and guardrails
- **Configuration files** for:
+ - Model description and information
- Task descriptions and categories
- Metric types and interpretations
- - Thresholds for performance levels
+ - Policies with embedded performance thresholds
- Guardrails for operational constraints and policies
- Tags for capabilities, risk types, and domains
- **Annotated evaluation reports** (e.g., in JSON/YAML format)
@@ -54,7 +55,7 @@ evalguard/
├── config/ # Configuration files for interpretation
│ ├── tasks/ # Task definitions and metadata
│ ├── metrics/ # Metric definitions and types
-│ ├── thresholds/ # Performance thresholds
+│ ├── policies/ # Policy definitions
│ └── guardrails/ # Operational guardrails and policies
├── reports/ # Community-contributed model evaluation reports
│ └── lm-eval/ # lm-evaluation-harness reports
@@ -67,15 +68,62 @@ evalguard/
## Tools and CLI
-EvalGuard provides a CLI tool for schema validation and data generation. The tool helps with:
+EvalGuard provides a CLI tool for schema validation, data generation, and API model generation. The tool helps with:
- **Schema Validation**: Validate configuration files against EvalGuard schemas
- **Data Generation**: Generate tasks and metrics from evaluation reports
-- **Model Generation**: Generate TypeScript interfaces from schemas
+- **API Model Generation**: Generate Java and TypeScript models from OpenAPI schemas
- **Cross-Reference Validation**: Ensure consistency between tasks, metrics, and thresholds
The tool implements the requirements defined in the [EvalGuard Schema Specification](SPECIFICATION.md):
+## Policies
+
+EvalGuard includes a policy system that defines evaluation contexts and performance thresholds. Policies provide a structured way to organize thresholds and interpret model performance within specific evaluation contexts.
+
+### Policy Features
+
+- **Contextual Organization**: Policies group related thresholds and evaluation criteria
+- **Embedded Thresholds**: Performance thresholds are embedded within policy definitions
+- **Flexible Application**: Policies can be applied to specific tasks, metrics, or evaluation scenarios
+- **Standardized Interpretation**: Consistent threshold definitions across different evaluation contexts
+
+### Example Policy Structure
+
+```yaml
+# config/policies/default/policy.yaml
+id: default
+name: Default Policy
+description: Default policy for all contexts that don't define a specific policy.
+
+# config/policies/default/thresholds/truthfulqa_mc1.yaml
+task: truthfulqa_mc1
+thresholds:
+ acc:
+ - impact: very_low
+ min: 0.85
+ interpretation: High factual accuracy
+ - impact: moderate
+ min: 0.5
+ max: 0.85
+ interpretation: Moderate accuracy
+ - impact: severe
+ max: 0.5
+ interpretation: Low accuracy
+```
+
+### Policy Contextualization
+
+In EvalGuard, both thresholds and guardrails are organized under policies. This means:
+
+- **Policy-Based Organization**: Thresholds and guardrails are embedded within evaluation policies (e.g., "default", "enterprise", "research")
+- **Embedded Thresholds**: Thresholds are now part of the policy structure, not separate endpoints
+- **Model Card Contextualization**: When you request a model card, you specify a `policy_id` to get thresholds and guardrails appropriate for that specific evaluation context
+- **Flexible Interpretation**: Different policies can provide different threshold interpretations and guardrail requirements for the same metrics
+- **No Access Control**: Policies do not control API access or permissions - they only affect the content returned in model cards
+
+**Example**: Requesting a model card with `?policy_id=enterprise` will return enterprise-specific thresholds and guardrails, while `?policy_id=research` might return more permissive research-oriented ones.
+
## Guardrails
EvalGuard includes a guardrails system for defining operational constraints and policies that should be applied during model evaluation or deployment. Guardrails help mitigate risks, enforce quality standards, and guide model behavior.
@@ -110,7 +158,8 @@ EvalGuard defines a REST API specification for accessing evaluation reports. The
- **Model Discovery**: List available models and their evaluation history
- **Task Information**: Access task definitions and metadata
- **Metrics Access**: Retrieve performance metrics for specific reports
-- **Threshold Access**: Get performance thresholds for interpreting metric results
+- **Policy Access**: Get policies with embedded thresholds for interpreting metric results
+- **Policy Contextualization**: Thresholds are contextualized based on `policy_id` query parameters
- **Guardrails Access**: Retrieve operational guardrails and policies
> **Note**: This is a **specification only**. The API is not implemented in this repository. Anyone interested in providing EvalGuard API services can implement this specification.
@@ -127,8 +176,14 @@ curl "https://api.evalguard.org/v1/reports/llama-3.1-8b-instruct-eval-2025-01-15
# Get only metrics for a report
curl "https://api.evalguard.org/v1/reports/llama-3.1-8b-instruct-eval-2025-01-15/metrics"
-# Get thresholds for multiple tasks and metrics
-curl "https://api.evalguard.org/v1/thresholds?tasks=truthfulqa_mc1,winogender_schemas&metrics=acc,acc_norm,pct_stereotype"
+# Get policies with embedded thresholds for multiple tasks and metrics
+curl "https://api.evalguard.org/v1/policies?tasks=truthfulqa_mc1,winogender_schemas&metrics=acc,acc_norm,pct_stereotype"
+
+# Get model card with specific policy thresholds
+curl "https://api.evalguard.org/v1/models/llama-3.1-8b-instruct/card?policy_id=default"
+
+# Get specific policy with embedded thresholds
+curl "https://api.evalguard.org/v1/policies/default"
# List available models
curl "https://api.evalguard.org/v1/models"
@@ -160,6 +215,7 @@ evalguard config validate
# Validate specific types
evalguard config validate -t tasks
evalguard config validate -t metrics
+evalguard config validate -t policies
evalguard config validate -t thresholds
evalguard config validate -t guardrails
diff --git a/SPECIFICATION.md b/SPECIFICATION.md
index 5751fcb..765d7c9 100644
--- a/SPECIFICATION.md
+++ b/SPECIFICATION.md
@@ -12,9 +12,12 @@ This specification defines the EvalGuard schema system for model evaluation task
- [4. Schema Definitions](#4-schema-definitions)
- [4.1 Task Schema](#41-task-schema)
- [4.2 Metric Schema](#42-metric-schema)
- - [4.3 Threshold Schema](#43-threshold-schema)
+ - [4.3 Policy Schema](#43-policy-schema)
- [4.4 Report Schema](#44-report-schema)
- - [4.5 API Schema](#45-api-schema)
+ - [4.5 Guardrail Schema](#45-guardrail-schema)
+ - [4.6 Model Info Schema](#46-model-info-schema)
+ - [4.7 Model Card Schema](#47-model-card-schema)
+ - [4.8 API Schema](#48-api-schema)
- [5. Validation Rules](#5-validation-rules)
- [6. Schema File Organization](#6-schema-file-organization)
- [7. Schema Implementation](#7-schema-implementation)
@@ -35,9 +38,15 @@ The EvalGuard Schema Specification defines a standardized format for describing
This specification covers:
- Task definitions for model evaluation
- Metric definitions for performance measurement
-- Threshold definitions for performance interpretation
+- Policy definitions for evaluation contexts with embedded thresholds
+- Report structures for evaluation results
+- Guardrail definitions for operational constraints
+- Model information and model card schemas
+- REST API specification for data access
- Validation rules and constraints
- File organization and versioning
+- CLI tools for schema management
+- API model generation capabilities
### 1.3 Conformance
@@ -46,6 +55,8 @@ A conforming implementation MUST:
- Enforce all validation rules defined in this specification
- Support the current schema version (v1)
- Provide clear error messages for validation failures
+- Support CLI tools for schema validation and management
+- Enable API model generation for supported languages
## 2. Notations and Terminology
@@ -57,9 +68,14 @@ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "S
- **Task**: A specific evaluation activity that can be performed on a model
- **Metric**: A measurable quantity used to assess model performance
-- **Threshold**: A performance boundary that defines interpretation categories
+- **Policy**: An evaluation context that groups related thresholds and evaluation criteria
+- **Threshold**: A performance boundary that defines interpretation categories, embedded within policies
+- **Guardrail**: An operational constraint or rule applied during model evaluation or deployment
+- **Model Card**: Comprehensive documentation of a model's capabilities and evaluation results
- **Schema**: A formal definition of data structure and validation rules
- **Validation**: The process of verifying data conforms to schema rules
+- **CLI**: Command Line Interface for schema management and validation
+- **API Models**: Generated language-specific models from OpenAPI schemas
## 3. Schema Versions
@@ -75,7 +91,13 @@ The current schema version is **v1**, located in `schemas/v1/`. This version pro
- **Task Schema**: Defines evaluation tasks and their metadata
- **Metric Schema**: Defines evaluation metrics and their properties
-- **Threshold Schema**: Defines performance thresholds and interpretations
+- **Policy Schema**: Defines evaluation contexts and policies with embedded thresholds
+- **Report Schema**: Defines evaluation report structures and metadata
+- **Guardrail Schema**: Defines operational constraints and policies
+- **Model Info Schema**: Defines basic model information and references
+- **Model Card Schema**: Defines comprehensive model cards with evaluation results
+- **API Schema**: Defines REST API interface for data access
+- **API Types Schema**: Defines API-specific data types and responses
### 3.3 Version Compatibility
@@ -170,54 +192,53 @@ tags:
- performance
```
-### 4.3 Threshold Schema
+### 4.3 Policy Schema
#### 4.3.1 Purpose
-The Threshold Schema defines performance thresholds for interpreting metric scores.
+The Policy Schema defines evaluation contexts and policies that contain embedded performance thresholds for interpreting metric scores. Thresholds are now part of the policy structure rather than separate files.
-### 4.4 Report Schema
-
-#### 4.4.1 Purpose
-
-The Report Schema defines the structure for model evaluation reports, including context, tasks, and results.
-
-### 4.5 API Schema
-
-#### 4.5.1 Purpose
+#### 4.3.2 Properties
-The API Schema defines the REST API interface for accessing evaluation reports and related data. This OpenAPI specification enables client implementations and provides standardized access to EvalGuard data.
+| Property | Type | Required | Description |
+|----------|------|----------|-------------|
+| `id` | string | ✅ | Unique policy identifier |
+| `name` | string | ✅ | Human-readable policy name |
+| `description` | string | ✅ | Detailed description of the policy |
+| `thresholds` | object | ❌ | Embedded thresholds organized by task ID |
-#### 4.5.2 Key Endpoints
+#### 4.3.3 Constraints
-- **`GET /reports`**: List evaluation reports with filtering by model name, source, task, or metric
-- **`GET /reports/{report_id}`**: Get specific report by ID
-- **`GET /reports/{report_id}/metrics`**: Get metrics for a report
-- **`GET /thresholds`**: Get performance thresholds for multiple tasks and metrics
-- **`GET /models`**: List available models
-- **`GET /tasks`**: List available tasks
+- `id` MUST be a valid identifier (alphanumeric, underscores, hyphens)
+- `name` SHOULD be descriptive and meaningful
+- `description` SHOULD provide clear context for the policy's application
-#### 4.5.3 Query Parameters
+#### 4.3.4 Example
-The `/reports` endpoint supports filtering by:
-- **`model_name`**: Full model path (e.g., `meta-llama/Llama-3.1-8B-Instruct`)
-- **`model_source`**: Model source/organization (e.g., `hf` for Hugging Face)
-- **`task_ref`**: Task reference (e.g., `truthfulqa_mc1`)
-- **`metric`**: Metric name (e.g., `acc`)
-- **`limit`**: Maximum number of reports to return
-- **`offset`**: Number of reports to skip for pagination
+```yaml
+id: default
+name: Default Policy
+description: Default policy for all contexts that don't define a specific policy.
+thresholds:
+ truthfulqa_mc1:
+ acc:
+ - impact: very_low
+ min: 0.85
+ interpretation: High factual accuracy
+ - impact: moderate
+ min: 0.5
+ max: 0.85
+ interpretation: Moderate accuracy
+ - impact: severe
+ max: 0.5
+ interpretation: Low accuracy
+```
-The `/thresholds` endpoint supports:
-- **`tasks`**: Comma-separated list of task IDs (required, e.g., `truthfulqa_mc1,winogender_schemas`)
-- **`metrics`**: Comma-separated list of metric IDs (optional, e.g., `acc,acc_norm,pct_stereotype`)
+### 4.4 Report Schema
-#### 4.5.4 Schema Reuse
+#### 4.4.1 Purpose
-The API schema reuses existing schemas:
-- **Report**: References `report.schema.yaml`
-- **Task**: References `task.schema.yaml`
-- **Threshold**: References `threshold.schema.yaml`
-- **Additional schemas**: API-specific schemas for pagination, error handling, etc.
+The Report Schema defines the structure for model evaluation reports, including context, tasks, and results.
#### 4.4.2 Properties
@@ -319,75 +340,239 @@ results:
stderr: 0.016
```
-#### 4.3.2 Properties
+### 4.5 Guardrail Schema
+
+#### 4.5.1 Purpose
+
+The Guardrail Schema defines operational constraints and policies that should be applied during model evaluation or deployment to mitigate risks and enforce quality standards.
+
+#### 4.5.2 Properties
| Property | Type | Required | Description |
|----------|------|----------|-------------|
-| `task` | string | ✅ | Task ID these thresholds apply to |
-| `thresholds` | object | ✅ | Metric ID to threshold ranges mapping |
+| `id` | string | ✅ | Unique guardrail identifier |
+| `name` | string | ✅ | Human-readable guardrail name |
+| `description` | string | ✅ | Detailed description of the guardrail |
+| `targets` | array | ❌ | List of target tasks and metrics this guardrail applies to |
+| `scope` | enum | ❌ | Scope of application (input, output, both) |
+| `instructions` | string | ❌ | Implementation instructions for the guardrail |
+| `external_references` | array | ❌ | External references and documentation |
-#### 4.3.3 Threshold Range Item
+#### 4.5.3 Target Properties
| Property | Type | Required | Description |
|----------|------|----------|-------------|
-| `impact` | string | ✅ | Security impact level of the threshold |
-| `min` | number | ❌ | Inclusive minimum value |
-| `max` | number | ❌ | Exclusive maximum value |
-| `interpretation` | string | ❌ | Detailed explanation |
+| `task` | string | ❌ | Task ID this guardrail applies to |
+| `metrics` | string[] | ❌ | List of metric IDs this guardrail applies to |
-**Security Impact Levels:**
+#### 4.5.4 Scope Values
-- `no_measurable`: No measurable security risk
-- `very_low`: Very low security risk
-- `low`: Minimal security risk
-- `moderate`: Moderate security risk
-- `high`: Significant security risk
-- `severe`: Critical security risk
+- `input`: Applied to input processing
+- `output`: Applied to output generation
+- `both`: Applied to both input and output
-#### 4.3.4 Constraints
+#### 4.5.5 Example
-- `task` MUST reference an existing task ID
-- `task` MUST be unique across all thresholds (all metrics for a task must be grouped together)
-- All metric IDs in `thresholds` MUST reference existing metrics
-- Threshold ranges MUST have at least `min` or `max` defined
-- Ranges SHOULD not overlap within the same metric
+```yaml
+id: truthfulness-check
+name: Truthfulness Verification
+description: Ensures model responses are truthful and avoid hallucination
+targets:
+ - task: truthfulqa_mc1
+ metrics: [acc, acc_norm]
+scope: output
+instructions: Verify that model responses are factually accurate
+external_references:
+ - https://arxiv.org/abs/2109.07958
+```
+
+### 4.6 Model Info Schema
+
+#### 4.6.1 Purpose
+
+The Model Info Schema defines basic information about a model, including identification and reference links.
+
+#### 4.6.2 Properties
+
+| Property | Type | Required | Description |
+|----------|------|----------|-------------|
+| `id` | string | ✅ | Unique model identifier |
+| `name` | string | ✅ | Model name |
+| `namespace` | string | ✅ | Model namespace or organization |
+| `aliases` | string[] | ❌ | List of aliases for the model's name |
+| `reference_links` | array | ❌ | List of reference links for the model |
+
+#### 4.6.3 Reference Link Properties
+
+| Property | Type | Required | Description |
+|----------|------|----------|-------------|
+| `name` | string | ❌ | Name of the reference link |
+| `url` | string | ✅ | URL of the reference link |
-#### 4.3.5 Example
+#### 4.6.4 Example
```yaml
-# Correct: All metrics for truthfulqa_mc1 grouped in one threshold file
-task: truthfulqa_mc1
-thresholds:
- acc:
- - impact: severe
- max: 0.5
- interpretation: Critical security risk - model fails to provide truthful responses
- - impact: moderate
- min: 0.5
- max: 0.8
- interpretation: Moderate security risk - model occasionally provides misleading information
- - impact: low
- min: 0.8
- interpretation: Low security risk - model generally provides truthful responses
- acc_norm:
- - impact: severe
- max: 0.5
- - impact: moderate
- min: 0.5
- max: 0.8
- - impact: low
- min: 0.8
+id: llama-3.1-8b-instruct
+name: Llama-3.1-8B-Instruct
+namespace: meta-llama
+aliases:
+ - llama-3.1-8b-instruct
+ - llama-3.1-8b
+reference_links:
+ - name: Hugging Face
+ url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
+ - name: Paper
+ url: https://arxiv.org/abs/2407.21783
+```
+
+### 4.7 Model Card Schema
+
+#### 4.7.1 Purpose
+
+The Model Card Schema defines a comprehensive model card that includes model identification, evaluation results with tasks, metrics, thresholds, and recommended guardrails for responsible AI deployment.
+
+#### 4.7.2 Properties
+
+| Property | Type | Required | Description |
+|----------|------|----------|-------------|
+| `model` | object | ✅ | Model information (references Model Info Schema) |
+| `tasks` | object | ✅ | Tasks with their definitions, metrics, and evaluation results |
+| `guardrails` | array | ❌ | List of recommended guardrails for this model |
+
+#### 4.7.3 Task Result Properties
+
+| Property | Type | Required | Description |
+|----------|------|----------|-------------|
+| `task` | object | ✅ | Task definition (references Task Definition Schema) |
+| `metrics` | array | ✅ | List of metrics results for this task |
+
+#### 4.7.4 Metric Result Properties
+
+| Property | Type | Required | Description |
+|----------|------|----------|-------------|
+| `metric` | object | ✅ | Metric definition (references Metric Definition Schema) |
+| `report_ref` | object | ❌ | Reference to the report containing full context |
+| `value` | number | ✅ | The calculated metric value |
+| `stderr` | number | ❌ | Standard error of the metric value |
+| `thresholds` | array | ❌ | Applicable threshold ranges for this metric value (contextualized by policy_id) |
+
+#### 4.7.5 Example
+
+```yaml
+model:
+ id: llama-3.1-8b-instruct
+ name: Llama-3.1-8B-Instruct
+ namespace: meta-llama
+tasks:
+ truthfulqa_mc1:
+ task:
+ id: truthfulqa_mc1
+ name: TruthfulQA Multiple Choice
+ category: question_answering
+ metrics: [acc, acc_norm]
+ metrics:
+ - metric:
+ id: acc
+ name: Accuracy
+ direction: higher_is_better
+ value: 0.75
+ stderr: 0.015
+ thresholds:
+ - impact: high
+ max: 0.5
+ - impact: moderate
+ min: 0.5
+ max: 0.6
+ - impact: low
+ min: 0.6
+ max: 0.7
+guardrails:
+ - id: truthfulness-check
+ name: Truthfulness Verification
+ scope: output
```
+**Note**: The thresholds in the model card are contextualized based on the `policy_id` query parameter. When retrieving model cards, clients can specify a policy to get thresholds appropriate for that evaluation context.
+
+### 4.8 API Schema
+
+#### 4.8.1 Purpose
+
+The API Schema defines the REST API interface for accessing evaluation reports and related data. This OpenAPI specification enables client implementations and provides standardized access to EvalGuard data.
+
+#### 4.8.2 Key Endpoints
+
+- **`GET /reports`**: List evaluation reports with filtering by model name, source, task, or metric
+- **`GET /reports/{report_id}`**: Get specific report by ID
+- **`GET /reports/{report_id}/metrics`**: Get metrics for a report
+- **`GET /policies`**: Get policies
+- **`GET /policies/{policy_id}`**: Get specific policy by ID
+- **`GET /models`**: List available models
+- **`GET /tasks`**: List available tasks
+- **`GET /guardrails`**: List available guardrails
+
+#### 4.8.3 Query Parameters
+
+The `/reports` endpoint supports filtering by:
+- **`model_name`**: Full model path (e.g., `meta-llama/Llama-3.1-8B-Instruct`)
+- **`model_source`**: Model source/organization (e.g., `hf` for Hugging Face)
+- **`task_ref`**: Task reference (e.g., `truthfulqa_mc1`)
+- **`metric`**: Metric name (e.g., `acc`)
+- **`limit`**: Maximum number of reports to return
+- **`offset`**: Number of reports to skip for pagination
+
+The `/policies` endpoint supports:
+- **`tasks`**: Comma-separated list of task IDs (required, e.g., `truthfulqa_mc1,winogender_schemas`)
+- **`metrics`**: Comma-separated list of metric IDs (optional, e.g., `acc,acc_norm,pct_stereotype`)
+
+The `/guardrails` endpoint supports:
+- **`tasks`**: Filter guardrails by task ID
+- **`metrics`**: Filter guardrails by metric ID
+
+**Note**: The `policy_id` parameter is only used for model card retrieval to contextualize thresholds and guardrails.
+
+#### 4.8.4 Policy Contextualization
+
+The `policy_id` parameter is used specifically for model card retrieval to contextualize thresholds and guardrails:
+
+- **Model Cards**: When retrieving model cards with `?policy_id=default`, thresholds and guardrails are contextualized based on the specified policy
+- **Policy-Specific Thresholds**: Different policies provide different threshold interpretations for the same metrics
+- **Embedded Thresholds**: Thresholds are embedded within policies
+- **No Access Control**: Policies do not control API access or permissions - they only affect the content returned in model cards
+
+**Example Usage**:
+```bash
+# Get model card with default policy thresholds
+curl "https://api.evalguard.org/v1/models/llama-3.1-8b-instruct/card?policy_id=default"
+
+# Get model card with enterprise policy thresholds
+curl "https://api.evalguard.org/v1/models/llama-3.1-8b-instruct/card?policy_id=enterprise"
+
+# Get specific policy
+curl "https://api.evalguard.org/v1/policies/default"
+```
+
+#### 4.8.5 Schema Reuse
+
+The API schema reuses existing schemas:
+- **Report**: References `report.schema.yaml`
+- **Task**: References `task_definition.schema.yaml`
+- **Policy**: References `policy.schema.yaml`
+- **Guardrail**: References `guardrail.schema.yaml`
+- **Model Info**: References `model_info.schema.yaml`
+- **Model Card**: References `model_card.schema.yaml`
+- **Additional schemas**: API-specific schemas for pagination, error handling, etc.
+
## 5. Validation Rules
### 5.1 General Rules
1. **Schema Compliance**: All files MUST validate against their respective schemas
2. **Reference Integrity**: Metric IDs in tasks MUST reference existing metrics
-3. **Threshold References**: Threshold task IDs MUST reference existing tasks
+3. **Policy References**: Task IDs used in a policy's embedded thresholds MUST reference existing tasks
4. **Threshold Metric Validation**: Thresholds MUST reference existing metrics
-5. **Threshold Task Uniqueness**: Each task ID MUST appear only once across all thresholds
+5. **Threshold Task Uniqueness**: Each task ID MUST appear only once within a single policy
+6. **Policy Structure**: Thresholds MUST be embedded within policies
### 5.2 Task Validation
@@ -403,14 +588,23 @@ thresholds:
- `type` MUST be one of the defined enum values
- `direction` MUST be `higher_is_better` or `lower_is_better`
-### 5.4 Threshold Validation
+### 5.4 Policy Validation
+
+- Required fields: `id`, `name`, `description`
+- `id` MUST be a valid identifier (alphanumeric, underscores, hyphens)
+- `name` SHOULD be descriptive and meaningful
+- `description` SHOULD provide clear context for the policy's application
+- Embedded `thresholds`, when present, MUST be valid (see 5.5 Threshold Validation)
+
+### 5.5 Threshold Validation
- Required fields: `task`, `thresholds`
- `task` MUST reference an existing task ID
-- `task` MUST be unique across all thresholds (all metrics for a task must be grouped together)
+- `task` MUST be unique within a single policy (all metrics for a task must be grouped together)
- All metric IDs in `thresholds` MUST reference existing metrics
- Threshold ranges MUST have at least `min` or `max` defined
- Ranges SHOULD not overlap within the same metric
+- Thresholds MUST be embedded within valid policies
## 6. Schema File Organization
@@ -419,11 +613,15 @@ thresholds:
```
schemas/
└── v1/ # Version 1 schemas
- ├── task.schema.yaml
- ├── metric.schema.yaml
- ├── threshold.schema.yaml
+ ├── task_definition.schema.yaml
+ ├── metric_definition.schema.yaml
+ ├── policy.schema.yaml
├── report.schema.yaml
- └── api.schema.yaml
+ ├── guardrail.schema.yaml
+ ├── model_info.schema.yaml
+ ├── model_card.schema.yaml
+ ├── api.schema.yaml
+ └── api_types.schema.yaml
```
### 6.2 Schema File Naming Conventions
@@ -450,9 +648,107 @@ Implementations SHOULD:
- Generate type definitions from schemas
- Support schema evolution with backward compatibility
-## 8. Migration and Versioning
+### 7.3 API Model Generation
+
+The EvalGuard specification includes comprehensive API model generation capabilities:
+
+#### 7.3.1 Supported Languages
+
+- **Java**: Maven-based generation with OpenAPI Generator
+- **TypeScript**: npm-based generation with OpenAPI Generator
+
+#### 7.3.2 Generation Process
+
+1. **Schema Validation**: All schemas are validated before generation
+2. **Cross-Reference Validation**: Ensures consistency between related schemas
+3. **Model Generation**: Creates language-specific models from OpenAPI specification
+4. **Build Integration**: Generated models are integrated into build processes
+
+#### 7.3.3 Generated Artifacts
+
+- **Java**: Maven artifacts published to GitHub Packages
+- **TypeScript**: npm packages published to GitHub Packages
+- **Documentation**: Auto-generated API documentation
+- **Type Safety**: Strong typing for all API operations
+
+#### 7.3.4 Usage Examples
+
+```bash
+# Generate Java models
+cd api-models/java
+mvn clean generate-sources compile -Dapi.version=v1
+
+# Generate TypeScript models
+cd api-models/typescript
+npm install
+npm run generate --version v1
+npm run build
+```
+
+## 8. CLI Tools and Validation
+
+### 8.1 Command Line Interface
-### 8.1 Schema Evolution
+EvalGuard provides a comprehensive CLI tool for schema management and validation:
+
+#### 8.1.1 Core Commands
+
+- **`evalguard config validate`**: Validate all configuration files
+- **`evalguard config validate -t {type}`**: Validate specific configuration types
+- **`evalguard lm-eval gen`**: Generate tasks and metrics from evaluation reports
+- **`evalguard api gen`**: Generate API models from schemas
+
+#### 8.1.2 Configuration Validation
+
+The CLI validates:
+- **Tasks**: Task definitions and metadata
+- **Metrics**: Metric definitions and types
+- **Policies**: Policy definitions with embedded thresholds
+- **Guardrails**: Operational guardrails and policies
+- **Cross-references**: Consistency between related schemas
+
+#### 8.1.3 Report Processing
+
+- **lm-eval Reports**: Parse and extract task/metric information
+- **Custom Reports**: Support for custom evaluation report formats
+- **Data Generation**: Create configuration files from evaluation data
+
+#### 8.1.4 API Model Generation
+
+- **Language Support**: Java and TypeScript model generation
+- **Version Management**: Support for multiple API versions
+- **Build Integration**: Integration with Maven and npm build systems
+
+### 8.2 Validation Rules
+
+The CLI enforces comprehensive validation rules:
+
+#### 8.2.1 Schema Compliance
+
+- All files MUST validate against their respective schemas
+- Schema files MUST conform to JSON Schema Draft 2020-12
+- YAML files MUST be valid YAML 1.2
+
+#### 8.2.2 Reference Integrity
+
+- Metric IDs in tasks MUST reference existing metrics
+- Policy IDs MUST be unique and valid
+- Threshold task IDs MUST reference existing tasks
+- Threshold metrics MUST reference existing metrics
+- Guardrail targets MUST reference valid tasks and metrics
+- Thresholds in model cards MUST reference valid policies when contextualized
+- Policies MUST NOT be used for access control or permissions
+
+#### 8.2.3 Data Consistency
+
+- Policy IDs MUST be unique across all policies
+- Threshold task IDs MUST be unique within a single policy
+- Metric definitions MUST be consistent across all references
+- Task definitions MUST be consistent across all references
+
+## 9. Migration and Versioning
+
+### 9.1 Schema Evolution
- New versions SHOULD maintain backward compatibility
- Breaking changes SHOULD be introduced in major version increments
@@ -464,31 +760,31 @@ Implementations SHOULD:
- **Minor versions**: Additive changes (new fields, new types)
- **Patch versions**: Bug fixes and clarifications
-## 9. Security Considerations
+## 10. Security Considerations
-### 9.1 File Validation
+### 10.1 File Validation
- All schema files MUST be validated before processing
- Implementations SHOULD reject files that fail validation
- File paths SHOULD be sanitized to prevent directory traversal attacks
-### 9.2 Data Integrity
+### 10.2 Data Integrity
- Cross-reference validation MUST be performed
- Implementations SHOULD verify file integrity
- Backup strategies SHOULD be employed for critical data
-## 10. Privacy Considerations
+## 11. Privacy Considerations
-### 10.1 Data Handling
+### 11.1 Data Handling
- Schema files MAY contain sensitive information
- Implementations SHOULD handle data according to privacy requirements
- Logging SHOULD avoid exposing sensitive schema content
-## 11. Examples
+## 12. Examples
-### 11.1 Complete Task Example
+### 12.1 Complete Task Example
```yaml
id: winogender_schemas
@@ -506,7 +802,7 @@ languages:
- en
```
-### 11.2 Complete Metric Example
+### 12.2 Complete Metric Example
```yaml
id: pct_stereotype
@@ -520,44 +816,48 @@ tags:
- gender
```
-### 11.3 Complete Threshold Example
+### 12.3 Complete Policy with Embedded Thresholds Example
```yaml
-task: winogender_schemas
+# Policy with embedded thresholds
+id: default
+name: Default Policy
+description: Default policy for all contexts that don't define a specific policy.
thresholds:
- acc:
- - label: Poor
- max: 0.6
- interpretation: High gender bias in coreference
- - label: Acceptable
- min: 0.6
- max: 0.8
- interpretation: Moderate gender bias
- - label: Good
- min: 0.8
- interpretation: Low gender bias
- pct_stereotype:
- - label: High Bias
- min: 0.7
- interpretation: Strong gender stereotype following
- - label: Moderate Bias
- min: 0.4
- max: 0.7
- interpretation: Moderate gender stereotype following
- - label: Low Bias
- max: 0.4
- interpretation: Minimal gender stereotype following
+ truthfulqa_mc1:
+ acc:
+ - impact: very_low
+ min: 0.85
+ interpretation: High factual accuracy
+ - impact: moderate
+ min: 0.5
+ max: 0.85
+ interpretation: Moderate accuracy
+ - impact: severe
+ max: 0.5
+ interpretation: Low accuracy
+ acc_norm:
+ - impact: very_low
+ min: 0.85
+ interpretation: High factual accuracy
+ - impact: moderate
+ min: 0.5
+ max: 0.85
+ interpretation: Moderate accuracy
+ - impact: severe
+ max: 0.5
+ interpretation: Low accuracy
```
-## 12. References
+## 13. References
-### 12.1 Normative References
+### 13.1 Normative References
- [RFC 2119](https://tools.ietf.org/html/rfc2119): Key words for use in RFCs to Indicate Requirement Levels
- [JSON Schema](https://json-schema.org/): JSON Schema specification
- [YAML 1.2](https://yaml.org/spec/1.2/spec.html): YAML specification
-### 12.2 Informative References
+### 13.2 Informative References
- [CloudEvents Specification](https://github.com/cloudevents/spec/blob/v1.0.2/cloudevents/spec.md): Event specification format reference
- [OpenAPI Specification](https://swagger.io/specification/): API specification format reference
diff --git a/api-models/typescript/src/client.ts b/api-models/typescript/src/client.ts
deleted file mode 100644
index ab51c5b..0000000
--- a/api-models/typescript/src/client.ts
+++ /dev/null
@@ -1,85 +0,0 @@
-import { Configuration, ReportsApi, GuardrailsApi, ThresholdsApi, ModelsApi, TasksApi, ReportQueryschema } from './generated';
-
-export default class EvalGuardApiClient {
- private reportsApi: ReportsApi;
- private guardrailsApi: GuardrailsApi;
- private thresholdsApi: ThresholdsApi;
- private modelsApi: ModelsApi;
- private tasksApi: TasksApi;
-
- constructor(baseUrl: string = 'http://localhost:8080', apiKey?: string) {
- const config = new Configuration({
- basePath: baseUrl,
- apiKey: apiKey,
- });
- this.reportsApi = new ReportsApi(config);
- this.guardrailsApi = new GuardrailsApi(config);
- this.thresholdsApi = new ThresholdsApi(config);
- this.modelsApi = new ModelsApi(config);
- this.tasksApi = new TasksApi(config);
- }
-
- // Reports
- async getReports(params?: {
- modelName?: string;
- modelSource?: string;
- tasks?: string[];
- metrics?: string[];
- reportContext?: { [key: string]: any };
- limit?: number;
- offset?: number;
- }) {
- const query: ReportQueryschema = {
- query: {
- model_name: params?.modelName,
- model_source: params?.modelSource,
- tasks: params?.tasks,
- metrics: params?.metrics,
- report_context: params?.reportContext,
- }
- };
- return this.reportsApi.listReports(query, params?.limit, params?.offset);
- }
-
- async getReport(reportId: string) {
- return this.reportsApi.getReport(reportId);
- }
-
- async getReportMetrics(reportId: string, metric?: string) {
- return this.reportsApi.getReportMetrics(reportId, metric);
- }
-
- // Thresholds
- async getThresholds(tasks: string[], metrics?: string[]) {
- return this.thresholdsApi.getThresholds(tasks.join(','), metrics?.join(','));
- }
-
- // Models
- async getModels(source?: string) {
- return this.modelsApi.listModels(source);
- }
-
- // Tasks
- async getTasks() {
- return this.tasksApi.listTasks();
- }
-
- // Guardrails
- async getGuardrails(params?: {
- tasks?: string[];
- metrics?: string[];
- limit?: number;
- offset?: number;
- }) {
- return this.guardrailsApi.listGuardrails(
- params?.tasks?.join(','),
- params?.metrics?.join(','),
- params?.limit,
- params?.offset
- );
- }
-
- async getGuardrail(guardrailId: string) {
- return this.guardrailsApi.getGuardrail(guardrailId);
- }
-}
\ No newline at end of file
diff --git a/api-models/typescript/src/generated/.openapi-generator/FILES b/api-models/typescript/src/generated/.openapi-generator/FILES
index ebc79e7..c2ba273 100644
--- a/api-models/typescript/src/generated/.openapi-generator/FILES
+++ b/api-models/typescript/src/generated/.openapi-generator/FILES
@@ -7,41 +7,31 @@ base.ts
common.ts
configuration.ts
docs/Error.md
-docs/Errorschema.md
-docs/GetReportMetrics200Response.md
-docs/GetReportMetrics200ResponseMetricsInnerValue.md
-docs/GetThresholds200Response.md
-docs/Guardrail.md
-docs/GuardrailTargetsInner.md
+docs/GuardrailTarget.md
docs/GuardrailsApi.md
+docs/GuardrailsResponse.md
docs/Guardrailschema.md
-docs/ListGuardrails200Response.md
-docs/ListModels200Response.md
-docs/ListTasks200Response.md
-docs/ModelInfo.md
+docs/MetricDefinitionschema.md
+docs/MetricsApi.md
+docs/MetricsResponse.md
+docs/ModelCardsApi.md
+docs/ModelCardsResponse.md
+docs/ModelCardschema.md
docs/ModelInfoschema.md
docs/ModelsApi.md
+docs/ModelsInfoResponse.md
docs/PaginationInfo.md
-docs/PaginationInfoschema.md
-docs/Report.md
-docs/ReportContext.md
-docs/ReportContextExecution.md
-docs/ReportContextTools.md
-docs/ReportContextToolsLmEval.md
-docs/ReportContextToolsTransformers.md
-docs/ReportList.md
-docs/ReportListschema.md
-docs/ReportQuery.md
-docs/ReportQueryQuery.md
-docs/ReportQueryschema.md
+docs/PoliciesApi.md
+docs/PoliciesResponse.md
+docs/Policyschema.md
+docs/ReferenceLink.md
+docs/ReportResponseItem.md
+docs/ReportType.md
docs/ReportsApi.md
-docs/Reportschema.md
-docs/Task.md
+docs/ReportsResponse.md
+docs/TaskDefinitionschema.md
docs/TasksApi.md
-docs/Taskschema.md
-docs/Threshold.md
-docs/ThresholdsApi.md
-docs/Thresholdschema.md
+docs/TasksResponse.md
git_push.sh
index.ts
package.json
diff --git a/api-models/typescript/src/generated/README.md b/api-models/typescript/src/generated/README.md
index df25e99..df5b349 100644
--- a/api-models/typescript/src/generated/README.md
+++ b/api-models/typescript/src/generated/README.md
@@ -53,47 +53,40 @@ Class | Method | HTTP request | Description
------------ | ------------- | ------------- | -------------
*GuardrailsApi* | [**getGuardrail**](docs/GuardrailsApi.md#getguardrail) | **GET** /guardrails/{guardrail_id} | Get guardrail by ID
*GuardrailsApi* | [**listGuardrails**](docs/GuardrailsApi.md#listguardrails) | **GET** /guardrails | List guardrails
+*MetricsApi* | [**getMetric**](docs/MetricsApi.md#getmetric) | **GET** /metrics/{metric_id} | Get metric by ID
+*MetricsApi* | [**listMetrics**](docs/MetricsApi.md#listmetrics) | **GET** /metrics | List available metrics
+*ModelCardsApi* | [**listModelCards**](docs/ModelCardsApi.md#listmodelcards) | **GET** /model-cards | List model cards
+*ModelsApi* | [**getModel**](docs/ModelsApi.md#getmodel) | **GET** /models/{model_id} | Get model by ID
*ModelsApi* | [**listModels**](docs/ModelsApi.md#listmodels) | **GET** /models | List available models
-*ReportsApi* | [**getReport**](docs/ReportsApi.md#getreport) | **GET** /reports/{report_id} | Get evaluation report by ID
-*ReportsApi* | [**getReportMetrics**](docs/ReportsApi.md#getreportmetrics) | **GET** /reports/{report_id}/metrics | Get metrics for a specific report
-*ReportsApi* | [**listReports**](docs/ReportsApi.md#listreports) | **POST** /reports | List evaluation reports
+*PoliciesApi* | [**getPolicy**](docs/PoliciesApi.md#getpolicy) | **GET** /policies/{policy_id} | Get policy by ID
+*PoliciesApi* | [**listPolicies**](docs/PoliciesApi.md#listpolicies) | **GET** /policies | List available policies
+*ReportsApi* | [**getReport**](docs/ReportsApi.md#getreport) | **GET** /reports/{namespace}/{model_name}/lm-eval/{report_id} | Get evaluation report by ID
+*ReportsApi* | [**listReports**](docs/ReportsApi.md#listreports) | **GET** /reports/{namespace}/{model_name} | List evaluation reports for a model
+*TasksApi* | [**getTask**](docs/TasksApi.md#gettask) | **GET** /tasks/{task_id} | Get task by ID
*TasksApi* | [**listTasks**](docs/TasksApi.md#listtasks) | **GET** /tasks | List available tasks
-*ThresholdsApi* | [**getThresholds**](docs/ThresholdsApi.md#getthresholds) | **GET** /thresholds | Get thresholds for multiple tasks and metrics
### Documentation For Models
- - [Errorschema](docs/Errorschema.md)
- - [GetReportMetrics200Response](docs/GetReportMetrics200Response.md)
- - [GetReportMetrics200ResponseMetricsInnerValue](docs/GetReportMetrics200ResponseMetricsInnerValue.md)
- - [GetThresholds200Response](docs/GetThresholds200Response.md)
- - [Guardrail](docs/Guardrail.md)
- - [GuardrailTargetsInner](docs/GuardrailTargetsInner.md)
+ - [GuardrailTarget](docs/GuardrailTarget.md)
+ - [GuardrailsResponse](docs/GuardrailsResponse.md)
- [Guardrailschema](docs/Guardrailschema.md)
- - [ListGuardrails200Response](docs/ListGuardrails200Response.md)
- - [ListModels200Response](docs/ListModels200Response.md)
- - [ListTasks200Response](docs/ListTasks200Response.md)
+ - [MetricDefinitionschema](docs/MetricDefinitionschema.md)
+ - [MetricsResponse](docs/MetricsResponse.md)
+ - [ModelCardsResponse](docs/ModelCardsResponse.md)
+ - [ModelCardschema](docs/ModelCardschema.md)
- [ModelError](docs/ModelError.md)
- - [ModelInfo](docs/ModelInfo.md)
- [ModelInfoschema](docs/ModelInfoschema.md)
+ - [ModelsInfoResponse](docs/ModelsInfoResponse.md)
- [PaginationInfo](docs/PaginationInfo.md)
- - [PaginationInfoschema](docs/PaginationInfoschema.md)
- - [Report](docs/Report.md)
- - [ReportContext](docs/ReportContext.md)
- - [ReportContextExecution](docs/ReportContextExecution.md)
- - [ReportContextTools](docs/ReportContextTools.md)
- - [ReportContextToolsLmEval](docs/ReportContextToolsLmEval.md)
- - [ReportContextToolsTransformers](docs/ReportContextToolsTransformers.md)
- - [ReportList](docs/ReportList.md)
- - [ReportListschema](docs/ReportListschema.md)
- - [ReportQuery](docs/ReportQuery.md)
- - [ReportQueryQuery](docs/ReportQueryQuery.md)
- - [ReportQueryschema](docs/ReportQueryschema.md)
- - [Reportschema](docs/Reportschema.md)
- - [Task](docs/Task.md)
- - [Taskschema](docs/Taskschema.md)
- - [Threshold](docs/Threshold.md)
- - [Thresholdschema](docs/Thresholdschema.md)
+ - [PoliciesResponse](docs/PoliciesResponse.md)
+ - [Policyschema](docs/Policyschema.md)
+ - [ReferenceLink](docs/ReferenceLink.md)
+ - [ReportResponseItem](docs/ReportResponseItem.md)
+ - [ReportType](docs/ReportType.md)
+ - [ReportsResponse](docs/ReportsResponse.md)
+ - [TaskDefinitionschema](docs/TaskDefinitionschema.md)
+ - [TasksResponse](docs/TasksResponse.md)
diff --git a/api-models/typescript/src/generated/api.ts b/api-models/typescript/src/generated/api.ts
index 57693b5..a5bd3e8 100644
--- a/api-models/typescript/src/generated/api.ts
+++ b/api-models/typescript/src/generated/api.ts
@@ -24,266 +24,229 @@ import type { RequestArgs } from './base';
import { BASE_PATH, COLLECTION_FORMATS, BaseAPI, RequiredError, operationServerMap } from './base';
/**
- * Error response
+ *
* @export
- * @interface Errorschema
+ * @interface GuardrailTarget
*/
-export interface Errorschema {
- /**
- * Error message
- * @type {string}
- * @memberof Errorschema
- */
- 'error': string;
+export interface GuardrailTarget {
/**
- * Error code
+ * Task identifier to which the guardrail applies.
* @type {string}
- * @memberof Errorschema
+ * @memberof GuardrailTarget
*/
- 'code'?: string;
+ 'task': string;
/**
- * Additional error details
- * @type {{ [key: string]: any; }}
- * @memberof Errorschema
+ * List of metric identifiers to which the guardrail applies
+ * @type {Array<string>}
+ * @memberof GuardrailTarget
*/
- 'details'?: { [key: string]: any; };
-}
-/**
- *
- * @export
- * @interface GetReportMetrics200Response
- */
-export interface GetReportMetrics200Response {
+ 'metrics': Array<string>;
/**
- *
+ * Model identifier this guardrail is scoped to (Optional)
* @type {string}
- * @memberof GetReportMetrics200Response
+ * @memberof GuardrailTarget
*/
- 'report_id'?: string;
- /**
- *
- * @type {Array<{ [key: string]: GetReportMetrics200ResponseMetricsInnerValue; }>}
- * @memberof GetReportMetrics200Response
- */
- 'metrics'?: Array<{ [key: string]: GetReportMetrics200ResponseMetricsInnerValue; }>;
+ 'model'?: string;
}
/**
- *
+ * Response containing a list of available guardrails
* @export
- * @interface GetReportMetrics200ResponseMetricsInnerValue
+ * @interface GuardrailsResponse
*/
-export interface GetReportMetrics200ResponseMetricsInnerValue {
- /**
- * The metric value
- * @type {number}
- * @memberof GetReportMetrics200ResponseMetricsInnerValue
- */
- 'value': number;
+export interface GuardrailsResponse {
/**
- * Standard error of the metric
- * @type {number}
- * @memberof GetReportMetrics200ResponseMetricsInnerValue
+ * Array of guardrail definitions
+ * @type {Array<Guardrailschema>}
+ * @memberof GuardrailsResponse
*/
- 'stderr'?: number;
-}
-/**
- *
- * @export
- * @interface GetThresholds200Response
- */
-export interface GetThresholds200Response {
+ 'guardrails': Array<Guardrailschema>;
/**
*
- * @type {Array}
- * @memberof GetThresholds200Response
+ * @type {PaginationInfo}
+ * @memberof GuardrailsResponse
*/
- 'thresholds'?: Array;
+ 'pagination'?: PaginationInfo;
}
/**
* A guardrail is a policy or operational constraint that should be applied during model evaluation or deployment to mitigate risks, enforce quality, or guide behavior. It can target specific tasks, metrics, or models and is annotated with metadata for interpretation and traceability.
* @export
- * @interface Guardrail
+ * @interface Guardrailschema
*/
-export interface Guardrail {
+export interface Guardrailschema {
/**
* Globally unique identifier for the guardrail.
* @type {string}
- * @memberof Guardrail
+ * @memberof Guardrailschema
*/
'id': string;
/**
* Human-readable name of the guardrail.
* @type {string}
- * @memberof Guardrail
+ * @memberof Guardrailschema
*/
'name': string;
/**
* Detailed explanation of the purpose and logic of the guardrail.
* @type {string}
- * @memberof Guardrail
+ * @memberof Guardrailschema
*/
'description'?: string;
/**
* Specifies what the guardrail applies to: tasks, metrics, and/or specific models.
- * @type {Array<GuardrailTargetsInner>}
- * @memberof Guardrail
+ * @type {Array<GuardrailTarget>}
+ * @memberof Guardrailschema
*/
- 'targets': Array<GuardrailTargetsInner>;
+ 'targets': Array<GuardrailTarget>;
/**
* Indicates the data flow stage at which the guardrail should be applied: \'input\' for prompt/input constraints, \'output\' for generation constraints, or \'both\' for end-to-end application.
* @type {string}
- * @memberof Guardrail
+ * @memberof Guardrailschema
*/
- 'scope': GuardrailScopeEnum;
+ 'scope': GuardrailschemaScopeEnum;
/**
* List of external references (e.g., papers, documentation, implementations) that support or explain the rationale for this guardrail.
* @type {Array}
- * @memberof Guardrail
+ * @memberof Guardrailschema
*/
'external_references'?: Array;
/**
* Implementation guidance or rule description, written in natural language or pseudocode for how to enforce this guardrail.
* @type {string}
- * @memberof Guardrail
+ * @memberof Guardrailschema
*/
'instructions': string;
}
-export const GuardrailScopeEnum = {
+export const GuardrailschemaScopeEnum = {
Input: 'input',
Output: 'output',
Both: 'both'
} as const;
-export type GuardrailScopeEnum = typeof GuardrailScopeEnum[keyof typeof GuardrailScopeEnum];
+export type GuardrailschemaScopeEnum = typeof GuardrailschemaScopeEnum[keyof typeof GuardrailschemaScopeEnum];
/**
- *
- * @export
- * @interface GuardrailTargetsInner
- */
-export interface GuardrailTargetsInner {
- /**
- * Task identifier to which the guardrail applies.
- * @type {string}
- * @memberof GuardrailTargetsInner
- */
- 'task': string;
- /**
- * List of metric identifiers to which the guardrail applies
- * @type {Array}
- * @memberof GuardrailTargetsInner
- */
- 'metrics': Array;
- /**
- * Model identifier this guardrail is scoped to (Optional)
- * @type {string}
- * @memberof GuardrailTargetsInner
- */
- 'model'?: string;
-}
-/**
- * A guardrail is a policy or operational constraint that should be applied during model evaluation or deployment to mitigate risks, enforce quality, or guide behavior. It can target specific tasks, metrics, or models and is annotated with metadata for interpretation and traceability.
+ * Schema for a metric used to evaluate tasks in model evaluations.
* @export
- * @interface Guardrailschema
+ * @interface MetricDefinitionschema
*/
-export interface Guardrailschema {
+export interface MetricDefinitionschema {
/**
- * Globally unique identifier for the guardrail.
+ * Unique metric identifier, used to link metrics to tasks and reports.
* @type {string}
- * @memberof Guardrailschema
+ * @memberof MetricDefinitionschema
*/
'id': string;
/**
- * Human-readable name of the guardrail.
+ * Human-readable name of the metric.
* @type {string}
- * @memberof Guardrailschema
+ * @memberof MetricDefinitionschema
*/
'name': string;
/**
- * Detailed explanation of the purpose and logic of the guardrail.
+ * Detailed description of what the metric measures.
* @type {string}
- * @memberof Guardrailschema
+ * @memberof MetricDefinitionschema
*/
'description'?: string;
/**
- * Specifies what the guardrail applies to: tasks, metrics, and/or specific models.
- * @type {Array}
- * @memberof Guardrailschema
+ * Type of metric output (percentage, raw score, count, etc.).
+ * @type {string}
+ * @memberof MetricDefinitionschema
*/
- 'targets': Array;
+ 'type'?: MetricDefinitionschemaTypeEnum;
/**
- * Indicates the data flow stage at which the guardrail should be applied: \'input\' for prompt/input constraints, \'output\' for generation constraints, or \'both\' for end-to-end application.
+ * Indicates whether higher or lower values correspond to better performance.
* @type {string}
- * @memberof Guardrailschema
+ * @memberof MetricDefinitionschema
*/
- 'scope': GuardrailschemaScopeEnum;
+ 'direction': MetricDefinitionschemaDirectionEnum;
/**
- * List of external references (e.g., papers, documentation, implementations) that support or explain the rationale for this guardrail.
+ * Optional tags describing the metric, e.g., accuracy, robustness, efficiency.
+ * @type {Array<string>}
- * @memberof Guardrailschema
- */
- 'external_references'?: Array;
- /**
- * Implementation guidance or rule description, written in natural language or pseudocode for how to enforce this guardrail.
- * @type {string}
- * @memberof Guardrailschema
+ * @memberof MetricDefinitionschema
*/
- 'instructions': string;
+ 'tags'?: Array<string>;
}
-export const GuardrailschemaScopeEnum = {
- Input: 'input',
- Output: 'output',
- Both: 'both'
+export const MetricDefinitionschemaTypeEnum = {
+ Percentage: 'percentage',
+ Score: 'score',
+ Count: 'count',
+ Time: 'time',
+ Other: 'other'
} as const;
-export type GuardrailschemaScopeEnum = typeof GuardrailschemaScopeEnum[keyof typeof GuardrailschemaScopeEnum];
+export type MetricDefinitionschemaTypeEnum = typeof MetricDefinitionschemaTypeEnum[keyof typeof MetricDefinitionschemaTypeEnum];
+export const MetricDefinitionschemaDirectionEnum = {
+ HigherIsBetter: 'higher_is_better',
+ LowerIsBetter: 'lower_is_better'
+} as const;
+
+export type MetricDefinitionschemaDirectionEnum = typeof MetricDefinitionschemaDirectionEnum[keyof typeof MetricDefinitionschemaDirectionEnum];
/**
- *
+ * Response containing a list of available metrics
* @export
- * @interface ListGuardrails200Response
+ * @interface MetricsResponse
*/
-export interface ListGuardrails200Response {
+export interface MetricsResponse {
/**
- *
- * @type {Array}
- * @memberof ListGuardrails200Response
+ * Array of metric definitions
+ * @type {Array<MetricDefinitionschema>}
+ * @memberof MetricsResponse
*/
- 'guardrails'?: Array;
+ 'metrics': Array<MetricDefinitionschema>;
/**
*
* @type {PaginationInfo}
- * @memberof ListGuardrails200Response
+ * @memberof MetricsResponse
*/
'pagination'?: PaginationInfo;
}
/**
- *
+ * Response containing a list of model cards
* @export
- * @interface ListModels200Response
+ * @interface ModelCardsResponse
*/
-export interface ListModels200Response {
+export interface ModelCardsResponse {
+ /**
+ * Array of model cards
+ * @type {Array<ModelCardschema>}
+ * @memberof ModelCardsResponse
+ */
+ 'model_cards': Array<ModelCardschema>;
/**
*
- * @type {Array}
- * @memberof ListModels200Response
+ * @type {PaginationInfo}
+ * @memberof ModelCardsResponse
*/
- 'models'?: Array;
+ 'pagination'?: PaginationInfo;
}
/**
- *
+ * A comprehensive model card that includes model identification, evaluation results with tasks, metrics, thresholds, and recommended guardrails for responsible AI deployment.
* @export
- * @interface ListTasks200Response
+ * @interface ModelCardschema
*/
-export interface ListTasks200Response {
+export interface ModelCardschema {
/**
*
- * @type {Array}
- * @memberof ListTasks200Response
+ * @type {ModelInfoschema}
+ * @memberof ModelCardschema
*/
- 'tasks'?: Array;
+ 'model': ModelInfoschema;
+ /**
+ * Tasks with their definitions, metrics, and evaluation results. Keys are task identifiers.
+ * @type {object}
+ * @memberof ModelCardschema
+ */
+ 'tasks': object;
+ /**
+ * List of recommended guardrails for this model
+ * @type {Array<Guardrailschema>}
+ * @memberof ModelCardschema
+ */
+ 'guardrails'?: Array<Guardrailschema>;
}
/**
* Error response
@@ -313,64 +276,58 @@ export interface ModelError {
/**
* Information about a model
* @export
- * @interface ModelInfo
+ * @interface ModelInfoschema
*/
-export interface ModelInfo {
+export interface ModelInfoschema {
+ /**
+ * Unique model identifier
+ * @type {string}
+ * @memberof ModelInfoschema
+ */
+ 'id': string;
/**
* Model name
* @type {string}
- * @memberof ModelInfo
+ * @memberof ModelInfoschema
*/
'name': string;
/**
- * Model source/organization
+ * Model namespace or organization
* @type {string}
- * @memberof ModelInfo
+ * @memberof ModelInfoschema
*/
- 'source': string;
+ 'namespace': string;
/**
- * Number of evaluation reports for this model
- * @type {number}
- * @memberof ModelInfo
+ * List of aliases for the model\'s name. Must not include the namespace.
+ * @type {Array<string>}
+ * @memberof ModelInfoschema
*/
- 'report_count': number;
+ 'aliases'?: Array<string>;
/**
- * Date of the most recent evaluation
- * @type {string}
- * @memberof ModelInfo
+ * List of reference links for the model
+ * @type {Array<ReferenceLink>}
+ * @memberof ModelInfoschema
*/
- 'latest_evaluation': string;
+ 'reference_links'?: Array<ReferenceLink>;
}
/**
- * Information about a model
+ * Response containing a list of available models
* @export
- * @interface ModelInfoschema
+ * @interface ModelsInfoResponse
*/
-export interface ModelInfoschema {
- /**
- * Model name
- * @type {string}
- * @memberof ModelInfoschema
- */
- 'name': string;
- /**
- * Model source/organization
- * @type {string}
- * @memberof ModelInfoschema
- */
- 'source': string;
+export interface ModelsInfoResponse {
/**
- * Number of evaluation reports for this model
- * @type {number}
- * @memberof ModelInfoschema
+ * Array of model definitions
+ * @type {Array<ModelInfoschema>}
+ * @memberof ModelsInfoResponse
*/
- 'report_count': number;
+ 'models': Array<ModelInfoschema>;
/**
- * Date of the most recent evaluation
- * @type {string}
- * @memberof ModelInfoschema
+ *
+ * @type {PaginationInfo}
+ * @memberof ModelsInfoResponse
*/
- 'latest_evaluation': string;
+ 'pagination'?: PaginationInfo;
}
/**
* Pagination information
@@ -404,457 +361,204 @@ export interface PaginationInfo {
'has_more': boolean;
}
/**
- * Pagination information
+ * Response containing a list of available policies
* @export
- * @interface PaginationInfoschema
+ * @interface PoliciesResponse
*/
-export interface PaginationInfoschema {
- /**
- * Total number of items
- * @type {number}
- * @memberof PaginationInfoschema
- */
- 'total': number;
- /**
- * Number of items per page
- * @type {number}
- * @memberof PaginationInfoschema
- */
- 'limit': number;
+export interface PoliciesResponse {
/**
- * Number of items skipped
- * @type {number}
- * @memberof PaginationInfoschema
+ * Array of policy definitions
+ * @type {Array<Policyschema>}
+ * @memberof PoliciesResponse
*/
- 'offset': number;
+ 'policies': Array<Policyschema>;
/**
- * Whether there are more items available
- * @type {boolean}
- * @memberof PaginationInfoschema
+ *
+ * @type {PaginationInfo}
+ * @memberof PoliciesResponse
*/
- 'has_more': boolean;
+ 'pagination'?: PaginationInfo;
}
/**
- * Schema for a report of model evaluation results.
+ * Schema for a policy used to evaluate tasks in model evaluations. Policies organize thresholds and guardrails by evaluation context. Thresholds are embedded within policies, organized by task ID and metric ID.
* @export
- * @interface Report
+ * @interface Policyschema
*/
-export interface Report {
+export interface Policyschema {
/**
- * Unique report identifier.
+ * Unique policy identifier, used to link policies to tasks and reports.
* @type {string}
- * @memberof Report
- */
- 'id'?: string;
- /**
- * Flexible key-value metadata about the report generation.
- * @type {{ [key: string]: string; }}
- * @memberof Report
+ * @memberof Policyschema
*/
- 'metadata'?: { [key: string]: string; };
+ 'id': string;
/**
- *
- * @type {ReportContext}
- * @memberof Report
+ * Human-readable name of the policy.
+ * @type {string}
+ * @memberof Policyschema
*/
- 'context'?: ReportContext;
+ 'name': string;
/**
- * List of tasks in the report. The keys are the task names.
- * @type {Array