diff --git a/evaluators/time_efficiency/README.md b/evaluators/time_efficiency/README.md
new file mode 100644
index 0000000..d18f2ec
--- /dev/null
+++ b/evaluators/time_efficiency/README.md
@@ -0,0 +1,51 @@
+# time_efficiency
+
+Scores how quickly an agent resolved a task relative to a time budget. Catches agents that produce correct answers but take too long for production use.
+
+## How it works
+
+Reads latency percentiles from the trace's `performance_metrics`. Scores against a time budget in seconds:
+
+```
+score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
+```
+
+You choose which percentile (`p50`, `p95`, `p99`) and which latency category (`overall`, `llm_calls`, `tool_executions`) to score against. For example, scoring against `p95` of `llm_calls` catches slow LLM responses specifically.
+
+This is a **trace-level** metric. Returns `NOT_EVALUATED` when no latency data is available or when an invalid percentile/source is configured.
+
+## Config
+
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `max_duration_s` | float | 120 | Time budget in seconds |
+| `latency_percentile` | str | `"p50"` | Percentile to score: `"p50"`, `"p95"`, `"p99"` |
+| `latency_source` | str | `"overall"` | Latency category: `"overall"`, `"llm_calls"`, `"tool_executions"` |
+
+## Example
+
+```yaml
+evaluators:
+  - name: time_efficiency
+    type: remote
+    source: github
+    ref: evaluators/time_efficiency/time_efficiency.py
+    threshold: 0.5
+    config:
+      max_duration_s: 60
+      latency_percentile: p95
+      latency_source: overall
+```
+
+## Output details
+
+```json
+{
+  "duration_s": 4.164,
+  "max_duration_s": 60,
+  "utilization": "6.9%",
+  "source": "latency.overall.p95"
+}
+```
+
+Requires `agentevals-evaluator-sdk >= 0.1.1`.
diff --git a/evaluators/time_efficiency/evaluator.yaml b/evaluators/time_efficiency/evaluator.yaml
new file mode 100644
index 0000000..58c9863
--- /dev/null
+++ b/evaluators/time_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: time_efficiency
+description: Scores how quickly the agent resolved a task relative to a time budget
+language: python
+entrypoint: time_efficiency.py
+tags: [performance, time, latency, efficiency, budget]
+author: agentevals-dev
diff --git a/evaluators/time_efficiency/time_efficiency.py b/evaluators/time_efficiency/time_efficiency.py
new file mode 100644
index 0000000..7700300
--- /dev/null
+++ b/evaluators/time_efficiency/time_efficiency.py
@@ -0,0 +1,131 @@
+"""Community evaluator: time_efficiency
+
+Scores resolution time relative to a budget. Reads latency from
+the trace's performance_metrics.
+
+Score = clamp(1.0 - actual_seconds / max_duration_s, 0, 1)
+
+Returns NOT_EVALUATED when no latency data is available.
+
+Config options:
+    max_duration_s (float): Time budget in seconds (default: 120)
+    latency_percentile (str): Which percentile to score against:
+        "p50" (default), "p95", "p99"
+    latency_source (str): Latency category:
+        "overall" (default), "llm_calls", "tool_executions"
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+_VALID_PERCENTILES = ("p50", "p95", "p99")
+_VALID_SOURCES = ("overall", "llm_calls", "tool_executions")
+
+
+def _extract_duration_s(perf: dict, percentile: str, source: str) -> tuple[float | None, str]:
+    """Extract duration in seconds from a performance_metrics dict.
+
+    Returns (duration_seconds, description_of_source).
+
+    Supports:
+        nested (agentevals): latency.<source>.<percentile> in milliseconds
+        flat: duration_s (seconds) or duration_ms (milliseconds)
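+
+    Example (values mirror the README output example; the nested value
+    is in milliseconds, the flat duration_s in seconds):
+        {"latency": {"overall": {"p95": 4164.0}}}
+        {"duration_s": 4.164}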
+    """
+    latency_block = perf.get("latency")
+    if isinstance(latency_block, dict):
+        source_block = latency_block.get(source)
+        if isinstance(source_block, dict):
+            ms_value = source_block.get(percentile)
+            if ms_value is not None:
+                return float(ms_value) / 1000.0, f"latency.{source}.{percentile}"
+
+    duration_s = perf.get("duration_s")
+    if duration_s is not None:
+        return float(duration_s), "duration_s"
+
+    duration_ms = perf.get("duration_ms")
+    if duration_ms is not None:
+        return float(duration_ms) / 1000.0, "duration_ms"
+
+    return None, "no latency data found"
+
+
+def _get_perf(input: EvalInput) -> dict | None:
+    """Return the first non-None performance_metrics from any invocation."""
+    for inv in input.invocations:
+        if isinstance(inv.performance_metrics, dict):
+            return inv.performance_metrics
+    return None
+
+
+@evaluator
+def time_efficiency(input: EvalInput) -> EvalResult:
+    max_duration = input.config.get("max_duration_s", 120.0)
+    percentile = input.config.get("latency_percentile", "p50")
+    source = input.config.get("latency_source", "overall")
+    n = len(input.invocations)
+
+    if percentile not in _VALID_PERCENTILES:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": f"invalid latency_percentile '{percentile}', must be one of {_VALID_PERCENTILES}"},
+        )
+    if source not in _VALID_SOURCES:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": f"invalid latency_source '{source}', must be one of {_VALID_SOURCES}"},
+        )
+
+    perf = _get_perf(input)
+    if perf is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no performance_metrics available"},
+        )
+
+    duration_s, source_desc = _extract_duration_s(perf, percentile, source)
+    if duration_s is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": source_desc},
+        )
+
+    score = max(0.0, min(1.0, 1.0 - duration_s / max_duration)) if max_duration > 0 else 1.0
+
+    breakdown = {}
+    latency_block = perf.get("latency")
+    if isinstance(latency_block, dict):
+        for src in _VALID_SOURCES:
+            src_block = latency_block.get(src)
+            if isinstance(src_block, dict):
+                val = src_block.get(percentile)
+                if val is not None:
+                    breakdown[src] = round(float(val) / 1000.0, 3)
+
+    details: dict = {
+        "duration_s": round(duration_s, 3),
+        "max_duration_s": max_duration,
+        "utilization": f"{duration_s / max_duration * 100:.1f}%" if max_duration > 0 else "n/a",
+        "source": source_desc,
+    }
+    if breakdown:
+        details["latency_breakdown_s"] = breakdown
+
+    return EvalResult(
+        score=score,
+        per_invocation_scores=[None] * n,
+        details=details,
+    )
+
+
+if __name__ == "__main__":
+    time_efficiency.run()
diff --git a/evaluators/token_efficiency/README.md b/evaluators/token_efficiency/README.md
new file mode 100644
index 0000000..89a2c43
--- /dev/null
+++ b/evaluators/token_efficiency/README.md
@@ -0,0 +1,57 @@
+# token_efficiency
+
+Scores how efficiently an agent used tokens relative to a budget. Useful for catching runaway token consumption — benchmarks have shown up to 8x variation in token use across agent solutions to the same task.
+
+## How it works
+
+Reads token counts from the trace's `performance_metrics`.
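+
+It understands both the nested agentevals layout and a flat custom-harness layout; for illustration, here are both forms carrying the same counts as the output example below:
+
+```
+{"tokens": {"total_prompt": 75000, "total_output": 10000}}   # nested (agentevals default)
+{"input_tokens": 75000, "output_tokens": 10000}              # flat (custom harness)
+```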
+
+Scores input and output tokens separately against their budgets and returns the lower of the two:
+
+```
+input_score = clamp(1.0 - input_tokens / max_input_tokens, 0, 1)
+output_score = clamp(1.0 - output_tokens / max_output_tokens, 0, 1)
+score = min(input_score, output_score)
+```
+
+A score of 1.0 means zero tokens used; 0.0 means at or over budget. With a threshold of 0.3, the agent must use no more than 70% of each budget to pass.
+
+This is a **trace-level** metric — per-invocation scores are not applicable (token counts come from the full trace).
+
+Returns `NOT_EVALUATED` when no token data is available in the trace.
+
+## Config
+
+| Option | Type | Default | Description |
+|---|---|---|---|
+| `max_input_tokens` | int | 150000 | Input (prompt) token budget |
+| `max_output_tokens` | int | 50000 | Output (completion) token budget |
+
+## Example
+
+```yaml
+evaluators:
+  - name: token_efficiency
+    type: remote
+    source: github
+    ref: evaluators/token_efficiency/token_efficiency.py
+    threshold: 0.3
+    config:
+      max_input_tokens: 100000
+      max_output_tokens: 30000
+```
+
+## Output details
+
+```json
+{
+  "input_tokens": 75000,
+  "output_tokens": 10000,
+  "max_input_tokens": 100000,
+  "max_output_tokens": 30000,
+  "input_utilization": "75.0%",
+  "output_utilization": "33.3%",
+  "input_score": 0.25,
+  "output_score": 0.6667
+}
+```
+
+Requires `agentevals-evaluator-sdk >= 0.1.1`.
diff --git a/evaluators/token_efficiency/evaluator.yaml b/evaluators/token_efficiency/evaluator.yaml
new file mode 100644
index 0000000..864cc38
--- /dev/null
+++ b/evaluators/token_efficiency/evaluator.yaml
@@ -0,0 +1,6 @@
+name: token_efficiency
+description: Scores how efficiently the agent used tokens relative to a budget
+language: python
+entrypoint: token_efficiency.py
+tags: [performance, tokens, efficiency, budget]
+author: agentevals-dev
diff --git a/evaluators/token_efficiency/token_efficiency.py b/evaluators/token_efficiency/token_efficiency.py
new file mode 100644
index 0000000..5bff15f
--- /dev/null
+++ b/evaluators/token_efficiency/token_efficiency.py
@@ -0,0 +1,108 @@
+"""Community evaluator: token_efficiency
+
+Scores token usage relative to a budget. Reads token counts from
+the trace's performance_metrics.
+
+Score formula (per dimension):
+    input_score = clamp(1.0 - total_prompt / max_input_tokens, 0, 1)
+    output_score = clamp(1.0 - total_output / max_output_tokens, 0, 1)
+    score = min(input_score, output_score)
+
+Returns NOT_EVALUATED when no token data is available.
+
+Config options:
+    max_input_tokens (int): Input token budget (default: 150000)
+    max_output_tokens (int): Output token budget (default: 50000)
+"""
+
+from __future__ import annotations
+
+from agentevals_evaluator_sdk import EvalInput, EvalResult, EvalStatus, evaluator
+
+
+def _extract_tokens(perf: dict) -> dict[str, int] | None:
+    """Extract token counts from a performance_metrics dict.
+
+    Supports two layouts:
+        nested (agentevals default): {"tokens": {"total_prompt": N, "total_output": N}}
+        flat (custom harness): {"input_tokens": N, "output_tokens": N}
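+
+    The flat layout also accepts prompt_tokens / completion_tokens as
+    synonyms. Example mapping (counts mirror the README output example):
+        {"tokens": {"total_prompt": 75000, "total_output": 10000}}
+        -> {"input_tokens": 75000, "output_tokens": 10000}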
+    """
+    tokens_block = perf.get("tokens")
+    if isinstance(tokens_block, dict):
+        total_prompt = tokens_block.get("total_prompt")
+        total_output = tokens_block.get("total_output")
+        if total_prompt is not None or total_output is not None:
+            return {
+                "input_tokens": int(total_prompt) if total_prompt is not None else 0,
+                "output_tokens": int(total_output) if total_output is not None else 0,
+            }
+
+    input_t = perf.get("input_tokens") if perf.get("input_tokens") is not None else perf.get("prompt_tokens")
+    output_t = perf.get("output_tokens") if perf.get("output_tokens") is not None else perf.get("completion_tokens")
+
+    if input_t is not None or output_t is not None:
+        return {
+            "input_tokens": int(input_t) if input_t is not None else 0,
+            "output_tokens": int(output_t) if output_t is not None else 0,
+        }
+
+    return None
+
+
+def _get_perf(input: EvalInput) -> dict | None:
+    """Return the first non-None performance_metrics from any invocation."""
+    for inv in input.invocations:
+        if isinstance(inv.performance_metrics, dict):
+            return inv.performance_metrics
+    return None
+
+
+@evaluator
+def token_efficiency(input: EvalInput) -> EvalResult:
+    max_input = input.config.get("max_input_tokens", 150_000)
+    max_output = input.config.get("max_output_tokens", 50_000)
+    n = len(input.invocations)
+
+    perf = _get_perf(input)
+    if perf is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no performance_metrics available"},
+        )
+
+    tokens = _extract_tokens(perf)
+    if tokens is None:
+        return EvalResult(
+            score=0.0,
+            status=EvalStatus.NOT_EVALUATED,
+            per_invocation_scores=[None] * n,
+            details={"reason": "no token data in performance_metrics"},
+        )
+
+    input_tokens = tokens["input_tokens"]
+    output_tokens = tokens["output_tokens"]
+
+    input_score = max(0.0, min(1.0, 1.0 - input_tokens / max_input)) if max_input > 0 else 1.0
+    output_score = max(0.0, min(1.0, 1.0 - output_tokens / max_output)) if max_output > 0 else 1.0
+    score = min(input_score, output_score)
+
+    return EvalResult(
+        score=score,
+        per_invocation_scores=[None] * n,
+        details={
+            "input_tokens": input_tokens,
+            "output_tokens": output_tokens,
+            "max_input_tokens": max_input,
+            "max_output_tokens": max_output,
+            "input_utilization": f"{input_tokens / max_input * 100:.1f}%" if max_input > 0 else "n/a",
+            "output_utilization": f"{output_tokens / max_output * 100:.1f}%" if max_output > 0 else "n/a",
+            "input_score": round(input_score, 4),
+            "output_score": round(output_score, 4),
+        },
+    )
+
+
+if __name__ == "__main__":
+    token_efficiency.run()
diff --git a/evaluators/tool_efficiency/README.md b/evaluators/tool_efficiency/README.md
new file mode 100644
index 0000000..192f32e
--- /dev/null
+++ b/evaluators/tool_efficiency/README.md
@@ -0,0 +1,72 @@
+# tool_efficiency
+
+Scores whether an agent used tools effectively. Catches waste: duplicate calls (same tool + same args), error responses, and exceeding a tool call budget. Also enforces a minimum number of tool calls when tools are required.
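+
+For example, the second call below would count as a duplicate of the first, since it has the same name and the same arguments (the tool name and args are hypothetical):
+
+```
+{"name": "search_docs", "args": {"query": "reset password"}}
+{"name": "search_docs", "args": {"query": "reset password"}}
+```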
+ +## How it works + +Scores each invocation independently based on three factors multiplied together: + +``` +useful_calls = total - duplicates - errors +efficiency = useful_calls / total +budget_factor = clamp(1.0 - max(0, total - max_tool_calls) / max_tool_calls, 0, 1) +min_factor = total / min_tool_calls (when total < min_tool_calls) +score = clamp(efficiency * budget_factor * min_factor, 0, 1) +``` + +- **efficiency**: What fraction of calls were useful (not duplicates, not errors). +- **budget_factor**: Penalizes exceeding `max_tool_calls`. At 2x budget, score goes to 0. +- **min_factor**: Penalizes falling below `min_tool_calls`. 1 call when 2 are required = 0.5x. + +If an invocation has zero tool calls: score is 0.0 when `min_tool_calls > 0`, otherwise 1.0. + +The overall score is the mean across invocations. + +**Duplicate detection**: Two calls are duplicates if they have the same tool name and identical arguments (JSON-serialized, sorted keys). + +**Error detection**: Checks `ToolResponseData.status` for `"error"`, `"failed"`, or `"failure"`. + +## Config + +| Option | Type | Default | Description | +|---|---|---|---| +| `max_tool_calls` | int | 15 | Budget; calls beyond this are penalized | +| `min_tool_calls` | int | 0 | Minimum required; 0 means tools are optional | +| `penalize_duplicates` | bool | true | Count duplicate calls as waste | +| `penalize_errors` | bool | true | Count error responses as waste | + +## Example + +```yaml +evaluators: + - name: tool_efficiency + type: remote + source: github + ref: evaluators/tool_efficiency/tool_efficiency.py + threshold: 0.5 + config: + max_tool_calls: 10 + min_tool_calls: 1 + penalize_duplicates: true + penalize_errors: true +``` + +## Output details + +```json +{ + "per_invocation": [ + { + "invocation_id": "inv-001", + "score": 0.3429, + "total_calls": 7, + "useful_calls": 4, + "duplicate_calls": 2, + "error_responses": 1, + "budget_factor": 0.6 + } + ] +} +``` + +Requires `agentevals-evaluator-sdk >= 0.1.1`. diff --git a/evaluators/tool_efficiency/evaluator.yaml b/evaluators/tool_efficiency/evaluator.yaml new file mode 100644 index 0000000..872f702 --- /dev/null +++ b/evaluators/tool_efficiency/evaluator.yaml @@ -0,0 +1,6 @@ +name: tool_efficiency +description: Scores whether the agent used tools effectively — penalizes waste, duplicates, and errors +language: python +entrypoint: tool_efficiency.py +tags: [performance, tools, efficiency, budget] +author: agentevals-dev diff --git a/evaluators/tool_efficiency/tool_efficiency.py b/evaluators/tool_efficiency/tool_efficiency.py new file mode 100644 index 0000000..d3bcd66 --- /dev/null +++ b/evaluators/tool_efficiency/tool_efficiency.py @@ -0,0 +1,119 @@ +"""Community evaluator: tool_efficiency + +Scores tool usage effectiveness per invocation. Penalizes duplicate calls +(same tool name + args), error responses, and budget overruns. + +Score per invocation: + useful_calls = total - duplicates - errors + efficiency = useful_calls / total + budget_factor = clamp(1.0 - max(0, total - max_tool_calls) / max_tool_calls, 0, 1) + min_factor = total / min_tool_calls (when total < min_tool_calls, else 1.0) + score = clamp(efficiency * budget_factor * min_factor, 0, 1) + +If total == 0: score = 0.0 when min_tool_calls > 0, else 1.0. 
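+
+    Worked example (mirrors the README output example): 7 calls with
+    2 duplicates and 1 error give useful = 4 and efficiency = 4/7;
+    with max_tool_calls = 5 the budget_factor is 0.6, so the score is
+    4/7 * 0.6 ~= 0.3429.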
+ +Config options: + max_tool_calls (int): Budget; calls beyond this are penalized (default: 15) + min_tool_calls (int): Minimum required; 0 means optional (default: 0) + penalize_duplicates (bool): Count duplicate calls as waste (default: true) + penalize_errors (bool): Count error responses as waste (default: true) +""" + +from __future__ import annotations + +import json + +from agentevals_evaluator_sdk import EvalInput, EvalResult, evaluator + + +def _call_signature(call) -> str: + """Deterministic signature for deduplication: 'tool_name::sorted_args_json'.""" + try: + args_str = json.dumps(call.args, sort_keys=True, default=str) + except (TypeError, ValueError): + args_str = str(call.args) + return f"{call.name}::{args_str}" + + +def _is_error_response(response) -> bool: + """Check if a tool response indicates an error via its status field.""" + status = (response.status or "").lower() + return status in ("error", "failed", "failure") + + +@evaluator +def tool_efficiency(input: EvalInput) -> EvalResult: + max_tool_calls = input.config.get("max_tool_calls", 15) + min_tool_calls = input.config.get("min_tool_calls", 0) + penalize_duplicates = input.config.get("penalize_duplicates", True) + penalize_errors = input.config.get("penalize_errors", True) + + scores: list[float] = [] + inv_details: list[dict] = [] + + for inv in input.invocations: + tool_calls = inv.intermediate_steps.tool_calls + tool_responses = inv.intermediate_steps.tool_responses + total = len(tool_calls) + + if total == 0: + score = 0.0 if min_tool_calls > 0 else 1.0 + reason = f"no tool calls (min required: {min_tool_calls})" if min_tool_calls > 0 else "no tool calls (tools optional)" + scores.append(score) + inv_details.append({"invocation_id": inv.invocation_id, "score": score, "reason": reason}) + continue + + dupes = 0 + if penalize_duplicates: + seen: dict[str, int] = {} + for call in tool_calls: + sig = _call_signature(call) + seen[sig] = seen.get(sig, 0) + 1 + dupes = sum(count - 1 for count in seen.values() if count > 1) + + errors = 0 + if penalize_errors: + errors = sum(1 for r in tool_responses if _is_error_response(r)) + + useful = max(0, total - dupes - errors) + efficiency = useful / total + + budget_factor = 1.0 + if max_tool_calls > 0 and total > max_tool_calls: + budget_factor = max(0.0, 1.0 - (total - max_tool_calls) / max_tool_calls) + + min_factor = 1.0 + if min_tool_calls > 0 and total < min_tool_calls: + min_factor = total / min_tool_calls + + score = max(0.0, min(1.0, efficiency * budget_factor * min_factor)) + scores.append(score) + + detail: dict = { + "invocation_id": inv.invocation_id, + "score": round(score, 4), + "total_calls": total, + "useful_calls": useful, + } + if dupes: + detail["duplicate_calls"] = dupes + if errors: + detail["error_responses"] = errors + if budget_factor < 1.0: + detail["budget_factor"] = round(budget_factor, 4) + if min_factor < 1.0: + detail["min_factor"] = round(min_factor, 4) + detail["min_tool_calls"] = min_tool_calls + inv_details.append(detail) + + overall = sum(scores) / len(scores) if scores else 0.0 + + return EvalResult( + score=overall, + per_invocation_scores=scores, + details={"per_invocation": inv_details}, + ) + + +if __name__ == "__main__": + tool_efficiency.run()