diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb3b3533..562bb048 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,11 @@ All notable changes to the [Nucleus Python Client](https://github.com/scaleapi/n
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.14.30](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.30) - 2022-11-29
+
+### Added
+- Support for uploading track-level metrics to external evaluation functions using track_ref_ids
+
 ## [0.14.29](https://github.com/scaleapi/nucleus-python-client/releases/tag/v0.14.29) - 2022-11-22
 
 ### Added
diff --git a/nucleus/__init__.py b/nucleus/__init__.py
index a40ffee2..4d679acb 100644
--- a/nucleus/__init__.py
+++ b/nucleus/__init__.py
@@ -178,7 +178,7 @@ def __init__(
             import tqdm.notebook as tqdm_notebook
 
             self.tqdm_bar = tqdm_notebook.tqdm
-        self._connection = Connection(self.api_key, self.endpoint)
+        self.connection = Connection(self.api_key, self.endpoint)
         self.validate = Validate(self.api_key, self.endpoint)
 
     def __repr__(self):
@@ -1014,16 +1014,16 @@ def create_object_index(
         )
 
     def delete(self, route: str):
-        return self._connection.delete(route)
+        return self.connection.delete(route)
 
     def get(self, route: str):
-        return self._connection.get(route)
+        return self.connection.get(route)
 
     def post(self, payload: dict, route: str):
-        return self._connection.post(payload, route)
+        return self.connection.post(payload, route)
 
     def put(self, payload: dict, route: str):
-        return self._connection.put(payload, route)
+        return self.connection.put(payload, route)
 
     # TODO: Fix return type, can be a list as well. Brings on a lot of mypy errors ...
     def make_request(
@@ -1054,7 +1054,7 @@ def make_request(
                 "Received defined payload with GET request! Will ignore payload"
            )
             payload = None
-        return self._connection.make_request(payload, route, requests_command, return_raw_response)  # type: ignore
+        return self.connection.make_request(payload, route, requests_command, return_raw_response)  # type: ignore
 
     def _set_api_key(self, api_key):
         """Fetch API key from environment variable NUCLEUS_API_KEY if not set"""
diff --git a/nucleus/dataset.py b/nucleus/dataset.py
index a83ed638..25a9cf7a 100644
--- a/nucleus/dataset.py
+++ b/nucleus/dataset.py
@@ -1863,7 +1863,7 @@ def tracks(self) -> List[Track]:
         tracks_list = [
             Track.from_json(
                 payload=track,
-                client=self._client,
+                connection=self._client.connection,
             )
             for track in response[TRACKS_KEY]
         ]
diff --git a/nucleus/scene.py b/nucleus/scene.py
index b05c3cc5..48c8aaeb 100644
--- a/nucleus/scene.py
+++ b/nucleus/scene.py
@@ -330,7 +330,10 @@ def from_json(
         frames = [Frame.from_json(frame) for frame in frames_payload]
         tracks_payload = payload.get(TRACKS_KEY, [])
         tracks = (
-            [Track.from_json(track, client) for track in tracks_payload]
+            [
+                Track.from_json(track, connection=client.connection)
+                for track in tracks_payload
+            ]
             if client
             else []
         )
@@ -680,7 +683,10 @@ def from_json(
         items = [DatasetItem.from_json(item) for item in items_payload]
         tracks_payload = payload.get(TRACKS_KEY, [])
         tracks = (
-            [Track.from_json(track, client) for track in tracks_payload]
+            [
+                Track.from_json(track, connection=client.connection)
+                for track in tracks_payload
+            ]
             if client
             else []
         )
diff --git a/nucleus/track.py b/nucleus/track.py
index 4221dac3..21ee8f7c 100644
--- a/nucleus/track.py
+++ b/nucleus/track.py
@@ -12,7 +12,7 @@
 )
 
 if TYPE_CHECKING:
-    from . import NucleusClient
+    from . import Connection
 
 
 @dataclass  # pylint: disable=R0902
@@ -25,7 +25,7 @@ class Track:  # pylint: disable=R0902
         metadata: Arbitrary key/value dictionary of info to attach to this track.
     """
 
-    _client: "NucleusClient"
+    _connection: "Connection"
     dataset_id: str
     reference_id: str
     metadata: Optional[dict] = None
@@ -41,10 +41,10 @@ def __eq__(self, other):
         )
 
     @classmethod
-    def from_json(cls, payload: dict, client: "NucleusClient"):
+    def from_json(cls, payload: dict, connection: "Connection"):
         """Instantiates track object from schematized JSON dict payload."""
         return cls(
-            _client=client,
+            _connection=connection,
             reference_id=str(payload[REFERENCE_ID_KEY]),
             dataset_id=str(payload[DATASET_ID_KEY]),
             metadata=payload.get(METADATA_KEY, None),
@@ -79,7 +79,7 @@ def update(
             entire metadata object will be overwritten. Otherwise, only the keys in
             metadata will be overwritten.
         """
-        self._client.make_request(
+        self._connection.make_request(
             payload={
                 REFERENCE_ID_KEY: self.reference_id,
                 METADATA_KEY: metadata,
diff --git a/nucleus/validate/__init__.py b/nucleus/validate/__init__.py
index 6ad24442..33b0a7eb 100644
--- a/nucleus/validate/__init__.py
+++ b/nucleus/validate/__init__.py
@@ -7,7 +7,7 @@
 ]
 
 from .client import Validate
-from .constants import ThresholdComparison
+from .constants import EntityLevel, ThresholdComparison
 from .data_transfer_objects.eval_function import (
     EvalFunctionEntry,
     EvaluationCriterion,
diff --git a/nucleus/validate/client.py b/nucleus/validate/client.py
index 840e7847..fd84cf61 100644
--- a/nucleus/validate/client.py
+++ b/nucleus/validate/client.py
@@ -213,7 +213,7 @@ def create_external_eval_function(
 
         Args:
             name: unique name of evaluation function
-            level: level at which the eval function is run, defaults to "item"
+            level: level at which the eval function is run, defaults to EntityLevel.ITEM.
 
         Raises:
             - NucleusAPIError if the creation of the function fails on the server side
diff --git a/nucleus/validate/constants.py b/nucleus/validate/constants.py
index 61e90831..92ad12c1 100644
--- a/nucleus/validate/constants.py
+++ b/nucleus/validate/constants.py
@@ -23,7 +23,14 @@ class ThresholdComparison(str, Enum):
 
 
 class EntityLevel(str, Enum):
-    """Level for evaluation functions and unit tests."""
+    """
+    Data level at which evaluation functions produce outputs.
+    For instance, when comparing results across dataset items, use
+    `EntityLevel.ITEM`. For scenes, use `EntityLevel.SCENE`. Finally,
+    when comparing results between tracks within a single scene or a
+    holistic item dataset, use `EntityLevel.TRACK`.
+ """ + TRACK = "track" ITEM = "item" SCENE = "scene" diff --git a/nucleus/validate/data_transfer_objects/scenario_test_evaluations.py b/nucleus/validate/data_transfer_objects/scenario_test_evaluations.py index 87197851..46bf5e71 100644 --- a/nucleus/validate/data_transfer_objects/scenario_test_evaluations.py +++ b/nucleus/validate/data_transfer_objects/scenario_test_evaluations.py @@ -6,6 +6,7 @@ class EvaluationResult(ImmutableModel): + track_ref_id: Optional[str] = None item_ref_id: Optional[str] = None scene_ref_id: Optional[str] = None score: float = 0 @@ -15,16 +16,15 @@ class EvaluationResult(ImmutableModel): def is_item_or_scene_provided( cls, values ): # pylint: disable=no-self-argument - if ( - values.get("item_ref_id") is None - and values.get("scene_ref_id") is None - ) or ( - ( - values.get("item_ref_id") is not None - and values.get("scene_ref_id") is not None + ref_ids = [ + values.get("track_ref_id", None), + values.get("item_ref_id", None), + values.get("scene_ref_id", None), + ] + if len([ref_id for ref_id in ref_ids if ref_id is not None]) != 1: + raise ValueError( + "Must provide exactly one of track_ref_id, item_ref_id, or scene_ref_id" ) - ): - raise ValueError("Must provide either item_ref_id or scene_ref_id") return values @validator("score", "weight") diff --git a/nucleus/validate/scenario_test.py b/nucleus/validate/scenario_test.py index bcd9b6f1..41b4db30 100644 --- a/nucleus/validate/scenario_test.py +++ b/nucleus/validate/scenario_test.py @@ -8,9 +8,16 @@ from typing import List, Optional, Union from ..connection import Connection -from ..constants import DATASET_ITEMS_KEY, NAME_KEY, SCENES_KEY, SLICE_ID_KEY +from ..constants import ( + DATASET_ITEMS_KEY, + NAME_KEY, + SCENES_KEY, + SLICE_ID_KEY, + TRACKS_KEY, +) from ..dataset_item import DatasetItem from ..scene import Scene +from ..track import Track from .constants import ( EVAL_FUNCTION_ID_KEY, SCENARIO_TEST_ID_KEY, @@ -166,8 +173,8 @@ def get_eval_history(self) -> List[ScenarioTestEvaluation]: def get_items( self, level: EntityLevel = EntityLevel.ITEM - ) -> Union[List[DatasetItem], List[Scene]]: - """Gets items within a scenario test at a given level, returning a list of DatasetItem or Scene objects. + ) -> Union[List[Track], List[DatasetItem], List[Scene]]: + """Gets items within a scenario test at a given level, returning a list of Track, DatasetItem, or Scene objects. Args: level: :class:`EntityLevel` @@ -178,14 +185,22 @@ def get_items( response = self.connection.get( f"validate/scenario_test/{self.id}/items", ) + if level == EntityLevel.TRACK: + return [ + Track.from_json(track, connection=self.connection) + for track in response.get(TRACKS_KEY, []) + ] if level == EntityLevel.SCENE: return [ Scene.from_json(scene, skip_validate=True) - for scene in response[SCENES_KEY] + for scene in response.get(SCENES_KEY, []) ] - return [ - DatasetItem.from_json(item) for item in response[DATASET_ITEMS_KEY] - ] + if level == EntityLevel.ITEM: + return [ + DatasetItem.from_json(item) + for item in response.get(DATASET_ITEMS_KEY, []) + ] + raise ValueError(f"Invalid entity level: {level}") def set_baseline_model(self, model_id: str): """Sets a new baseline model for the ScenarioTest. In order to be eligible to be a baseline, @@ -222,23 +237,41 @@ def upload_external_evaluation_results( len(results) > 0 ), "Submitting evaluation requires at least one result." 
 
-        level = EntityLevel.ITEM
+        level: Optional[EntityLevel] = None
         metric_per_ref_id = {}
         weight_per_ref_id = {}
         aggregate_weighted_sum = 0.0
         aggregate_weight = 0.0
 
+        # Ensures results at only one EntityLevel are provided, otherwise raises a ValueError
+        def ensure_level_consistency_or_raise(
+            cur_level: Optional[EntityLevel], new_level: EntityLevel
+        ):
+            if level is not None and level != new_level:
+                raise ValueError(
+                    f"All evaluation results must only pertain to one level. Received {cur_level} then {new_level}"
+                )
+
         # aggregation based on https://en.wikipedia.org/wiki/Weighted_arithmetic_mean
         for r in results:
-            # Ensure results are uploaded ONLY for items or ONLY for scenes
+            # Ensure results are uploaded ONLY for ONE OF tracks, items, and scenes
+            if r.track_ref_id is not None:
+                ensure_level_consistency_or_raise(level, EntityLevel.TRACK)
+                level = EntityLevel.TRACK
+            if r.item_ref_id is not None:
+                ensure_level_consistency_or_raise(level, EntityLevel.ITEM)
+                level = EntityLevel.ITEM
             if r.scene_ref_id is not None:
+                ensure_level_consistency_or_raise(level, EntityLevel.SCENE)
                 level = EntityLevel.SCENE
-            if r.item_ref_id is not None and level == EntityLevel.SCENE:
-                raise ValueError(
-                    "All evaluation results must either pertain to a scene_ref_id or an item_ref_id, not both."
-                )
             ref_id = (
-                r.item_ref_id if level == EntityLevel.ITEM else r.scene_ref_id
+                r.track_ref_id
+                if level == EntityLevel.TRACK
+                else (
+                    r.item_ref_id
+                    if level == EntityLevel.ITEM
+                    else r.scene_ref_id
+                )
             )
 
             # Aggregate scores and weights
@@ -255,7 +288,7 @@ def upload_external_evaluation_results(
             "overall_metric": aggregate_weighted_sum / aggregate_weight,
             "model_id": model_id,
             "slice_id": self.slice_id,
-            "level": level.value,
+            "level": level.value if level else None,
         }
         response = self.connection.post(
             payload,
diff --git a/pyproject.toml b/pyproject.toml
index ec1f09bb..a81cb572 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ exclude = '''
 
 [tool.poetry]
 name = "scale-nucleus"
-version = "0.14.29"
+version = "0.14.30"
 description = "The official Python client library for Nucleus, the Data Platform for AI"
 license = "MIT"
 authors = ["Scale AI Nucleus Team "]
diff --git a/tests/helpers.py b/tests/helpers.py
index fd3ca13c..7de8ef7e 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -20,6 +20,7 @@
 DATASET_WITH_EMBEDDINGS = "ds_c8jwdhy4y4f0078hzceg"
 NUCLEUS_PYTEST_USER_ID = "60ad648c85db770026e9bf77"
 
+EVAL_FUNCTION_NAME = "eval_fn"
 EVAL_FUNCTION_THRESHOLD = 0.5
 EVAL_FUNCTION_COMPARISON = ThresholdComparison.GREATER_THAN_EQUAL_TO
 
diff --git a/tests/test_track.py b/tests/test_track.py
index f336218d..0e23c0da 100644
--- a/tests/test_track.py
+++ b/tests/test_track.py
@@ -1,3 +1,4 @@
+import time
 from copy import deepcopy
 
 import pytest
@@ -69,7 +70,7 @@ def test_create_mp_with_tracks(CLIENT, dataset_scene):
     expected_track_reference_ids = [
         ann["track_reference_id"] for ann in TEST_SCENE_BOX_PREDS_WITH_TRACK
     ]
-    model_reference = "model_test_create_mp_with_tracks"
+    model_reference = "model_" + str(time.time())
     model = CLIENT.create_model(TEST_MODEL_NAME, model_reference)
 
     # Act
diff --git a/tests/validate/test_scenario_test.py b/tests/validate/test_scenario_test.py
index 3b8b6b47..61983b5b 100644
--- a/tests/validate/test_scenario_test.py
+++ b/tests/validate/test_scenario_test.py
@@ -1,11 +1,21 @@
+import time
+
 import pytest
 
+from nucleus.annotation import BoxAnnotation
 from nucleus.validate import CreateScenarioTestError
 from nucleus.validate.constants import EntityLevel
+from nucleus.validate.data_transfer_objects.scenario_test_evaluations import (
+    EvaluationResult,
+)
 from nucleus.validate.scenario_test import ScenarioTest
 from tests.helpers import (
     EVAL_FUNCTION_COMPARISON,
-    EVAL_FUNCTION_THRESHOLD,
+    EVAL_FUNCTION_NAME,
+    TEST_MODEL_NAME,
+    TEST_SCENE_BOX_ANNS_WITH_TRACK,
+    TEST_SCENE_BOX_PREDS_WITH_TRACK,
+    TEST_TRACK_REFERENCE_ID,
     get_uuid,
 )
 
@@ -54,44 +64,132 @@ def test_scenario_test_get_dataset_items(
     test_slice,
     slice_items,
 ):
+    # Arrange
     test_name = "scenario_test_" + get_uuid()  # use uuid to make unique
+    expected_items_locations = [item.image_location for item in slice_items]
 
+    # Act
     scenario_test = CLIENT.validate.create_scenario_test(
         name=test_name,
         slice_id=test_slice.id,
         evaluation_functions=[CLIENT.validate.eval_functions.bbox_iou()],
     )
-
-    expected_items_locations = [item.image_location for item in slice_items]
     actual_items_locations = [
         item.image_location for item in scenario_test.get_items()
     ]
+
+    # Assert
     assert set(actual_items_locations).issubset(expected_items_locations)
+
+    # Clean
     CLIENT.validate.delete_scenario_test(scenario_test.id)
 
 
 def test_scenario_test_get_scenes(
-    CLIENT, test_scene_slice, slice_scenes, annotations
+    CLIENT,
+    test_scene_slice,
+    slice_scenes,
 ):
+    # Arrange
     test_name = "scenario_test_" + get_uuid()  # use uuid to make unique
+    expected_scene_reference_ids = [
+        scene.reference_id for scene in slice_scenes
+    ]
 
+    # Act
     scenario_test = CLIENT.validate.create_scenario_test(
         name=test_name,
         slice_id=test_scene_slice.id,
         evaluation_functions=[CLIENT.validate.eval_functions.bbox_iou()],
     )
-
-    expected_scene_reference_ids = [
-        scene.reference_id for scene in slice_scenes
-    ]
     actual_scene_reference_ids = [
         scene.reference_id
         for scene in scenario_test.get_items(level=EntityLevel.SCENE)
     ]
+
+    # Assert
     assert set(actual_scene_reference_ids).issubset(
         expected_scene_reference_ids
     )
+
+    # Clean
+    CLIENT.validate.delete_scenario_test(scenario_test.id)
+
+
+def test_scenario_test_get_tracks(
+    CLIENT, populated_scene_dataset, test_scene_slice, annotations
+):
+    # Arrange
+    test_name = "scenario_test_" + get_uuid()
+    expected_track_reference_ids = list(
+        set(
+            [
+                ann["track_reference_id"]
+                for ann in TEST_SCENE_BOX_ANNS_WITH_TRACK
+            ]
+        )
+    )
+    annotations = [
+        BoxAnnotation.from_json(ann) for ann in TEST_SCENE_BOX_ANNS_WITH_TRACK
+    ]
+    model_reference = "model_" + str(time.time())
+    model = CLIENT.create_model(TEST_MODEL_NAME, model_reference)
+    populated_scene_dataset.upload_predictions(
+        model=model,
+        predictions=[
+            BoxAnnotation.from_json(ann)
+            for ann in TEST_SCENE_BOX_PREDS_WITH_TRACK
+        ],
+        update=False,
+        asynchronous=False,
+    )
+    populated_scene_dataset.annotate(
+        annotations=annotations,
+        update=False,
+        asynchronous=False,
+    )
+    try:
+        CLIENT.validate.create_external_eval_function(
+            name=EVAL_FUNCTION_NAME,
+            level=EntityLevel.TRACK,
+        )
+    except Exception:  # pylint: disable=W0703
+        # Ignore external eval function already created
+        pass
+    all_external_fns = CLIENT.validate.eval_functions.external_functions
+    eval_fn_config = all_external_fns[EVAL_FUNCTION_NAME]
+    scenario_test = CLIENT.validate.create_scenario_test(
+        name=test_name,
+        slice_id=test_scene_slice.id,
+        evaluation_functions=[eval_fn_config],
+    )
+    scenario_test.upload_external_evaluation_results(
+        eval_fn_config,
+        [
+            EvaluationResult(
+                track_ref_id=TEST_TRACK_REFERENCE_ID,
+                score=1,
+                weight=1,
+            ),
+        ],
+        model.id,
+    )
+
+    # Act
+    actual_track_reference_ids = [
+        track.reference_id
+        for track in scenario_test.get_items(level=EntityLevel.TRACK)
+    ]
+
+    # Assert
+    assert len(actual_track_reference_ids) == len(expected_track_reference_ids)
+    assert set(actual_track_reference_ids).issubset(
+        expected_track_reference_ids
+    )
+
+    # Clean
     CLIENT.validate.delete_scenario_test(scenario_test.id)
+    assert CLIENT.delete_model(model.id) == {}
 
 
 def test_no_criteria_raises_error(CLIENT, test_slice, annotations):
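
For reference, a minimal usage sketch of the track-level flow this diff enables, modeled on the new test_scenario_test_get_tracks test above. The API key, eval function name, slice id, model id, and track reference id are placeholders, not values from this PR; as in the test, creating an external eval function that already exists is simply ignored.

```python
from nucleus import NucleusClient
from nucleus.validate.constants import EntityLevel
from nucleus.validate.data_transfer_objects.scenario_test_evaluations import (
    EvaluationResult,
)

client = NucleusClient("YOUR_API_KEY")  # placeholder API key

# Register (or reuse) an external eval function that reports at the track level.
try:
    client.validate.create_external_eval_function(
        name="my_track_eval",  # placeholder name
        level=EntityLevel.TRACK,
    )
except Exception:
    pass  # already registered
eval_fn = client.validate.eval_functions.external_functions["my_track_eval"]

# Attach the function to a scenario test built on an existing slice.
scenario_test = client.validate.create_scenario_test(
    name="my_track_scenario_test",
    slice_id="slc_...",  # placeholder slice id
    evaluation_functions=[eval_fn],
)

# Upload one score per track; exactly one of track_ref_id, item_ref_id,
# or scene_ref_id may be set on each EvaluationResult.
scenario_test.upload_external_evaluation_results(
    eval_fn,
    [EvaluationResult(track_ref_id="track_0", score=0.8, weight=1)],
    "your_model_id",  # placeholder model id
)

# Tracks covered by the scenario test can now be fetched directly.
tracks = scenario_test.get_items(level=EntityLevel.TRACK)
```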