Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/database/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,21 @@ def tag(id_: int, tag_: str, *, user_id: int, connection: Connection) -> None:
)


def untag(id_: int, tag_: str, *, connection: Connection) -> None:
connection.execute(
text(
"""
DELETE FROM dataset_tag
WHERE `id` = :dataset_id AND `tag` = :tag
""",
),
parameters={
"dataset_id": id_,
"tag": tag_,
},
)


def get_description(
id_: int,
connection: Connection,
Expand Down
33 changes: 33 additions & 0 deletions src/routers/openml/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,46 @@ def tag_dataset(
}


@router.post(
path="/untag",
)
def untag_dataset(
data_id: Annotated[int, Body()],
tag: Annotated[str, SystemString64],
user: Annotated[User | None, Depends(fetch_user)] = None,
expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
) -> dict[str, dict[str, Any]]:
if user is None:
raise create_authentication_failed_error()

tags = database.datasets.get_tags_for(data_id, expdb_db)
if tag.casefold() not in [t.casefold() for t in tags]:
raise create_tag_not_found_error(data_id, tag)

database.datasets.untag(data_id, tag, connection=expdb_db)
Comment on lines +64 to +67
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (bug_risk): Case-insensitive existence check but case-sensitive delete can lead to silent no-op.

The membership check uses casefold() but untag receives the raw tag. If the stored tag differs only by case (e.g. "Foo" vs "foo"), the check can succeed while the DELETE affects 0 rows (depending on DB collation), so the endpoint reports success without untagging. Normalize the tag consistently for both check and delete, or enforce a canonical casing in storage so their behavior matches.

return {
"data_untag": {"id": str(data_id)},
Comment on lines +63 to +69
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use the canonical stored tag for delete to avoid false-success on mixed-case input.

Validation is case-insensitive, but deletion uses the raw input. On a case-sensitive collation, this can return success without removing anything.

💡 Suggested fix
-    tags = database.datasets.get_tags_for(data_id, expdb_db)
-    if tag.casefold() not in [t.casefold() for t in tags]:
+    tags = database.datasets.get_tags_for(data_id, expdb_db)
+    matching_tag = next((existing for existing in tags if existing.casefold() == tag.casefold()), None)
+    if matching_tag is None:
         raise create_tag_not_found_error(data_id, tag)
 
-    database.datasets.untag(data_id, tag, connection=expdb_db)
+    database.datasets.untag(data_id, matching_tag, connection=expdb_db)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
tags = database.datasets.get_tags_for(data_id, expdb_db)
if tag.casefold() not in [t.casefold() for t in tags]:
raise create_tag_not_found_error(data_id, tag)
database.datasets.untag(data_id, tag, connection=expdb_db)
return {
"data_untag": {"id": str(data_id)},
tags = database.datasets.get_tags_for(data_id, expdb_db)
matching_tag = next((existing for existing in tags if existing.casefold() == tag.casefold()), None)
if matching_tag is None:
raise create_tag_not_found_error(data_id, tag)
database.datasets.untag(data_id, matching_tag, connection=expdb_db)
return {
"data_untag": {"id": str(data_id)},
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/routers/openml/datasets.py` around lines 63 - 69, Validation currently
compares tag case-insensitively using tags =
database.datasets.get_tags_for(data_id, expdb_db) but then calls
database.datasets.untag(data_id, tag, connection=expdb_db) with the raw input,
which can no-op on case-sensitive DBs; change the flow to find the canonical
stored tag from tags (e.g., pick the element t from tags where t.casefold() ==
tag.casefold()) and pass that canonical value to database.datasets.untag; keep
the same create_tag_not_found_error path when no match is found and return the
same payload using the data_id.

}


def create_authentication_failed_error() -> HTTPException:
return HTTPException(
status_code=HTTPStatus.PRECONDITION_FAILED,
detail={"code": "103", "message": "Authentication failed"},
)


def create_tag_not_found_error(data_id: int, tag: str) -> HTTPException:
return HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
detail={
"code": "474",
"message": "Entity not tagged by this tag.",
"additional_information": f"id={data_id}; tag={tag}",
},
)


def create_tag_exists_error(data_id: int, tag: str) -> HTTPException:
return HTTPException(
status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
Expand Down
68 changes: 68 additions & 0 deletions tests/routers/openml/dataset_tag_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,71 @@ def test_dataset_tag_invalid_tag_is_rejected(

assert new.status_code == HTTPStatus.UNPROCESSABLE_ENTITY
assert new.json()["detail"][0]["loc"] == ["body", "tag"]


@pytest.mark.parametrize(
"key",
[None, ApiKey.INVALID],
ids=["no authentication", "invalid key"],
)
def test_dataset_untag_rejects_unauthorized(key: ApiKey, py_api: TestClient) -> None:
apikey = "" if key is None else f"?api_key={key}"
response = py_api.post(
f"/datasets/untag{apikey}",
json={"data_id": 1, "tag": "study_14"},
)
assert response.status_code == HTTPStatus.PRECONDITION_FAILED
assert response.json()["detail"] == {"code": "103", "message": "Authentication failed"}


@pytest.mark.parametrize(
"key",
[ApiKey.ADMIN, ApiKey.SOME_USER, ApiKey.OWNER_USER],
ids=["administrator", "non-owner", "owner"],
)
def test_dataset_untag(key: ApiKey, expdb_test: Connection, py_api: TestClient) -> None:
dataset_id, tag = 1, "study_14" # Dataset 1 already has tag 'study_14'
response = py_api.post(
f"/datasets/untag?api_key={key}",
json={"data_id": dataset_id, "tag": tag},
)
assert response.status_code == HTTPStatus.OK
assert response.json() == {"data_untag": {"id": str(dataset_id)}}

tags = get_tags_for(id_=dataset_id, connection=expdb_test)
assert tag not in tags


def test_dataset_untag_fails_if_tag_does_not_exist(py_api: TestClient) -> None:
dataset_id, tag = 1, "nonexistent_tag"
response = py_api.post(
f"/datasets/untag?api_key={ApiKey.ADMIN}",
json={"data_id": dataset_id, "tag": tag},
)
assert response.status_code == HTTPStatus.INTERNAL_SERVER_ERROR
expected = {
"detail": {
"code": "474",
"message": "Entity not tagged by this tag.",
"additional_information": f"id={dataset_id}; tag={tag}",
},
}
assert expected == response.json()


@pytest.mark.parametrize(
"tag",
["", "h@", " a", "a" * 65],
ids=["too short", "@", "space", "too long"],
)
def test_dataset_untag_invalid_tag_is_rejected(
tag: str,
py_api: TestClient,
) -> None:
response = py_api.post(
f"/datasets/untag?api_key={ApiKey.ADMIN}",
json={"data_id": 1, "tag": tag},
)

assert response.status_code == HTTPStatus.UNPROCESSABLE_ENTITY
assert response.json()["detail"][0]["loc"] == ["body", "tag"]