72 changes: 71 additions & 1 deletion open-api/rest-catalog-open-api.py
@@ -18,7 +18,7 @@
from __future__ import annotations

from datetime import date, timedelta
from typing import Literal
from typing import Any, Literal
from uuid import UUID

from pydantic import BaseModel, Extra, Field
@@ -523,6 +523,26 @@ class StorageCredential(BaseModel):
config: dict[str, str]


class MaskHashSha256(BaseModel):
__root__: Any = Field(
...,
description='Mask the data of the column by applying SHA-256.\nThe input must be UTF-8 encoded bytes of the column value.\nThe SHA-256 digest is represented as a lowercase hexadecimal string.\nEngines must follow this procedure to ensure consistency:\n1. Convert the column value to a UTF-8 byte array.\n2. Apply the SHA-256 algorithm as specified in NIST FIPS 180-4.\n3. Convert the resulting 32-byte digest to a 64-character lowercase hexadecimal string.\n',
)


class ReplaceWithNull(BaseModel):
__root__: Any = Field(
..., description='Masks data by replacing it with a NULL value.'
)


class MaskAlphanumeric(BaseModel):
__root__: Any = Field(
...,
description="mask all alphabetic characters with 'x' and numeric characters with 'n'",
)


class LoadCredentialsResponse(BaseModel):
storage_credentials: list[StorageCredential] = Field(
..., alias='storage-credentials'
@@ -1102,6 +1122,14 @@ class SetStatisticsUpdate(BaseUpdate):
statistics: StatisticsFile


class ApplyTransform(BaseModel):
"""
Replace the field with the result of a transform expression. Produce the original field name with the transformed values.
"""

term: Term | None = None


class UnaryExpression(BaseModel):
type: Literal['is-null', 'not-null', 'is-nan', 'not-nan']
term: Term
@@ -1121,6 +1149,15 @@ class SetExpression(BaseModel):
values: list[PrimitiveTypeValue]


class Action(BaseModel):
__root__: MaskHashSha256 | ReplaceWithNull | MaskAlphanumeric | ApplyTransform = (
Field(
...,
description='Defines the specific action to be executed for computing the projection.',
)
)


class ResidualFilter6(SetExpression, ResidualFilter1):
"""
An optional filter to be applied to rows in this file scan task.
@@ -1142,6 +1179,18 @@ class ResidualFilter8(UnaryExpression, ResidualFilter1):
"""


class Projection(BaseModel):
"""
Defines a projection for a column. If action is not specified, the column is projected as-is.

"""

field_id: int = Field(
..., alias='field-id', description='field id of the column being projected.'
)
action: Action | None = None


class StructField(BaseModel):
id: int
name: str
@@ -1292,6 +1341,26 @@ class ViewUpdate(BaseModel):
)


class ReadRestrictions(BaseModel):
"""
Read restrictions for a table, including column projections and row filter expressions.
A client MUST enforce the restrictions defined in this object when reading data from the table.
These restrictions apply only to the authenticated principal, user, or account associated with the request. They MUST NOT be interpreted as global policy and MUST NOT be applied beyond the entity identified by the Authentication header (or other applicable authentication mechanism).

"""

required_column_projections: list[Projection] | None = Field(
None,
alias='required-column-projections',
description="A list of projections that MUST be applied prior to any query-specified projections. If this property is absent, no mandatory projection applies, and a reader MAY project any subset of columns of the table, including all columns.\n1. A reader MUST project only columns listed in the required-column-projections.\n - If a listed column has a transform, the reader MUST apply it and replace\n all references to the underlying column with the transformed value\n (for example, truncate[4](cc) MUST be projected as truncate[4](cc) AS cc,\n and all references to cc during query evaluation post applying required-row-filter MUST resolve to this alias).\n - Columns not listed in the required-column-projections MUST NOT be read.\n\n2. A column MUST appear at most once in the required-column-projections.\n3. If a projected column's corresponding entry includes an action that the reader cannot evaluate,\n the reader MUST fail rather than ignore the transform.\n\n4. An identity transform is equivalent to projecting the column directly.\n5. The data type of the projected column MUST match the data type defined for the transform result.\n",
)
required_row_filter: Expression | None = Field(
None,
alias='required-row-filter',
description='An expression that filters rows in the table that the authenticated principal does not have access to.\n1. A reader MUST discard any row for which the filter evaluates to false or null, and\n no information derived from discarded rows MAY be included in the query result.\n\n2. Row filters MUST be evaluated against the original, untransformed column values.\n Required projections MUST be applied only after row filters are applied.\n\n3. If a client cannot interpret or evaluate a provided filter expression, it MUST fail.\n4. If this property is absent, null, or always true then no mandatory filtering is required.\n',
)


class LoadTableResult(BaseModel):
"""
Result used when a table is successfully loaded.
@@ -1337,6 +1406,7 @@ class LoadTableResult(BaseModel):
storage_credentials: list[StorageCredential] | None = Field(
None, alias='storage-credentials'
)
read_restrictions: ReadRestrictions | None = Field(None, alias='read-restrictions')


class ScanTasks(BaseModel):
101 changes: 101 additions & 0 deletions open-api/rest-catalog-open-api.yaml
@@ -3347,6 +3347,105 @@ components:
additionalProperties:
type: string

ReadRestrictions:
type: object
description: >
Read restrictions for a table, including column projections and row filter expressions.

A client MUST enforce the restrictions defined in this object when reading data
from the table.
Contributor:
Should we state that this is an expectation for "trusted" clients?

Contributor Author (@singhpk234), Nov 10, 2025:
My understanding is that we don't define what trust means (for example, untampered or sandboxed compute), so we would need to define it if we were to reference it here. The MUST already implies that expectation on the client side. Please let me know your thoughts considering the above. If we were to define trust, there are additional things I think it would be helpful to define, such as predicate reorder attacks:

Suppose that a user who has access only to red widgets executes the query shown earlier:

SELECT *
    FROM widgets_view
    WHERE 1/iff(color = 'Purple', 0, 1) = 1;

If 1/iff(color = 'Purple', 0, 1) = 1 were executed before filtering out the non-red widgets, the user would learn that a row with color = 'Purple' exists.


These restrictions apply only to the authenticated principal, user, or account
Contributor:
It could be more than that (for example, it could be based on the environment as well). Should we just summarize it as the authorization context?

I guess it also implies that caching may be impacted as well:

  • for example, the ETag should be different for each response
  • the Vary header should be set by the server to include Authorization (or any other authorization-related header), and clients supporting caching should check it

Contributor Author:

"Should we just summarize it as the authorization context?"

We haven't defined Authorization yet in the IRC, as this is entirely managed by the catalog (for example grants / policies). I do agree these are like authorization predicates, but wouldn't saying this depends on the authenticated principal suffice? Do you have any specific case in mind?

"Etag should be different for each response"

My understanding was that the ETag handling should be the same as what we do for storage credentials: if the caller has a different authenticated principal, the catalog sends credentials accordingly. Let me see what we say from the ETag point of view.

Contributor:
I meant authorization in a broader HTTP sense, not necessarily in terms of catalog primitives (like grants/policies). What the authorization itself represents is open for interpretation. Users/principals are the obvious ones, but the environment (for example the engine's trustworthiness, or whether the client is from within vs. outside) could be part of it. That said, it should be opaque to the client.

Re ETag, I was not able to find any strong documentation regarding its support in Iceberg. But ETag is an HTTP concept (not an Iceberg one) and its semantics are about the whole response, not a part of it. If there were intermediaries handling ETags and preconditions, this could cause them to return the same response to the wrong audience, causing security issues.

Contributor Author:

"User/principals are the obvious ones, but the environment (for example the engine trustworthiness, or if the client is from within vs outside) could be part of it."

My understanding is that a trusted engine is more about authenticating both the user and the engine (one the catalog trusts), while still authorizing against the user's grants. Do we want to be explicit about trust, or is being implicit fine? There are many ways to establish trust, such as mTLS / on-behalf-of, etc. Is the feedback to not go into specifics at all?

"But ETag is an HTTP concept (not an Iceberg one) and the semantics are about the whole response, not a part of it"

I understand; I meant Iceberg's ETag handling. My understanding is that we should validate that nothing changed after doing the authorization checks. The authorization check defines what credentials one gets, and the same applies to these read restrictions; in fact we do similar handling in Polaris:

  1. do AuthZ checks here
  2. then from the table's most recent metadata pointer create an ETag and match it against the ETag from the request; if they match, nothing changed here

I am not sure if we can do ETag checks in general on a protected resource without authorization checks in place.

Contributor:

"Because there are many ways to establish trust such as mTLS / on-behalf etc. Is the feedback to not go into specifics at all?"

Yes, that's basically the core of it :)

"I am not sure if we can do ETag checks in general on a protected resource without authorization checks in place?"

I'm not sure the Polaris implementation is 100% correct. The issue is not in the authorization part, but in the fact that the ETag would only cover the table metadata representation and none of the other data returned in the response, so a client may have cached the temporary credentials (or possibly also the read restrictions) and, because it got a 304 Not Modified from the server, assume it can reuse the representation it stored.

That extra information could also be included in the ETag computation (it may become too expensive, though?), and assuming the client keeps a cache per Authorization header (plus whatever is returned by the Vary header), it could possibly be cache-compatible.

Contributor Author:

"this may cause those intermediaries to return the same response to the wrong audience, causing security issues"

My authorization requirement before returning a 304 was in the context above; I do agree that post-AuthZ we need to validate against the whole response instead of a partial response. But what I wanted to suggest is that we should do the same for read-restrictions as we do for credentials. I think we both agree on this, right?

My understanding is that in a multi-tenant system, it is the responsibility of that system to be multi-tenant aware while caching. Are we saying the client by default assumes everything is cacheable and relies on the server to send an apt Vary header so it can decide what cache key to use? My understanding is we already cache things like credentials + the OAuth URI on the client side for multi-tenant environments in a recent implementation:

String cacheKey = oauth2ServerUri + ":" + config.credential();

https://github.com/apache/iceberg/pull/14178/files#diff-9ca16b7aa108c5b1ef3d8b7695b898ea78fedcd710ea1ae13e3db4d10072b62dR200

That being said, if we say ETag is an HTTP concept and a multi-tenant system should keep user-specific caches, do we need to elaborate more?
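
A minimal Python sketch of the per-authorization cache key discussed above (a client honoring Vary: Authorization keeps separate cache entries per Authorization value); the function name and key scheme are illustrative assumptions, not part of the spec or any Iceberg client API:

    import hashlib

    def response_cache_key(url: str, authorization: str) -> str:
        # Scope cached responses to the caller's authorization so a cached
        # representation (or 304) for one principal is never reused for another.
        auth_digest = hashlib.sha256(authorization.encode("utf-8")).hexdigest()
        return f"{url}|{auth_digest}"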

associated with the request. They MUST NOT be interpreted as global policy and
MUST NOT be applied beyond the entity identified by the Authentication header
(or other applicable authentication mechanism).
properties:
required-column-projections:
description: >
A list of projections that MUST be applied prior to any query-specified
projections.
If this property is absent, no mandatory projection applies,
and a reader MAY project any subset of columns of the table, including all columns.

1. A reader MUST project only columns listed in the required-column-projections.
- If a listed column has a transform, the reader MUST apply it and replace
all references to the underlying column with the transformed value
(for example, truncate[4](cc) MUST be projected as truncate[4](cc) AS cc,
and all references to cc during query evaluation post applying required-row-filter MUST resolve to this alias).
- Columns not listed in the required-column-projections MUST NOT be read.

2. A column MUST appear at most once in the required-column-projections.

3. If a projected column's corresponding entry includes an action that the reader cannot evaluate,
the reader MUST fail rather than ignore the transform.

4. An identity transform is equivalent to projecting the column directly.

5. The data type of the projected column MUST match the data type defined for the transform result.

type: array
items:
$ref: '#/components/schemas/Projection'
required-row-filter:
description: >
An expression that filters rows in the table that the authenticated principal does not have access to.

1. A reader MUST discard any row for which the filter evaluates to false or null, and
no information derived from discarded rows MAY be included in the query result.

2. Row filters MUST be evaluated against the original, untransformed column values.
Required projections MUST be applied only after row filters are applied.

3. If a client cannot interpret or evaluate a provided filter expression, it MUST fail.

4. If this property is absent, null, or always true then no mandatory filtering is required.
$ref: '#/components/schemas/Expression'
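
A minimal Python sketch of the enforcement order these two descriptions imply — apply the required row filter first, against original column values, then apply the mandatory projections, failing on any action the reader cannot evaluate. The helpers evaluate_filter, column_name, and apply_action are hypothetical placeholders, not part of this spec:

    def enforce_read_restrictions(rows, restrictions: ReadRestrictions):
        # 1. Required row filter, evaluated against original, untransformed values;
        #    rows where the filter is false or null are discarded.
        if restrictions.required_row_filter is not None:
            rows = [r for r in rows
                    if evaluate_filter(restrictions.required_row_filter, r) is True]
        # 2. Required projections: only listed columns may be read.
        if restrictions.required_column_projections is not None:
            projected = []
            for row in rows:
                out = {}
                for proj in restrictions.required_column_projections:
                    name = column_name(proj.field_id)  # resolve field id -> column name
                    if proj.action is None:
                        out[name] = row[name]          # no action: project as-is
                    else:
                        # apply_action MUST raise if the action cannot be evaluated
                        out[name] = apply_action(proj.action, row[name])
                projected.append(out)
            rows = projected
        return rows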

Projection:
type: object
description: >
Defines a projection for a column.
If action is not specified, the column is projected as-is.
properties:
field-id:
type: integer
description: field id of the column being projected.
action:
$ref: '#/components/schemas/Action'
required:
- field-id

Action:
Contributor:
I'm confused by the introduction of special references and the removal of Term. I thought this was supposed to use Iceberg Expression so why the change?

Contributor Author:
Actions like sha256 / null are very well defined (for example, "project null for this column"). One option would have been to create transforms for these and then wrap a scalar expression referencing the transform that returns this value, but that seemed like overkill for this function. What we do now instead is: there is an action, and if it's a known mask, just execute it; the same goes for a transform. I wonder if for UDFs we would just need an apply_udf action?

Contributor:
Action is too generic in this context. Maybe name it something like Masking?

Also, action could be optional, right? If only a projection is needed (without any masking).

Contributor Author:

"Action is too generic in this context. Maybe name it something like Masking?"

It's a bit intentional to name it Action; the idea is to use these later in expressions too. Presently, Action suggests that one needs to do this, and ApplyTransform can be the action that applies existing / predefined transforms in Iceberg.

"also action could be optional, right?"

Agreed, we can project it as-is, no need to wrap it in an identity transform. I removed this and added a note on what a projection without an action means.

description: Defines the specific action to be executed for computing the projection.
oneOf:
- $ref: '#/components/schemas/MaskHashSha256'
- $ref: '#/components/schemas/ReplaceWithNull'
- $ref: '#/components/schemas/MaskAlphanumeric'
- $ref: '#/components/schemas/ApplyTransform'

MaskHashSha256:
description: |
Mask the data of the column by applying SHA-256.
The input must be UTF-8 encoded bytes of the column value.
Contributor:
Wondering if we need to say UTF-8 encoded bytes. Is it applicable to binary or number types?

Contributor Author:

"is it applicable to binary or number types?"

It would be applicable, I think, for the binary type but not for numeric types like int | long, since the requirement is that the input type should be the same as the output type.

The SHA-256 digest is represented as a lowercase hexadecimal string.
Engines must follow this procedure to ensure consistency:
1. Convert the column value to a UTF-8 byte array.
2. Apply the SHA-256 algorithm as specified in NIST FIPS 180-4.
3. Convert the resulting 32-byte digest to a 64-character lowercase hexadecimal string.
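
A minimal Python sketch of the three-step procedure above (UTF-8 encode, SHA-256 per NIST FIPS 180-4, 64-character lowercase hexadecimal digest); the helper name is illustrative, not part of the spec:

    import hashlib

    def mask_hash_sha256(value: str) -> str:
        # 1. UTF-8 encode, 2. apply SHA-256, 3. 64-char lowercase hex digest.
        return hashlib.sha256(value.encode("utf-8")).hexdigest()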

ReplaceWithNull:
Contributor:
This is the void transform...

Contributor Author:
Good callout. I am not sure why we don't specify the void transform in the REST spec; let me dig into the historical context to see if it's just a miss, and I will incorporate this accordingly.

Transform:
type: string
example:
- "identity"
- "year"
- "month"
- "day"
- "hour"
- "bucket[256]"
- "truncate[16]"

Contributor Author:
Added: #14778

description: Masks data by replacing it with a NULL value.

MaskAlphanumeric:
description: mask all alphabetic characters with 'x' and numeric characters with 'n'
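
A minimal Python sketch of this masking rule (the schema text does not say whether non-ASCII letters count as alphabetic, so this example simply uses str.isalpha / str.isdigit); the helper name is illustrative:

    def mask_alphanumeric(value: str) -> str:
        # 'x' for alphabetic characters, 'n' for digits, everything else unchanged.
        return "".join("x" if c.isalpha() else "n" if c.isdigit() else c for c in value)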

ApplyTransform:
type: object
description: Replace the field with the result of a transform expression. Produce the original field name with the transformed values.
properties:
term:
$ref: '#/components/schemas/Term'

LoadCredentialsResponse:
type: object
required:
@@ -3407,6 +3506,8 @@ components:
type: array
items:
$ref: '#/components/schemas/StorageCredential'
read-restrictions:
$ref: '#/components/schemas/ReadRestrictions'

ScanTasks:
type: object