diff --git a/.asf.yaml b/.asf.yaml
index 420c8d1741de..a99fe92001f2 100644
--- a/.asf.yaml
+++ b/.asf.yaml
@@ -42,6 +42,8 @@ github:
     - SreeramGarlapati
    - samredai
    - Fokko
+  ghp_branch: gh-pages
+  ghp_path: ~
 notifications:
   commits: commits@iceberg.apache.org
diff --git a/.github/workflows/python-ci-docs.yml b/.github/workflows/python-ci-docs.yml
new file mode 100644
index 000000000000..9ae255328d61
--- /dev/null
+++ b/.github/workflows/python-ci-docs.yml
@@ -0,0 +1,59 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+name: "Python Docs"
+on:
+  push:
+    branches:
+      - 'master'
+    paths:
+      - 'python/mkdocs/**'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+
+jobs:
+  docs:
+    runs-on: ubuntu-20.04
+
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          # no build matrix is defined in this workflow, so pin a single interpreter
+          python-version: 3.9
+      - name: Install
+        working-directory: ./python/mkdocs
+        run: pip install -r requirements.txt
+      - name: Build
+        working-directory: ./python/mkdocs
+        run: mkdocs build
+      - name: Copy
+        working-directory: ./python/mkdocs
+        run: mv ./site /tmp/site
+      - name: Push changes to gh-pages branch
+        run: |
+          git checkout --orphan gh-pages-tmp
+          git rm --quiet -rf .
+          cp -r /tmp/site/* .
+          git config --global user.name 'GitHub Actions'
+          git config --global user.email 'actions@github.com'
+          git add --all
+          git commit -m 'Publish Python docs'
+          git push -f origin gh-pages-tmp:gh-pages || true
diff --git a/docs/python-api-intro.md b/docs/python-api-intro.md
deleted file mode 100644
index 80c1039d0cef..000000000000
--- a/docs/python-api-intro.md
+++ /dev/null
@@ -1,184 +0,0 @@
----
-title: "Python API"
-url: python-api-intro
-aliases:
-    - "python/api-intro"
-menu:
-    main:
-        parent: "API"
-        weight: 500
----
-
-
-# Iceberg Python API
-
-Much of the python api conforms to the java api. You can get more info about the java api [here](../api).
-
-## Catalog
-
-The Catalog interface, like java provides search and management operations for tables.
- -To create a catalog: - -``` python -from iceberg.hive import HiveTables - -# instantiate Hive Tables -conf = {"hive.metastore.uris": 'thrift://{hms_host}:{hms_port}', - "hive.metastore.warehouse.dir": {tmpdir} } -tables = HiveTables(conf) -``` - -and to create a table from a catalog: - -``` python -from iceberg.api.schema import Schema\ -from iceberg.api.types import TimestampType, DoubleType, StringType, NestedField -from iceberg.api.partition_spec import PartitionSpecBuilder - -schema = Schema(NestedField.optional(1, "DateTime", TimestampType.with_timezone()), - NestedField.optional(2, "Bid", DoubleType.get()), - NestedField.optional(3, "Ask", DoubleType.get()), - NestedField.optional(4, "symbol", StringType.get())) -partition_spec = PartitionSpecBuilder(schema).add(1, 1000, "DateTime_day", "day").build() - -tables.create(schema, "test.test_123", partition_spec) -``` - - -## Tables - -The Table interface provides access to table metadata - -+ schema returns the current table `Schema` -+ spec returns the current table `PartitonSpec` -+ properties returns a map of key-value `TableProperties` -+ currentSnapshot returns the current table `Snapshot` -+ snapshots returns all valid snapshots for the table -+ snapshot(id) returns a specific snapshot by ID -+ location returns the table’s base location - -Tables also provide refresh to update the table to the latest version. - -### Scanning -Iceberg table scans start by creating a `TableScan` object with `newScan`. - -``` python -scan = table.new_scan(); -``` - -To configure a scan, call filter and select on the `TableScan` to get a new `TableScan` with those changes. - -``` python -filtered_scan = scan.filter(Expressions.equal("id", 5)) -``` - -String expressions can also be passed to the filter method. - -``` python -filtered_scan = scan.filter("id=5") -``` - -`Schema` projections can be applied against a `TableScan` by passing a list of column names. - -``` python -filtered_scan = scan.select(["col_1", "col_2", "col_3"]) -``` - -Because some data types cannot be read using the python library, a convenience method for excluding columns from projection is provided. - -``` python -filtered_scan = scan.select_except(["unsupported_col_1", "unsupported_col_2"]) -``` - - -Calls to configuration methods create a new `TableScan` so that each `TableScan` is immutable. - -When a scan is configured, `planFiles`, `planTasks`, and `Schema` are used to return files, tasks, and the read projection. - -``` python -scan = table.new_scan() \ - .filter("id=5") \ - .select(["id", "data"]) - -projection = scan.schema -for task in scan.plan_tasks(): - print(task) -``` - -## Types - -Iceberg data types are located in `iceberg.api.types.types` - -### Primitives - -Primitive type instances are available from static methods in each type class. Types without parameters use `get`, and types like `DecimalType` use factory methods: - -```python -IntegerType.get() # int -DoubleType.get() # double -DecimalType.of(9, 2) # decimal(9, 2) -``` - -### Nested types -Structs, maps, and lists are created using factory methods in type classes. - -Like struct fields, map keys or values and list elements are tracked as nested fields. Nested fields track [field IDs](https://iceberg.apache.org/evolution/#correctness) and nullability. - -Struct fields are created using `NestedField.optional` or `NestedField.required`. Map value and list element nullability is set in the map and list factory methods. 
- -```python -# struct<1 id: int, 2 data: optional string> -struct = StructType.of([NestedField.required(1, "id", IntegerType.get()), - NestedField.optional(2, "data", StringType.get()]) - ) -``` -```python -# map<1 key: int, 2 value: optional string> -map_var = MapType.of_optional(1, IntegerType.get(), - 2, StringType.get()) -``` -```python -# array<1 element: int> -list_var = ListType.of_required(1, IntegerType.get()); -``` - -## Expressions -Iceberg’s `Expressions` are used to configure table scans. To create `Expressions`, use the factory methods in `Expressions`. - -Supported `Predicate` expressions are: - -+ `is_null` -+ `not_null` -+ `equal` -+ `not_equal` -+ `less_than` -+ `less_than_or_equal` -+ `greater_than` -+ `greater_than_or_equal` - -Supported expression `Operations`are: - -+ `and` -+ `or` -+ `not` - -Constant expressions are: - -+ `always_true` -+ `always_false` diff --git a/docs/python-feature-support.md b/docs/python-feature-support.md deleted file mode 100644 index 4488e4763bac..000000000000 --- a/docs/python-feature-support.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -title: "Python Feature Support" -url: python-feature-support -aliases: - - "python/feature-support" -menu: - main: - parent: "API" - weight: 600 ---- - - -# Feature Support - -The goal is that the python library will provide a functional, performant subset of the java library. The initial focus has been on reading table metadata as well as providing the capability to both plan and execute a scan. - -## Feature Comparison - -### Metadata - -| Operation | Java | Python | -|:------------------------|:-----:|:------:| -| Get Schema | X | X | -| Get Snapshots | X | X | -| Plan Scan | X | X | -| Plan Scan for Snapshot | X | X | -| Update Current Snapshot | X | | -| Set Table Properties | X | | -| Create Table | X | X | -| Drop Table | X | X | -| Alter Table | X | | - - -### Read Support - -Pyarrow is used for reading parquet files, so read support is limited to what is currently supported in the pyarrow.parquet package. - -#### Primitive Types - - -| Data Type | Java | Python | -|:------------------------|:----:|:------:| -| BooleanType | X | X | -| DateType | X | X | -| DecimalType | X | X | -| FloatType | X | X | -| IntegerType | X | X | -| LongType | X | X | -| TimeType | X | X | -| TimestampType | X | X | - -#### Nested Types - -| Data Type | Java | Python | -|:------------------------|:----:|:------:| -| ListType of primitives | X | X | -| MapType of primitives | X | X | -| StructType of primitives| X | X | -| ListType of Nested Types| X | | -| MapType of Nested Types | X | | - -### Write Support - -The python client does not currently support write capability diff --git a/docs/python-quickstart.md b/docs/python-quickstart.md deleted file mode 100644 index 03c3acbe81a5..000000000000 --- a/docs/python-quickstart.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "Python Quickstart" -url: python-quickstart -aliases: - - "python/quickstart" -menu: - main: - parent: "API" - weight: 400 ---- - - - -# Python API Quickstart - -## Installation - -Iceberg python is currently in development, for development and testing purposes the best way to install the library is to perform the following steps: -``` -git clone https://github.com/apache/iceberg.git -cd iceberg/python -pip install -e . -``` - -## Testing -Testing is done using tox. The config can be found in `tox.ini` within the python directory of the iceberg project. 
- -``` -# simply run tox from within the python dir -tox -``` - -# Examples - -## Inspect Table Metadata -``` python - -from iceberg.hive import HiveTables - -# instantiate Hive Tables -conf = {"hive.metastore.uris": 'thrift://{hms_host}:{hms_port}'} -tables = HiveTables(conf) - -# load table -tbl = tables.load("iceberg_db.iceberg_test_table") - -# inspect metadata -print(tbl.schema()) -print(tbl.spec()) -print(tbl.location()) - -# get table level record count -from pprint import pprint -pprint(int(tbl.current_snapshot().summary.get("total-records"))) -``` diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml index cce5d2ba0bf2..54c644b32cf0 100644 --- a/python/.pre-commit-config.yaml +++ b/python/.pre-commit-config.yaml @@ -63,3 +63,11 @@ repos: - id: flake8 args: [ "--ignore=E501,W503,E203" ] additional_dependencies: [ flake8-bugbear==22.7.1, flake8-comprehensions==3.10.0 ] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.16 + hooks: + - id: mdformat + additional_dependencies: + - mdformat-black + - mdformat-config + - mdformat-beautysh diff --git a/python/CONTRIBUTING.md b/python/CONTRIBUTING.md index 91ae8215b4cd..1bde5ce7db0d 100644 --- a/python/CONTRIBUTING.md +++ b/python/CONTRIBUTING.md @@ -26,6 +26,7 @@ pip install poetry ``` If you have an older version of pip and virtualenv you need to update these: + ```bash pip install --upgrade virtualenv pip ``` @@ -81,11 +82,13 @@ make test-s3 To pass additional arguments to pytest, you can use `PYTEST_ARGS`. *Run pytest in verbose mode* + ```sh make test PYTEST_ARGS="-v" ``` *Run pytest with pdb enabled* + ```sh make test PYTEST_ARGS="--pdb" ``` diff --git a/python/README.md b/python/README.md index 292fb7b9eeac..6dfefbce6a01 100644 --- a/python/README.md +++ b/python/README.md @@ -17,12 +17,11 @@ # Iceberg Python -py-iceberg is a python library for programmatic access to iceberg table metadata as well as to table data in iceberg format. -It is an implementation of [iceberg table spec](https://iceberg.apache.org/spec/) in Python. +pyiceberg is a python library for programmatic access to iceberg table metadata as well as to table data in iceberg format. It is a Python implementation of [iceberg table spec](https://iceberg.apache.org/spec/). Documentation is available at [https://pyiceberg.apache.org/](https://pyiceberg.apache.org/). ## Getting Started -py-iceberg is currently in development, for development and testing purposes the best way to install the library is to perform the following steps: +pyiceberg is currently in development, for development and testing purposes the best way to install the library is to perform the following steps: ``` git clone https://github.com/apache/iceberg.git @@ -30,11 +29,9 @@ cd iceberg/python pip install -e . ``` -Development is made easy using [Poetry](https://python-poetry.org/docs/#installation). - ## Development -Poetry provides virtual environments for development: +Development is made easy using [Poetry](https://python-poetry.org/docs/#installation). Poetry provides virtual environments for development: ```bash poetry shell @@ -54,4 +51,5 @@ poetry run pytest ``` ## Get in Touch + - [Iceberg community](https://iceberg.apache.org/community/) diff --git a/python/mkdocs/README.md b/python/mkdocs/README.md new file mode 100644 index 000000000000..e9e0462bee5a --- /dev/null +++ b/python/mkdocs/README.md @@ -0,0 +1,28 @@ + + +# Docs + +The pyiceberg docs are stored in `docs/`. 
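+For orientation, the files added under `python/mkdocs/` in this change are laid out as follows (`mkdocs.yml` configures the site, `docs/` holds the pages):
+
+```
+mkdocs/
+├── README.md
+├── mkdocs.yml
+├── requirements.txt
+└── docs/
+    └── index.md
+```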
+
+## Running docs locally
+
+```sh
+pip3 install -r requirements.txt
+mkdocs serve
+open http://localhost:8000/
+```
diff --git a/python/mkdocs/docs/index.md b/python/mkdocs/docs/index.md
new file mode 100644
index 000000000000..35351ef0c86f
--- /dev/null
+++ b/python/mkdocs/docs/index.md
@@ -0,0 +1,548 @@
+
+
+# PyIceberg
+
+Much of the Python API conforms to the Java API. You can get more info about the Java API [here](https://iceberg.apache.org/docs/latest/java-api-quickstart/).
+
+## Installing
+
+You can install the latest release version from PyPI:
+
+```sh
+pip3 install "pyiceberg[s3fs,hive]"
+```
+
+Or install the latest development version locally:
+
+```sh
+git clone https://github.com/apache/iceberg.git
+cd iceberg/python
+pip3 install -e ".[s3fs,hive]"
+```
+
+You can mix and match optional dependencies:
+
+| Key     | Description                                                           |
+|---------|-----------------------------------------------------------------------|
+| hive    | Support for the Hive metastore                                        |
+| pyarrow | PyArrow as a FileIO implementation to interact with the object store |
+| s3fs    | S3FS as a FileIO implementation to interact with the object store    |
+| snappy  | Support for snappy Avro compression                                  |
+
+# Python CLI Quickstart
+
+PyIceberg comes with a CLI that's available after installing the `pyiceberg` package.
+
+```sh
+➜ pyiceberg --help
+Usage: pyiceberg [OPTIONS] COMMAND [ARGS]...
+
+Options:
+--catalog TEXT
+--verbose BOOLEAN
+--output [text|json]
+--uri TEXT
+--credential TEXT
+--help Show this message and exit.
+
+Commands:
+describe Describes a namespace xor table
+drop Operations to drop a namespace or table
+list Lists tables or namespaces
+location Returns the location of the table
+properties Properties on tables/namespaces
+rename Renames a table
+schema Gets the schema of the table
+spec Returns the partition spec of the table
+uuid Returns the UUID of the table
+```
+
+# Configuration
+
+There are three ways of setting the configuration.
+
+For the CLI you can pass the connection details using `--uri` and `--credential`, and it will automatically detect the type based on the scheme (`http(s)` for REST, `thrift` for Hive).
+
+Secondly, YAML-based configuration is supported in `~/.pyiceberg.yaml`:
+
+```yaml
+catalog:
+  default:
+    uri: thrift://localhost:9083
+    s3.endpoint: http://localhost:9000
+    s3.access-key-id: admin
+    s3.secret-access-key: password
+
+  rest:
+    uri: http://rest-catalog/ws/
+    credential: t-1234:secret
+```
+
+Lastly, you can also set it using environment variables:
+
+```sh
+export PYICEBERG_CATALOG__DEFAULT__URI=thrift://localhost:9083
+
+export PYICEBERG_CATALOG__REST__URI=http://rest-catalog/ws/
+export PYICEBERG_CATALOG__REST__CREDENTIAL=t-1234:secret
+```
+
+The structure is equivalent to the YAML; the levels are separated using a double underscore (`__`).
+
+## FileIO configuration
+
+For the FileIO there are several configuration options available:
+
+| Key                  | Example             | Description                                                                                                                                                                                                                                               |
+|----------------------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| s3.endpoint          | https://10.0.19.25/ | Configure an alternative endpoint of the S3 service for the FileIO to access. This could be used to use S3FileIO with any s3-compatible object storage service that has a different endpoint, or access a private S3 endpoint in a virtual private cloud. |
+| s3.access-key-id     | admin               | Configure the static access key id used to access the FileIO.                                                                                                                                                                                             |
+| s3.secret-access-key | password            | Configure the static secret access key used to access the FileIO.                                                                                                                                                                                        |
+| s3.signer            | bearer              | Configure the signature version of the FileIO.                                                                                                                                                                                                           |
+
+# CLI Quickstart
+
+This example assumes that you have a default catalog set. If you want to load another catalog, for example the `rest` catalog configured above, pass `--catalog rest`.
+
+```sh
+➜ pyiceberg list
+default
+nyc
+```
+
+```sh
+➜ pyiceberg list nyc
+nyc.taxis
+```
+
+```sh
+pyiceberg describe nyc.taxis
+Table format version 1
+Metadata location file:/.../nyc.db/taxis/metadata/00000-aa3a3eac-ea08-4255-b890-383a64a94e42.metadata.json
+Table UUID 6cdfda33-bfa3-48a7-a09e-7abb462e3460
+Last Updated 1661783158061
+Partition spec []
+Sort order []
+Current schema Schema, id=0
+├── 1: VendorID: optional long
+├── 2: tpep_pickup_datetime: optional timestamptz
+├── 3: tpep_dropoff_datetime: optional timestamptz
+├── 4: passenger_count: optional double
+├── 5: trip_distance: optional double
+├── 6: RatecodeID: optional double
+├── 7: store_and_fwd_flag: optional string
+├── 8: PULocationID: optional long
+├── 9: DOLocationID: optional long
+├── 10: payment_type: optional long
+├── 11: fare_amount: optional double
+├── 12: extra: optional double
+├── 13: mta_tax: optional double
+├── 14: tip_amount: optional double
+├── 15: tolls_amount: optional double
+├── 16: improvement_surcharge: optional double
+├── 17: total_amount: optional double
+├── 18: congestion_surcharge: optional double
+└── 19: airport_fee: optional double
+Current snapshot Operation.APPEND: id=5937117119577207079, schema_id=0
+Snapshots Snapshots
+└── Snapshot 5937117119577207079, schema 0: file:/.../nyc.db/taxis/metadata/snap-5937117119577207079-1-94656c4f-4c66-4600-a4ca-f30377300527.avro
+Properties owner root
+write.format.default parquet
+```
+
+Or output in JSON for automation:
+
+```sh
+pyiceberg --output json describe nyc.taxis | jq
+{
+  "identifier": [
+    "nyc",
+    "taxis"
+  ],
+  "metadata_location": "file:/.../nyc.db/taxis/metadata/00000-aa3a3eac-ea08-4255-b890-383a64a94e42.metadata.json",
+  "metadata": {
+    "location": "file:/.../nyc.db/taxis",
+    "table-uuid": "6cdfda33-bfa3-48a7-a09e-7abb462e3460",
+    "last-updated-ms": 1661783158061,
+    "last-column-id": 19,
+    "schemas": [
+      {
+        "type": "struct",
+        "fields": [
+          {
+            "id": 1,
+            "name": "VendorID",
+            "type": "long",
+            "required": false
+          },
+...
+ { + "id": 19, + "name": "airport_fee", + "type": "double", + "required": false + } + ], + "schema-id": 0, + "identifier-field-ids": [] + } + ], + "current-schema-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [] + } + ], + "default-spec-id": 0, + "last-partition-id": 999, + "properties": { + "owner": "root", + "write.format.default": "parquet" + }, + "current-snapshot-id": 5937117119577207000, + "snapshots": [ + { + "snapshot-id": 5937117119577207000, + "timestamp-ms": 1661783158061, + "manifest-list": "file:/.../nyc.db/taxis/metadata/snap-5937117119577207079-1-94656c4f-4c66-4600-a4ca-f30377300527.avro", + "summary": { + "operation": "append", + "spark.app.id": "local-1661783139151", + "added-data-files": "1", + "added-records": "2979431", + "added-files-size": "46600777", + "changed-partition-count": "1", + "total-records": "2979431", + "total-files-size": "46600777", + "total-data-files": "1", + "total-delete-files": "0", + "total-position-deletes": "0", + "total-equality-deletes": "0" + }, + "schema-id": 0 + } + ], + "snapshot-log": [ + { + "snapshot-id": "5937117119577207079", + "timestamp-ms": 1661783158061 + } + ], + "metadata-log": [], + "sort-orders": [ + { + "order-id": 0, + "fields": [] + } + ], + "default-sort-order-id": 0, + "refs": { + "main": { + "snapshot-id": 5937117119577207000, + "type": "branch" + } + }, + "format-version": 1, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "VendorID", + "type": "long", + "required": false + }, +... + { + "id": 19, + "name": "airport_fee", + "type": "double", + "required": false + } + ], + "schema-id": 0, + "identifier-field-ids": [] + }, + "partition-spec": [] + } +} +``` + +# Python API + +To instantiate a catalog: + +```python +from pyiceberg.catalog import load_catalog + +catalog = load_catalog("prod") + +catalog.list_namespaces() +``` + +Returns: + +``` +[('default',), ('nyc',)] +``` + +Listing the tables in the `nyc` namespace: + +```python +catalog.list_tables("nyc") +``` + +Returns: + +``` +[('nyc', 'taxis')] +``` + +Loading the `taxis` table: + +```python +catalog.load_table(("nyc", "taxis")) +``` + +``` +Table( + identifier=('nyc', 'taxis'), + metadata_location='s3a://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json', + metadata=TableMetadataV2( + location='s3a://warehouse/wh/nyc.db/taxis', + table_uuid=UUID('ebd5d172-2162-453d-b586-1cdce52c1116'), + last_updated_ms=1662633437826, + last_column_id=19, + schemas=[Schema( + NestedField(field_id=1, name='VendorID', field_type=LongType(), required=False), + NestedField(field_id=2, name='tpep_pickup_datetime', field_type=TimestamptzType(), required=False), + NestedField(field_id=3, name='tpep_dropoff_datetime', field_type=TimestamptzType(), required=False), + NestedField(field_id=4, name='passenger_count', field_type=DoubleType(), required=False), + NestedField(field_id=5, name='trip_distance', field_type=DoubleType(), required=False), + NestedField(field_id=6, name='RatecodeID', field_type=DoubleType(), required=False), + NestedField(field_id=7, name='store_and_fwd_flag', field_type=StringType(), required=False), + NestedField(field_id=8, name='PULocationID', field_type=LongType(), required=False), + NestedField(field_id=9, name='DOLocationID', field_type=LongType(), required=False), + NestedField(field_id=10, name='payment_type', field_type=LongType(), required=False), + NestedField(field_id=11, name='fare_amount', field_type=DoubleType(), required=False), + NestedField(field_id=12, name='extra', 
field_type=DoubleType(), required=False), + NestedField(field_id=13, name='mta_tax', field_type=DoubleType(), required=False), + NestedField(field_id=14, name='tip_amount', field_type=DoubleType(), required=False), + NestedField(field_id=15, name='tolls_amount', field_type=DoubleType(), required=False), + NestedField(field_id=16, name='improvement_surcharge', field_type=DoubleType(), required=False), + NestedField(field_id=17, name='total_amount', field_type=DoubleType(), required=False), + NestedField(field_id=18, name='congestion_surcharge', field_type=DoubleType(), required=False), + NestedField(field_id=19, name='airport_fee', field_type=DoubleType(), required=False) + ), + schema_id=0, + identifier_field_ids=[] + )], + current_schema_id=0, + partition_specs=[PartitionSpec(spec_id=0)], + default_spec_id=0, + last_partition_id=999, + properties={ + 'owner': 'root', + 'write.format.default': 'parquet' + }, + current_snapshot_id=8334458494559715805, + snapshots=[ + Snapshot( + snapshot_id=7910949481055846233, + parent_snapshot_id=None, + sequence_number=None, + timestamp_ms=1662489306555, + manifest_list='s3a://warehouse/wh/nyc.db/taxis/metadata/snap-7910949481055846233-1-3eb7a2e1-5b7a-4e76-a29a-3e29c176eea4.avro', + summary=Summary( + Operation.APPEND, + **{ + 'spark.app.id': 'local-1662489289173', + 'added-data-files': '1', + 'added-records': '2979431', + 'added-files-size': '46600777', + 'changed-partition-count': '1', + 'total-records': '2979431', + 'total-files-size': '46600777', + 'total-data-files': '1', + 'total-delete-files': '0', + 'total-position-deletes': '0', + 'total-equality-deletes': '0' + } + ), + schema_id=0 + ), + ], + snapshot_log=[ + SnapshotLogEntry( + snapshot_id='7910949481055846233', + timestamp_ms=1662489306555 + ) + ], + metadata_log=[ + MetadataLogEntry( + metadata_file='s3a://warehouse/wh/nyc.db/taxis/metadata/00000-b58341ba-6a63-4eea-9b2f-e85e47c7d09f.metadata.json', + timestamp_ms=1662489306555 + ) + ], + sort_orders=[SortOrder(order_id=0)], + default_sort_order_id=0, + refs={ + 'main': SnapshotRef( + snapshot_id=8334458494559715805, + snapshot_ref_type=SnapshotRefType.BRANCH, + min_snapshots_to_keep=None, + max_snapshot_age_ms=None, + max_ref_age_ms=None + ) + }, + format_version=2, + last_sequence_number=1 + ) +) +``` + +And to create a table from a catalog: + +```python +from pyiceberg.schema import Schema +from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField + +schema = Schema( + NestedField( + field_id=1, name="datetime", field_type=TimestampType(), required=False + ), + NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False), + NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False), + NestedField(field_id=4, name="symbol", field_type=StringType(), required=False), +) + +from pyiceberg.table.partitioning import PartitionSpec, PartitionField +from pyiceberg.transforms import DayTransform + +partition_spec = PartitionSpec( + PartitionField( + source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day" + ) +) + +from pyiceberg.table.sorting import SortOrder, SortField +from pyiceberg.transforms import IdentityTransform + +sort_order = SortOrder(SortField(source_id=4, transform=IdentityTransform())) + +from pyiceberg.catalog.hive import HiveCatalog + +catalog = HiveCatalog(name="prod", uri="thrift://localhost:9083/") + +catalog.create_table( + identifier="default.bids", + location="/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/bids/", + schema=schema, + 
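+    # attach the day-partitioning on "datetime" and the identity sort
+    # on "symbol" that were defined above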
+    partition_spec=partition_spec,
+    sort_order=sort_order,
+)
+```
+
+Which returns a newly created table:
+
+```
+Table(
+    identifier=('default', 'bids'),
+    metadata_location='/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/bids//metadata/00000-c8cd93ab-f784-474d-a167-b1a86b05195f.metadata.json',
+    metadata=TableMetadataV2(
+        location='/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/bids/',
+        table_uuid=UUID('38d4cb39-4945-4bf2-b374-984b5c4984d2'),
+        last_updated_ms=1661847562069,
+        last_column_id=4,
+        schemas=[
+            Schema(
+                NestedField(field_id=1, name='datetime', field_type=TimestampType(), required=False),
+                NestedField(field_id=2, name='bid', field_type=DoubleType(), required=False),
+                NestedField(field_id=3, name='ask', field_type=DoubleType(), required=False),
+                NestedField(field_id=4, name='symbol', field_type=StringType(), required=False)),
+                schema_id=1,
+                identifier_field_ids=[])
+        ],
+        current_schema_id=1,
+        partition_specs=[
+            PartitionSpec(
+                PartitionField(source_id=1, field_id=1000, transform=DayTransform(), name='datetime_day'),))
+        ],
+        default_spec_id=0,
+        last_partition_id=1000,
+        properties={},
+        current_snapshot_id=None,
+        snapshots=[],
+        snapshot_log=[],
+        metadata_log=[],
+        sort_orders=[
+            SortOrder(order_id=1, fields=[SortField(source_id=4, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST)])
+        ],
+        default_sort_order_id=1,
+        refs={},
+        format_version=2,
+        last_sequence_number=0
+    )
+)
+```
+
+# Feature Support
+
+The goal is that the Python library will provide a functional, performant subset of the Java library. The initial focus has been on reading table metadata and providing a convenient CLI to go through the catalog.
+
+## Metadata
+
+| Operation               | Java | Python |
+|:------------------------|:----:|:------:|
+| Get Schema              |  X   |   X    |
+| Get Snapshots           |  X   |   X    |
+| Plan Scan               |  X   |        |
+| Plan Scan for Snapshot  |  X   |        |
+| Update Current Snapshot |  X   |        |
+| Set Table Properties    |  X   |   X    |
+| Create Table            |  X   |   X    |
+| Drop Table              |  X   |   X    |
+| Alter Table             |  X   |        |
+
+## Types
+
+The types are kept in `pyiceberg.types`.
+
+Primitive types:
+
+- `BooleanType`
+- `StringType`
+- `IntegerType`
+- `LongType`
+- `FloatType`
+- `DoubleType`
+- `DateType`
+- `TimeType`
+- `TimestampType`
+- `TimestamptzType`
+- `BinaryType`
+- `UUIDType`
+
+Complex types:
+
+- `StructType`
+- `ListType`
+- `MapType`
+- `FixedType(16)`
+- `DecimalType(8, 3)`
diff --git a/python/mkdocs/mkdocs.yml b/python/mkdocs/mkdocs.yml
new file mode 100644
index 000000000000..c84a2de465ee
--- /dev/null
+++ b/python/mkdocs/mkdocs.yml
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
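+# Minimal configuration: MkDocs picks the pages up from the default `docs/`
+# directory next to this file, so only the site name needs to be set.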
+--- +site_name: PyIceberg diff --git a/python/mkdocs/requirements.txt b/python/mkdocs/requirements.txt new file mode 100644 index 000000000000..642a688ebc5a --- /dev/null +++ b/python/mkdocs/requirements.txt @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +mkdocs==1.3.1 +jinja2==3.0.3