From c9254d7cd27ee32c2bcf305845733ece02e37f74 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 7 Sep 2022 20:38:10 +0200 Subject: [PATCH 1/2] Python: Split Python docs This PR will split the Python docs into a separate site. The main reason for this is that the docs are part of the Java release, which is not in sync with the Python release cycle, meaning that there is a high probability that the docs do not match the current version of the code. This will publish the docs to GitHub Pages, by pushing this to the `gh-pages` branch. We can set up an alias from Apache, and point pyiceberg.apache.org to the GitHub Pages endpoint. I also tried readthedocs, but I found that not straightforward. Mostly because they have a build process on their end that will pull the code, and build the docs. This involves another pipeline that we have to monitor, and we have to set up webhooks. I am a simple man, and I like simple things, therefore I went for mkdocs. This can push the docs to GitHub Pages in a single command: https://www.mkdocs.org/user-guide/deploying-your-docs/#project-pages Considerations: - Decided to keep it to a single page for now, we can break it out into different pages later on. Let me know what you think of this. - We build the docs now when we push to master, probably we'll change this later to trigger on tags. - I've removed the Python docs from the other docs to avoid confusion and make sure that we have a single source of truth.
An example is shown here: https://fokko.github.io/incubator-iceberg/ (Once this is merged, I'll remove that one) Closes #363 Closes #3283 --- .asf.yaml | 2 + .github/workflows/python-ci-docs.yml | 60 +++ docs/python-api-intro.md | 184 --------- docs/python-feature-support.md | 79 ---- docs/python-quickstart.md | 70 ---- python/.pre-commit-config.yaml | 8 + python/CONTRIBUTING.md | 3 + python/README.md | 10 +- python/mkdocs/README.md | 28 ++ python/mkdocs/docs/index.md | 548 +++++++++++++++++++++++++++ python/mkdocs/mkdocs.yml | 18 + python/mkdocs/requirements.txt | 19 + 12 files changed, 690 insertions(+), 339 deletions(-) create mode 100644 .github/workflows/python-ci-docs.yml delete mode 100644 docs/python-api-intro.md delete mode 100644 docs/python-feature-support.md delete mode 100644 docs/python-quickstart.md create mode 100644 python/mkdocs/README.md create mode 100644 python/mkdocs/docs/index.md create mode 100644 python/mkdocs/mkdocs.yml create mode 100644 python/mkdocs/requirements.txt diff --git a/.asf.yaml b/.asf.yaml index 420c8d1741de..a99fe92001f2 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -42,6 +42,8 @@ github: - SreeramGarlapati - samredai - Fokko + ghp_branch: gh-pages + ghp_path: ~ notifications: commits: commits@iceberg.apache.org diff --git a/.github/workflows/python-ci-docs.yml b/.github/workflows/python-ci-docs.yml new file mode 100644 index 000000000000..3f579b5997b3 --- /dev/null +++ b/.github/workflows/python-ci-docs.yml @@ -0,0 +1,60 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Python Docs" +on: + push: + branches: + - 'master' + pull_request: + paths: + - '.github/workflows/python-ci-docs.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + docs: + runs-on: ubuntu-20.04 + + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.9' + - name: Install + working-directory: ./python/mkdocs + run: pip install -r requirements.txt + - name: Build + working-directory: ./python/mkdocs + run: mkdocs build + - name: Copy + working-directory: ./python/mkdocs + run: mv ./site /tmp/site + - name: Push changes to gh-pages branch + run: | + git checkout --orphan gh-pages-tmp + git rm --quiet -rf . + cp -r /tmp/site/* . + git config --global user.name 'GitHub Actions' + git config --global user.email 'actions@github.com' + git add --all + git commit -m 'Publish Python docs' + git push -f origin gh-pages-tmp:gh-pages || true diff --git a/docs/python-api-intro.md b/docs/python-api-intro.md deleted file mode 100644 index 80c1039d0cef..000000000000 --- a/docs/python-api-intro.md +++ /dev/null @@ -1,184 +0,0 @@ ---- -title: "Python API" -url: python-api-intro -aliases: - - "python/api-intro" -menu: - main: - parent: "API" - weight: 500 ---- - - -# Iceberg Python API - -Much of the python api conforms to the java api. You can get more info about the java api [here](../api). - -## Catalog - -The Catalog interface, like java provides search and management operations for tables.
- -To create a catalog: - -``` python -from iceberg.hive import HiveTables - -# instantiate Hive Tables -conf = {"hive.metastore.uris": 'thrift://{hms_host}:{hms_port}', - "hive.metastore.warehouse.dir": {tmpdir} } -tables = HiveTables(conf) -``` - -and to create a table from a catalog: - -``` python -from iceberg.api.schema import Schema\ -from iceberg.api.types import TimestampType, DoubleType, StringType, NestedField -from iceberg.api.partition_spec import PartitionSpecBuilder - -schema = Schema(NestedField.optional(1, "DateTime", TimestampType.with_timezone()), - NestedField.optional(2, "Bid", DoubleType.get()), - NestedField.optional(3, "Ask", DoubleType.get()), - NestedField.optional(4, "symbol", StringType.get())) -partition_spec = PartitionSpecBuilder(schema).add(1, 1000, "DateTime_day", "day").build() - -tables.create(schema, "test.test_123", partition_spec) -``` - - -## Tables - -The Table interface provides access to table metadata - -+ schema returns the current table `Schema` -+ spec returns the current table `PartitonSpec` -+ properties returns a map of key-value `TableProperties` -+ currentSnapshot returns the current table `Snapshot` -+ snapshots returns all valid snapshots for the table -+ snapshot(id) returns a specific snapshot by ID -+ location returns the table’s base location - -Tables also provide refresh to update the table to the latest version. - -### Scanning -Iceberg table scans start by creating a `TableScan` object with `newScan`. - -``` python -scan = table.new_scan(); -``` - -To configure a scan, call filter and select on the `TableScan` to get a new `TableScan` with those changes. - -``` python -filtered_scan = scan.filter(Expressions.equal("id", 5)) -``` - -String expressions can also be passed to the filter method. - -``` python -filtered_scan = scan.filter("id=5") -``` - -`Schema` projections can be applied against a `TableScan` by passing a list of column names. 
- -``` python -filtered_scan = scan.select(["col_1", "col_2", "col_3"]) -``` - -Because some data types cannot be read using the python library, a convenience method for excluding columns from projection is provided. - -``` python -filtered_scan = scan.select_except(["unsupported_col_1", "unsupported_col_2"]) -``` - - -Calls to configuration methods create a new `TableScan` so that each `TableScan` is immutable. - -When a scan is configured, `planFiles`, `planTasks`, and `Schema` are used to return files, tasks, and the read projection. - -``` python -scan = table.new_scan() \ - .filter("id=5") \ - .select(["id", "data"]) - -projection = scan.schema -for task in scan.plan_tasks(): - print(task) -``` - -## Types - -Iceberg data types are located in `iceberg.api.types.types` - -### Primitives - -Primitive type instances are available from static methods in each type class. Types without parameters use `get`, and types like `DecimalType` use factory methods: - -```python -IntegerType.get() # int -DoubleType.get() # double -DecimalType.of(9, 2) # decimal(9, 2) -``` - -### Nested types -Structs, maps, and lists are created using factory methods in type classes. - -Like struct fields, map keys or values and list elements are tracked as nested fields. Nested fields track [field IDs](https://iceberg.apache.org/evolution/#correctness) and nullability. - -Struct fields are created using `NestedField.optional` or `NestedField.required`. Map value and list element nullability is set in the map and list factory methods. 
- -```python -# struct<1 id: int, 2 data: optional string> -struct = StructType.of([NestedField.required(1, "id", IntegerType.get()), - NestedField.optional(2, "data", StringType.get()]) - ) -``` -```python -# map<1 key: int, 2 value: optional string> -map_var = MapType.of_optional(1, IntegerType.get(), - 2, StringType.get()) -``` -```python -# array<1 element: int> -list_var = ListType.of_required(1, IntegerType.get()); -``` - -## Expressions -Iceberg’s `Expressions` are used to configure table scans. To create `Expressions`, use the factory methods in `Expressions`. - -Supported `Predicate` expressions are: - -+ `is_null` -+ `not_null` -+ `equal` -+ `not_equal` -+ `less_than` -+ `less_than_or_equal` -+ `greater_than` -+ `greater_than_or_equal` - -Supported expression `Operations`are: - -+ `and` -+ `or` -+ `not` - -Constant expressions are: - -+ `always_true` -+ `always_false` diff --git a/docs/python-feature-support.md b/docs/python-feature-support.md deleted file mode 100644 index 4488e4763bac..000000000000 --- a/docs/python-feature-support.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -title: "Python Feature Support" -url: python-feature-support -aliases: - - "python/feature-support" -menu: - main: - parent: "API" - weight: 600 ---- - - -# Feature Support - -The goal is that the python library will provide a functional, performant subset of the java library. The initial focus has been on reading table metadata as well as providing the capability to both plan and execute a scan. 
- -## Feature Comparison - -### Metadata - -| Operation | Java | Python | -|:------------------------|:-----:|:------:| -| Get Schema | X | X | -| Get Snapshots | X | X | -| Plan Scan | X | X | -| Plan Scan for Snapshot | X | X | -| Update Current Snapshot | X | | -| Set Table Properties | X | | -| Create Table | X | X | -| Drop Table | X | X | -| Alter Table | X | | - - -### Read Support - -Pyarrow is used for reading parquet files, so read support is limited to what is currently supported in the pyarrow.parquet package. - -#### Primitive Types - - -| Data Type | Java | Python | -|:------------------------|:----:|:------:| -| BooleanType | X | X | -| DateType | X | X | -| DecimalType | X | X | -| FloatType | X | X | -| IntegerType | X | X | -| LongType | X | X | -| TimeType | X | X | -| TimestampType | X | X | - -#### Nested Types - -| Data Type | Java | Python | -|:------------------------|:----:|:------:| -| ListType of primitives | X | X | -| MapType of primitives | X | X | -| StructType of primitives| X | X | -| ListType of Nested Types| X | | -| MapType of Nested Types | X | | - -### Write Support - -The python client does not currently support write capability diff --git a/docs/python-quickstart.md b/docs/python-quickstart.md deleted file mode 100644 index 03c3acbe81a5..000000000000 --- a/docs/python-quickstart.md +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: "Python Quickstart" -url: python-quickstart -aliases: - - "python/quickstart" -menu: - main: - parent: "API" - weight: 400 ---- - - - -# Python API Quickstart - -## Installation - -Iceberg python is currently in development, for development and testing purposes the best way to install the library is to perform the following steps: -``` -git clone https://github.com/apache/iceberg.git -cd iceberg/python -pip install -e . -``` - -## Testing -Testing is done using tox. The config can be found in `tox.ini` within the python directory of the iceberg project. 
- -``` -# simply run tox from within the python dir -tox -``` - -# Examples - -## Inspect Table Metadata -``` python - -from iceberg.hive import HiveTables - -# instantiate Hive Tables -conf = {"hive.metastore.uris": 'thrift://{hms_host}:{hms_port}'} -tables = HiveTables(conf) - -# load table -tbl = tables.load("iceberg_db.iceberg_test_table") - -# inspect metadata -print(tbl.schema()) -print(tbl.spec()) -print(tbl.location()) - -# get table level record count -from pprint import pprint -pprint(int(tbl.current_snapshot().summary.get("total-records"))) -``` diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml index cce5d2ba0bf2..54c644b32cf0 100644 --- a/python/.pre-commit-config.yaml +++ b/python/.pre-commit-config.yaml @@ -63,3 +63,11 @@ repos: - id: flake8 args: [ "--ignore=E501,W503,E203" ] additional_dependencies: [ flake8-bugbear==22.7.1, flake8-comprehensions==3.10.0 ] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.16 + hooks: + - id: mdformat + additional_dependencies: + - mdformat-black + - mdformat-config + - mdformat-beautysh diff --git a/python/CONTRIBUTING.md b/python/CONTRIBUTING.md index 91ae8215b4cd..1bde5ce7db0d 100644 --- a/python/CONTRIBUTING.md +++ b/python/CONTRIBUTING.md @@ -26,6 +26,7 @@ pip install poetry ``` If you have an older version of pip and virtualenv you need to update these: + ```bash pip install --upgrade virtualenv pip ``` @@ -81,11 +82,13 @@ make test-s3 To pass additional arguments to pytest, you can use `PYTEST_ARGS`. *Run pytest in verbose mode* + ```sh make test PYTEST_ARGS="-v" ``` *Run pytest with pdb enabled* + ```sh make test PYTEST_ARGS="--pdb" ``` diff --git a/python/README.md b/python/README.md index 292fb7b9eeac..6dfefbce6a01 100644 --- a/python/README.md +++ b/python/README.md @@ -17,12 +17,11 @@ # Iceberg Python -py-iceberg is a python library for programmatic access to iceberg table metadata as well as to table data in iceberg format. 
-It is an implementation of [iceberg table spec](https://iceberg.apache.org/spec/) in Python. +pyiceberg is a python library for programmatic access to iceberg table metadata as well as to table data in iceberg format. It is a Python implementation of [iceberg table spec](https://iceberg.apache.org/spec/). Documentation is available at [https://pyiceberg.apache.org/](https://pyiceberg.apache.org/). ## Getting Started -py-iceberg is currently in development, for development and testing purposes the best way to install the library is to perform the following steps: +pyiceberg is currently in development, for development and testing purposes the best way to install the library is to perform the following steps: ``` git clone https://github.com/apache/iceberg.git @@ -30,11 +29,9 @@ cd iceberg/python pip install -e . ``` -Development is made easy using [Poetry](https://python-poetry.org/docs/#installation). - ## Development -Poetry provides virtual environments for development: +Development is made easy using [Poetry](https://python-poetry.org/docs/#installation). Poetry provides virtual environments for development: ```bash poetry shell @@ -54,4 +51,5 @@ poetry run pytest ``` ## Get in Touch + - [Iceberg community](https://iceberg.apache.org/community/) diff --git a/python/mkdocs/README.md b/python/mkdocs/README.md new file mode 100644 index 000000000000..e9e0462bee5a --- /dev/null +++ b/python/mkdocs/README.md @@ -0,0 +1,28 @@ + + +# Docs + +The pyiceberg docs are stored in `docs/`. + +## Running docs locally + +```sh +pip3 install -r requirements.txt +mkdocs serve +open http://localhost:8000/ +``` diff --git a/python/mkdocs/docs/index.md b/python/mkdocs/docs/index.md new file mode 100644 index 000000000000..35351ef0c86f --- /dev/null +++ b/python/mkdocs/docs/index.md @@ -0,0 +1,548 @@ + + +# PyIceberg + +Much of the python api conforms to the Java API. You can get more info about the java api [here](https://iceberg.apache.org/docs/latest/java-api-quickstart/). 
+ +## Installing + +You can install the latest release version from pypi: + +```sh +pip3 install "pyiceberg[s3fs,hive]" +``` + +Or install the latest development version locally: + +```sh +git clone https://github.com/apache/iceberg.git +cd iceberg/python +pip3 install -e ".[s3fs,hive]" +``` + +You can mix and match optional dependencies: + +| Key | Description: | +|-----------|----------------------------------------------------------------------| +| hive | Support for the Hive metastore | +| pyarrow | PyArrow as a FileIO implementation to interact with the object store | +| s3fs | S3FS as a FileIO implementation to interact with the object store | +| snappy | Support for snappy Avro compression | + +# Python CLI Quickstart + +Pyiceberg comes with a CLI that's available after installing the `pyiceberg` package. + +```sh +➜ pyiceberg --help +Usage: pyiceberg [OPTIONS] COMMAND [ARGS]... + +Options: +--catalog TEXT +--verbose BOOLEAN +--output [text|json] +--uri TEXT +--credential TEXT +--help Show this message and exit. + +Commands: +describe Describes a namespace xor table +drop Operations to drop a namespace or table +list Lists tables or namespaces +location Returns the location of the table +properties Properties on tables/namespaces +rename Renames a table +schema Gets the schema of the table +spec Returns the partition spec of the table +uuid Returns the UUID of the table +``` + +# Configuration + +There are three ways of setting the configuration. + +For the CLI you can pass it in using `--uri` and `--credential` and it will automatically detect the type based on the scheme (`http(s)` for rest, `thrift` for Hive). 
+ +Secondly, YAML based configuration is supported `cat ~/.pyiceberg.yaml`: + +```yaml +catalog: + default: + uri: thrift://localhost:9083 + s3.endpoint: http://localhost:9000 + s3.access-key-id: admin + s3.secret-access-key: password + + rest: + uri: http://rest-catalog/ws/ + credential: t-1234:secret +``` + +Lastly, you can also set it using environment variables: + +```sh +export PYICEBERG_CATALOG__DEFAULT__URI=thrift://localhost:9083 + +export PYICEBERG_CATALOG__REST__URI=http://rest-catalog/ws/ +export PYICEBERG_CATALOG__REST__CREDENTIAL=t-1234:secret +``` + +Where the structure is equivalent to the YAML. The levels are separated using a double underscore (`__`). + +## FileIO configuration + +For the FileIO there are several configuration options available: + +| Key | Example | Description | +|----------------------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| s3.endpoint | https://10.0.19.25/ | Configure an alternative endpoint of the S3 service for the FileIO to access. This could be used to use S3FileIO with any s3-compatible object storage service that has a different endpoint, or access a private S3 endpoint in a virtual private cloud. | +| s3.access-key-id | admin | Configure the static access key ID used to access the FileIO. | +| s3.secret-access-key | password | Configure the static secret access key used to access the FileIO. | +| s3.signer | bearer | Configure the signature version of the FileIO. | + +# CLI Quickstart + +This example assumes that you have a default catalog set. If you want to load another catalog, for example the rest catalog above, then you need to set `--catalog rest`.
+ +```sh +➜ pyiceberg list +default +nyc +``` + +```sh +➜ pyiceberg list nyc +nyc.taxis +``` + +```sh +pyiceberg describe nyc.taxis +Table format version 1 +Metadata location file:/.../nyc.db/taxis/metadata/00000-aa3a3eac-ea08-4255-b890-383a64a94e42.metadata.json +Table UUID 6cdfda33-bfa3-48a7-a09e-7abb462e3460 +Last Updated 1661783158061 +Partition spec [] +Sort order [] +Current schema Schema, id=0 +├── 1: VendorID: optional long +├── 2: tpep_pickup_datetime: optional timestamptz +├── 3: tpep_dropoff_datetime: optional timestamptz +├── 4: passenger_count: optional double +├── 5: trip_distance: optional double +├── 6: RatecodeID: optional double +├── 7: store_and_fwd_flag: optional string +├── 8: PULocationID: optional long +├── 9: DOLocationID: optional long +├── 10: payment_type: optional long +├── 11: fare_amount: optional double +├── 12: extra: optional double +├── 13: mta_tax: optional double +├── 14: tip_amount: optional double +├── 15: tolls_amount: optional double +├── 16: improvement_surcharge: optional double +├── 17: total_amount: optional double +├── 18: congestion_surcharge: optional double +└── 19: airport_fee: optional double +Current snapshot Operation.APPEND: id=5937117119577207079, schema_id=0 +Snapshots Snapshots +└── Snapshot 5937117119577207079, schema 0: file:/.../nyc.db/taxis/metadata/snap-5937117119577207079-1-94656c4f-4c66-4600-a4ca-f30377300527.avro +Properties owner root +write.format.default parquet +``` + +Or output in JSON for automation: + +```sh +pyiceberg --output json describe nyc.taxis | jq +{ + "identifier": [ + "nyc", + "taxis" + ], + "metadata_location": "file:/.../nyc.db/taxis/metadata/00000-aa3a3eac-ea08-4255-b890-383a64a94e42.metadata.json", + "metadata": { + "location": "file:/.../nyc.db/taxis", + "table-uuid": "6cdfda33-bfa3-48a7-a09e-7abb462e3460", + "last-updated-ms": 1661783158061, + "last-column-id": 19, + "schemas": [ + { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "VendorID", + "type": "long", + 
"required": false + }, +... + { + "id": 19, + "name": "airport_fee", + "type": "double", + "required": false + } + ], + "schema-id": 0, + "identifier-field-ids": [] + } + ], + "current-schema-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [] + } + ], + "default-spec-id": 0, + "last-partition-id": 999, + "properties": { + "owner": "root", + "write.format.default": "parquet" + }, + "current-snapshot-id": 5937117119577207000, + "snapshots": [ + { + "snapshot-id": 5937117119577207000, + "timestamp-ms": 1661783158061, + "manifest-list": "file:/.../nyc.db/taxis/metadata/snap-5937117119577207079-1-94656c4f-4c66-4600-a4ca-f30377300527.avro", + "summary": { + "operation": "append", + "spark.app.id": "local-1661783139151", + "added-data-files": "1", + "added-records": "2979431", + "added-files-size": "46600777", + "changed-partition-count": "1", + "total-records": "2979431", + "total-files-size": "46600777", + "total-data-files": "1", + "total-delete-files": "0", + "total-position-deletes": "0", + "total-equality-deletes": "0" + }, + "schema-id": 0 + } + ], + "snapshot-log": [ + { + "snapshot-id": "5937117119577207079", + "timestamp-ms": 1661783158061 + } + ], + "metadata-log": [], + "sort-orders": [ + { + "order-id": 0, + "fields": [] + } + ], + "default-sort-order-id": 0, + "refs": { + "main": { + "snapshot-id": 5937117119577207000, + "type": "branch" + } + }, + "format-version": 1, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "VendorID", + "type": "long", + "required": false + }, +... 
+ { + "id": 19, + "name": "airport_fee", + "type": "double", + "required": false + } + ], + "schema-id": 0, + "identifier-field-ids": [] + }, + "partition-spec": [] + } +} +``` + +# Python API + +To instantiate a catalog: + +```python +from pyiceberg.catalog import load_catalog + +catalog = load_catalog("prod") + +catalog.list_namespaces() +``` + +Returns: + +``` +[('default',), ('nyc',)] +``` + +Listing the tables in the `nyc` namespace: + +```python +catalog.list_tables("nyc") +``` + +Returns: + +``` +[('nyc', 'taxis')] +``` + +Loading the `taxis` table: + +```python +catalog.load_table(("nyc", "taxis")) +``` + +``` +Table( + identifier=('nyc', 'taxis'), + metadata_location='s3a://warehouse/wh/nyc.db/taxis/metadata/00002-6ea51ce3-62aa-4197-9cf8-43d07c3440ca.metadata.json', + metadata=TableMetadataV2( + location='s3a://warehouse/wh/nyc.db/taxis', + table_uuid=UUID('ebd5d172-2162-453d-b586-1cdce52c1116'), + last_updated_ms=1662633437826, + last_column_id=19, + schemas=[Schema( + NestedField(field_id=1, name='VendorID', field_type=LongType(), required=False), + NestedField(field_id=2, name='tpep_pickup_datetime', field_type=TimestamptzType(), required=False), + NestedField(field_id=3, name='tpep_dropoff_datetime', field_type=TimestamptzType(), required=False), + NestedField(field_id=4, name='passenger_count', field_type=DoubleType(), required=False), + NestedField(field_id=5, name='trip_distance', field_type=DoubleType(), required=False), + NestedField(field_id=6, name='RatecodeID', field_type=DoubleType(), required=False), + NestedField(field_id=7, name='store_and_fwd_flag', field_type=StringType(), required=False), + NestedField(field_id=8, name='PULocationID', field_type=LongType(), required=False), + NestedField(field_id=9, name='DOLocationID', field_type=LongType(), required=False), + NestedField(field_id=10, name='payment_type', field_type=LongType(), required=False), + NestedField(field_id=11, name='fare_amount', field_type=DoubleType(), required=False), + 
NestedField(field_id=12, name='extra', field_type=DoubleType(), required=False), + NestedField(field_id=13, name='mta_tax', field_type=DoubleType(), required=False), + NestedField(field_id=14, name='tip_amount', field_type=DoubleType(), required=False), + NestedField(field_id=15, name='tolls_amount', field_type=DoubleType(), required=False), + NestedField(field_id=16, name='improvement_surcharge', field_type=DoubleType(), required=False), + NestedField(field_id=17, name='total_amount', field_type=DoubleType(), required=False), + NestedField(field_id=18, name='congestion_surcharge', field_type=DoubleType(), required=False), + NestedField(field_id=19, name='airport_fee', field_type=DoubleType(), required=False) + ), + schema_id=0, + identifier_field_ids=[] + )], + current_schema_id=0, + partition_specs=[PartitionSpec(spec_id=0)], + default_spec_id=0, + last_partition_id=999, + properties={ + 'owner': 'root', + 'write.format.default': 'parquet' + }, + current_snapshot_id=8334458494559715805, + snapshots=[ + Snapshot( + snapshot_id=7910949481055846233, + parent_snapshot_id=None, + sequence_number=None, + timestamp_ms=1662489306555, + manifest_list='s3a://warehouse/wh/nyc.db/taxis/metadata/snap-7910949481055846233-1-3eb7a2e1-5b7a-4e76-a29a-3e29c176eea4.avro', + summary=Summary( + Operation.APPEND, + **{ + 'spark.app.id': 'local-1662489289173', + 'added-data-files': '1', + 'added-records': '2979431', + 'added-files-size': '46600777', + 'changed-partition-count': '1', + 'total-records': '2979431', + 'total-files-size': '46600777', + 'total-data-files': '1', + 'total-delete-files': '0', + 'total-position-deletes': '0', + 'total-equality-deletes': '0' + } + ), + schema_id=0 + ), + ], + snapshot_log=[ + SnapshotLogEntry( + snapshot_id='7910949481055846233', + timestamp_ms=1662489306555 + ) + ], + metadata_log=[ + MetadataLogEntry( + metadata_file='s3a://warehouse/wh/nyc.db/taxis/metadata/00000-b58341ba-6a63-4eea-9b2f-e85e47c7d09f.metadata.json', + timestamp_ms=1662489306555 
+ ) + ], + sort_orders=[SortOrder(order_id=0)], + default_sort_order_id=0, + refs={ + 'main': SnapshotRef( + snapshot_id=8334458494559715805, + snapshot_ref_type=SnapshotRefType.BRANCH, + min_snapshots_to_keep=None, + max_snapshot_age_ms=None, + max_ref_age_ms=None + ) + }, + format_version=2, + last_sequence_number=1 + ) +) +``` + +And to create a table from a catalog: + +```python +from pyiceberg.schema import Schema +from pyiceberg.types import TimestampType, DoubleType, StringType, NestedField + +schema = Schema( + NestedField( + field_id=1, name="datetime", field_type=TimestampType(), required=False + ), + NestedField(field_id=2, name="bid", field_type=DoubleType(), required=False), + NestedField(field_id=3, name="ask", field_type=DoubleType(), required=False), + NestedField(field_id=4, name="symbol", field_type=StringType(), required=False), +) + +from pyiceberg.table.partitioning import PartitionSpec, PartitionField +from pyiceberg.transforms import DayTransform + +partition_spec = PartitionSpec( + PartitionField( + source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day" + ) +) + +from pyiceberg.table.sorting import SortOrder, SortField +from pyiceberg.transforms import IdentityTransform + +sort_order = SortOrder(SortField(source_id=4, transform=IdentityTransform())) + +from pyiceberg.catalog.hive import HiveCatalog + +catalog = HiveCatalog(name="prod", uri="thrift://localhost:9083/") + +catalog.create_table( + identifier="default.bids", + location="/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/bids/", + schema=schema, + partition_spec=partition_spec, + sort_order=sort_order, +) +``` + +Which returns a newly created table: + +``` +Table( + identifier=('default', 'bids'), + metadata_location='/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/bids//metadata/00000-c8cd93ab-f784-474d-a167-b1a86b05195f.metadata.json', + metadata=TableMetadataV2( + location='/Users/fokkodriesprong/Desktop/docker-spark-iceberg/wh/bids/', + 
table_uuid=UUID('38d4cb39-4945-4bf2-b374-984b5c4984d2'), + last_updated_ms=1661847562069, + last_column_id=4, + schemas=[ + Schema( + NestedField(field_id=1, name='datetime', field_type=TimestampType(), required=False), + NestedField(field_id=2, name='bid', field_type=DoubleType(), required=False), + NestedField(field_id=3, name='ask', field_type=DoubleType(), required=False), + NestedField(field_id=4, name='symbol', field_type=StringType(), required=False)), + schema_id=1, + identifier_field_ids=[]) + ], + current_schema_id=1, + partition_specs=[ + PartitionSpec( + PartitionField(source_id=1, field_id=1000, transform=DayTransform(), name='datetime_day'),)) + ], + default_spec_id=0, + last_partition_id=1000, + properties={}, + current_snapshot_id=None, + snapshots=[], + snapshot_log=[], + metadata_log=[], + sort_orders=[ + SortOrder(order_id=1, fields=[SortField(source_id=4, transform=IdentityTransform(), direction=SortDirection.ASC, null_order=NullOrder.NULLS_FIRST)]) + ], + default_sort_order_id=1, + refs={}, + format_version=2, + last_sequence_number=0 + ) +) +``` + +# Feature Support + +The goal is that the Python library will provide a functional, performant subset of the Java library. The initial focus has been on reading table metadata and providing a convenient CLI to go through the catalog. + +## Metadata + +| Operation | Java | Python | +|:------------------------|:-----:|:------:| +| Get Schema | X | X | +| Get Snapshots | X | X | +| Plan Scan | X | | +| Plan Scan for Snapshot | X | | +| Update Current Snapshot | X | | +| Set Table Properties | X | X | +| Create Table | X | X | +| Drop Table | X | X | +| Alter Table | X | | + +## Types + +The types are kept in `pyiceberg.types`.
+ +Primitive types: + +- `BooleanType` +- `StringType` +- `IntegerType` +- `LongType` +- `FloatType` +- `DoubleType` +- `DateType` +- `TimeType` +- `TimestampType` +- `TimestamptzType` +- `BinaryType` +- `UUIDType` + +Complex types: + +- `StructType` +- `ListType` +- `MapType` +- `FixedType(16)` +- `DecimalType(8, 3)` diff --git a/python/mkdocs/mkdocs.yml b/python/mkdocs/mkdocs.yml new file mode 100644 index 000000000000..c84a2de465ee --- /dev/null +++ b/python/mkdocs/mkdocs.yml @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +--- +site_name: PyIceberg diff --git a/python/mkdocs/requirements.txt b/python/mkdocs/requirements.txt new file mode 100644 index 000000000000..642a688ebc5a --- /dev/null +++ b/python/mkdocs/requirements.txt @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +mkdocs==1.3.1 +jinja2==3.0.3 From d3900cbf2987d9519506f8331a9fede7264b969a Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 21 Sep 2022 20:04:40 +0200 Subject: [PATCH 2/2] Comments --- .github/workflows/python-ci-docs.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/python-ci-docs.yml b/.github/workflows/python-ci-docs.yml index 3f579b5997b3..9ae255328d61 100644 --- a/.github/workflows/python-ci-docs.yml +++ b/.github/workflows/python-ci-docs.yml @@ -22,9 +22,8 @@ on: push: branches: - 'master' - pull_request: paths: - - '.github/workflows/python-ci-docs.yml' + - 'python/mkdocs/**' concurrency: group: ${{ github.workflow }}-${{ github.ref }}