diff --git a/.devcontainer/noop.txt b/.devcontainer/noop.txt index dde8dc3c1..848dd9852 100644 --- a/.devcontainer/noop.txt +++ b/.devcontainer/noop.txt @@ -1,3 +1,3 @@ This file copied into the container along with environment.yml* from the parent -folder. This file is included to prevents the Dockerfile COPY instruction from +folder. This file is included to prevents the Dockerfile COPY instruction from failing if no environment.yml is found. \ No newline at end of file diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index d41bc9db3..36966927d 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,4 +1,4 @@ - ## Related Tickets & Documents @@ -28,7 +28,7 @@ Closes # - [ ] Documentation Update ## Steps to QA - 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mrg\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[1;32m 6\u001b[0m client \u001b[38;5;241m=\u001b[39m rg\u001b[38;5;241m.\u001b[39mExtralit()\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/__init__.py:15\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_version\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__ \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mclient\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mworkspaces\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/client.py:22\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING, List, Optional, Union, overload\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UUID\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _api\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceAPI\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DEFAULT_HTTP_CONFIG\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_api/__init__.py:15\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_datasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_http\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_workspaces\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_api/_datasets.py:21\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceAPI\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_exceptions\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m api_error_handler\n\u001b[0;32m---> 21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DatasetModel\n\u001b[1;32m 23\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDatasetsAPI\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_dataset_progress\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UserProgressModel, DatasetProgressModel\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_models/__init__.py:17\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# We skip the flake8 check because we are importing all the models and the import order is important\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_resource\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceModel\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_workspace\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WorkspaceModel\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_user\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UserModel, Role\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_models/_resource.py:19\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Optional\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UUID\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpydantic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseModel, field_serializer\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mResourceModel\u001b[39;00m(BaseModel):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Base model for all resources (DatasetModel, WorkspaceModel, UserModel, etc.)\"\"\"\u001b[39;00m\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'field_serializer' from 'pydantic' (/Users/jonny/micromamba/envs/extralit/lib/python3.9/site-packages/pydantic/__init__.cpython-39-darwin.so)" - ] - } - ], - "source": [ - "from datetime import datetime\n", - "\n", - "import extralit as ex\n", - "from datasets import load_dataset\n", - "\n", - "client = ex.Extralit()" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "for dataset in client.datasets.list():\n", - " dataset.delete()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Chat Field\n" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ben/code/argilla/argilla/src/argilla/datasets/_resource.py:203: UserWarning: Workspace not provided. Using default workspace: argilla id: 735cae0d-eb08-45c3-ad79-0a11ad4dd2c2\n", - " warnings.warn(f\"Workspace not provided. Using default workspace: {workspace.name} id: {workspace.id}\")\n" - ] - }, - { - "data": { - "text/plain": [ - "Dataset(id=UUID('ee5fc998-b475-45a8-86e7-7ff427d43268') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 148167) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527) name='static_chat_20240823124650' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527))" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "settings = ex.Settings(\n", - " fields=[\n", - " ex.ChatField(\n", - " name=\"chosen\",\n", - " ),\n", - " ex.ChatField(\n", - " name=\"rejected\",\n", - " ),\n", - " ],\n", - " questions=[\n", - " ex.RatingQuestion(\"rating\", title=\"How would you rate the conversation?\", required=True, values=[1, 2, 3, 4, 5]),\n", - " ex.TextQuestion(\"improved_chosen\", title=\"Rewrite the chosen conversation\", required=False),\n", - " ],\n", - ")\n", - "\n", - "dataset = ex.Dataset(\n", - " settings=settings,\n", - " name=f\"static_chat_{datetime.now().strftime('%Y%m%d%H%M%S')}\",\n", - ")\n", - "\n", - "dataset.create()" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ben/code/argilla/argilla/src/argilla/records/_mapping/_mapper.py:89: UserWarning: Keys ['source', 'chosen_rating', 'chosen_model', 'rejected_rating', 'rejected_model'] in data are not present in the mapping and will be ignored.\n", - " warnings.warn(f\"Keys {unknown_keys} in data are not present in the mapping and will be ignored.\")\n" - ] - }, - { - "data": { - "text/html": [ - "
DatasetRecords: The provided batch size 256 was normalized. Using value 100.\n",
-                            "
\n" - ], - "text/plain": [ - "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m100\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Sending records...: 100%|██████████| 1/1 [00:00<00:00, 3.45batch/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetRecords(Dataset(id=UUID('ee5fc998-b475-45a8-86e7-7ff427d43268') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 148167) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527) name='static_chat_20240823124650' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527)))" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", - "dataset.records.log(ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Custom Field" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset(id=UUID('620fb219-73cb-42c6-bad0-456880a93ab9') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 58, 842638) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418) name='interactive_chat_20240823124658' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418))" - ] - }, - "execution_count": 110, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "html_template_path = \"interactive_chat.html\"\n", - "\n", - "settings = ex.Settings(\n", - " fields=[\n", - " ex.CustomField(name=\"chosen\", template=html_template_path, required=False),\n", - " ex.ChatField(\n", - " name=\"rejected\",\n", - " ),\n", - " ],\n", - " questions=[\n", - " ex.RatingQuestion(\n", - " \"rating\", title=\"How would you rate the conversation?\", required=True, values=[1, 2, 3, 4, 5]\n", - " ),\n", - " ex.TextQuestion(\n", - " \"improved_chosen\", title=\"Rewrite the chosen conversation\", required=True\n", - " ),\n", - " ],\n", - ")\n", - "\n", - "dataset = ex.Dataset(\n", - " settings=settings,\n", - " name=f\"interactive_chat_{datetime.now().strftime('%Y%m%d%H%M%S')}\",\n", - ")\n", - "\n", - "dataset.create()" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ben/code/argilla/argilla/src/argilla/records/_mapping/_mapper.py:89: UserWarning: Keys ['source', 'chosen_rating', 'chosen_model', 'rejected_rating', 'rejected_model', 'messages'] in data are not present in the mapping and will be ignored.\n", - " warnings.warn(f\"Keys {unknown_keys} in data are not present in the mapping and will be ignored.\")\n" - ] - }, - { - "data": { - "text/html": [ - "
DatasetRecords: The provided batch size 256 was normalized. Using value 100.\n",
-                            "
\n" - ], - "text/plain": [ - "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m100\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Sending records...: 100%|██████████| 1/1 [00:00<00:00, 3.32batch/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetRecords(Dataset(id=UUID('620fb219-73cb-42c6-bad0-456880a93ab9') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 58, 842638) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418) name='interactive_chat_20240823124658' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418)))" - ] - }, - "execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", - "ds = ds.map(lambda x: {\"messages\": x[\"chosen\"]})\n", - "dataset.records.log(ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.19" - } + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'field_serializer' from 'pydantic' (/Users/jonny/micromamba/envs/extralit/lib/python3.9/site-packages/pydantic/__init__.cpython-39-darwin.so)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatetime\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m datetime\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mrg\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[1;32m 6\u001b[0m client \u001b[38;5;241m=\u001b[39m rg\u001b[38;5;241m.\u001b[39mExtralit()\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/__init__.py:15\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_version\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__ \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mclient\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mworkspaces\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/client.py:22\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING, List, Optional, Union, overload\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UUID\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _api\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceAPI\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DEFAULT_HTTP_CONFIG\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_api/__init__.py:15\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_datasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_http\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_workspaces\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_api/_datasets.py:21\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceAPI\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_exceptions\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m api_error_handler\n\u001b[0;32m---> 21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DatasetModel\n\u001b[1;32m 23\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDatasetsAPI\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_dataset_progress\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UserProgressModel, DatasetProgressModel\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_models/__init__.py:17\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# We skip the flake8 check because we are importing all the models and the import order is important\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_resource\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceModel\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_workspace\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WorkspaceModel\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_user\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UserModel, Role\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_models/_resource.py:19\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Optional\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UUID\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpydantic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseModel, field_serializer\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mResourceModel\u001b[39;00m(BaseModel):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Base model for all resources (DatasetModel, WorkspaceModel, UserModel, etc.)\"\"\"\u001b[39;00m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'field_serializer' from 'pydantic' (/Users/jonny/micromamba/envs/extralit/lib/python3.9/site-packages/pydantic/__init__.cpython-39-darwin.so)" + ] + } + ], + "source": [ + "from datetime import datetime\n", + "\n", + "import extralit as ex\n", + "from datasets import load_dataset\n", + "\n", + "client = ex.Extralit()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "for dataset in client.datasets.list():\n", + " dataset.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chat Field\n" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ben/code/argilla/argilla/src/argilla/datasets/_resource.py:203: UserWarning: Workspace not provided. Using default workspace: argilla id: 735cae0d-eb08-45c3-ad79-0a11ad4dd2c2\n", + " warnings.warn(f\"Workspace not provided. Using default workspace: {workspace.name} id: {workspace.id}\")\n" + ] }, - "nbformat": 4, - "nbformat_minor": 2 + { + "data": { + "text/plain": [ + "Dataset(id=UUID('ee5fc998-b475-45a8-86e7-7ff427d43268') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 148167) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527) name='static_chat_20240823124650' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527))" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "settings = ex.Settings(\n", + " fields=[\n", + " ex.ChatField(\n", + " name=\"chosen\",\n", + " ),\n", + " ex.ChatField(\n", + " name=\"rejected\",\n", + " ),\n", + " ],\n", + " questions=[\n", + " ex.RatingQuestion(\n", + " \"rating\",\n", + " title=\"How would you rate the conversation?\",\n", + " required=True,\n", + " values=[1, 2, 3, 4, 5],\n", + " ),\n", + " ex.TextQuestion(\n", + " \"improved_chosen\", title=\"Rewrite the chosen conversation\", required=False\n", + " ),\n", + " ],\n", + ")\n", + "\n", + "dataset = ex.Dataset(\n", + " settings=settings,\n", + " name=f\"static_chat_{datetime.now().strftime('%Y%m%d%H%M%S')}\",\n", + ")\n", + "\n", + "dataset.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ben/code/argilla/argilla/src/argilla/records/_mapping/_mapper.py:89: UserWarning: Keys ['source', 'chosen_rating', 'chosen_model', 'rejected_rating', 'rejected_model'] in data are not present in the mapping and will be ignored.\n", + " warnings.warn(f\"Keys {unknown_keys} in data are not present in the mapping and will be ignored.\")\n" + ] + }, + { + "data": { + "text/html": [ + "
DatasetRecords: The provided batch size 256 was normalized. Using value 100.\n",
+       "
\n" + ], + "text/plain": [ + "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m100\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending records...: 100%|██████████| 1/1 [00:00<00:00, 3.45batch/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetRecords(Dataset(id=UUID('ee5fc998-b475-45a8-86e7-7ff427d43268') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 148167) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527) name='static_chat_20240823124650' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527)))" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", + "dataset.records.log(ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom Field" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(id=UUID('620fb219-73cb-42c6-bad0-456880a93ab9') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 58, 842638) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418) name='interactive_chat_20240823124658' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418))" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "html_template_path = \"interactive_chat.html\"\n", + "\n", + "settings = ex.Settings(\n", + " fields=[\n", + " ex.CustomField(name=\"chosen\", template=html_template_path, required=False),\n", + " ex.ChatField(\n", + " name=\"rejected\",\n", + " ),\n", + " ],\n", + " questions=[\n", + " ex.RatingQuestion(\n", + " \"rating\",\n", + " title=\"How would you rate the conversation?\",\n", + " required=True,\n", + " values=[1, 2, 3, 4, 5],\n", + " ),\n", + " ex.TextQuestion(\n", + " \"improved_chosen\", title=\"Rewrite the chosen conversation\", required=True\n", + " ),\n", + " ],\n", + ")\n", + "\n", + "dataset = ex.Dataset(\n", + " settings=settings,\n", + " name=f\"interactive_chat_{datetime.now().strftime('%Y%m%d%H%M%S')}\",\n", + ")\n", + "\n", + "dataset.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ben/code/argilla/argilla/src/argilla/records/_mapping/_mapper.py:89: UserWarning: Keys ['source', 'chosen_rating', 'chosen_model', 'rejected_rating', 'rejected_model', 'messages'] in data are not present in the mapping and will be ignored.\n", + " warnings.warn(f\"Keys {unknown_keys} in data are not present in the mapping and will be ignored.\")\n" + ] + }, + { + "data": { + "text/html": [ + "
DatasetRecords: The provided batch size 256 was normalized. Using value 100.\n",
+       "
\n" + ], + "text/plain": [ + "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m100\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending records...: 100%|██████████| 1/1 [00:00<00:00, 3.32batch/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetRecords(Dataset(id=UUID('620fb219-73cb-42c6-bad0-456880a93ab9') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 58, 842638) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418) name='interactive_chat_20240823124658' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418)))" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", + "ds = ds.map(lambda x: {\"messages\": x[\"chosen\"]})\n", + "dataset.records.log(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } \ No newline at end of file diff --git a/examples/custom_field/table_field.ipynb b/examples/custom_field/table_field.ipynb index 5644f404b..440d8c335 100644 --- a/examples/custom_field/table_field.ipynb +++ b/examples/custom_field/table_field.ipynb @@ -1,776 +1,768 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - " \n", - "from datetime import datetime\n", - "import json\n", - "\n", - "import extralit as ex\n", - "from datasets import load_dataset\n", - "\n", - "client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key='extralit.apikey')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "for dataset in client.datasets.list():\n", - " print(dataset.name)\n", - " # dataset.delete()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load extraction dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset(id=UUID('3a7abf40-a6b7-4cf6-ac09-d89a8b33ac67') inserted_at=datetime.datetime(2024, 4, 4, 5, 23, 44, 562080) updated_at=datetime.datetime(2024, 11, 15, 0, 35, 14, 753190) name='2-Data-Extractions' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 12, 1, 5, 55, 8, 469548))" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset = client.datasets(\n", - " name=\"2-Data-Extractions\",\n", - " workspace=\"itn-recalibration\"\n", - ")\n", - "dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## Update field" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'Observation'}\n", - "\n", - "\n", - "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'ITNCondition'}\n", - "\n", - "\n", - "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'EntomologicalOutcome'}\n", - "\n", - "\n", - "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'ClinicalOutcome'}\n", - "\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "4" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Find the record with the specific metadata\n", - "records = dataset.records(query=ex.Query(filter=(\"metadata.reference\", \"==\", \"mosqueira2015pilot\")))\n", - "\n", - "# Update the record's extraction field\n", - "updated_records = []\n", - "for record in records:\n", - " print(record.metadata)\n", - " print(type(record.fields[\"extraction\"]))\n", - " record.fields[\"extraction\"] = json.loads(record.fields[\"extraction\"])\n", - " print(type(record.fields[\"extraction\"]))\n", - " updated_records.append(record)\n", - "\n", - "len(updated_records)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
DatasetRecords: The provided batch size 256 was normalized. Using value 4.\n",
-                            "
\n" - ], - "text/plain": [ - "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m4\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Sending records...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00, 4.41s/batch]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetRecords(Dataset(id=UUID('3a7abf40-a6b7-4cf6-ac09-d89a8b33ac67') inserted_at=datetime.datetime(2024, 4, 4, 5, 23, 44, 562080) updated_at=datetime.datetime(2024, 11, 15, 0, 35, 14, 753190) name='2-Data-Extractions' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 11, 21, 18, 7, 47, 105497)))" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset.records.log(updated_records)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'reference': 'mosqueira2015pilot',\n", - " 'schema': {'fields': [{'name': 'observation_ref',\n", - " 'type': 'any',\n", - " 'extDtype': 'string'},\n", - " {'name': 'itncondition_ref', 'type': 'any', 'extDtype': 'string'},\n", - " {'name': 'N_people', 'type': 'integer'},\n", - " {'name': 'Age_lower', 'type': 'number'},\n", - " {'name': 'Age_upper', 'type': 'number'}],\n", - " 'primaryKey': ['observation_ref', 'itncondition_ref'],\n", - " 'pandas_version': '1.4.0'},\n", - " 'data': [{'observation_ref': 'S01',\n", - " 'itncondition_ref': 'N01',\n", - " 'N_people': 3903,\n", - " 'Age_lower': 0.5,\n", - " 'Age_upper': 14.0},\n", - " {'observation_ref': 'S02',\n", - " 'itncondition_ref': 'N01',\n", - " 'N_people': 3903,\n", - " 'Age_lower': 0.5,\n", - " 'Age_upper': 14.0}],\n", - " 'validation': {'schema_type': 'dataframe',\n", - " 'version': '0.18.3',\n", - " 'columns': {'N_people': {'title': None,\n", - " 'description': 'Number of people in the study arm of the net in question',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': None,\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Age_lower': {'title': None,\n", - " 'description': 'Lower limit of age group in years. For children <1, enter age as a decimal.',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Age_upper': {'title': None,\n", - " 'description': 'Upper limit of age group in years. For children <1, enter age as a decimal.',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_pos': {'title': None,\n", - " 'description': 'Number of people tested to be parasite positive',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR': {'title': None,\n", - " 'description': 'Definition: (N_pos/N_people)*100',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR_rate_lower': {'title': None,\n", - " 'description': 'Lower bound of parasite positivity rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR_rate_upper': {'title': None,\n", - " 'description': 'Upper bound of parasite positivity rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM': {'title': None,\n", - " 'description': 'Number of people with clinical malaria',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': None,\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate': {'title': None,\n", - " 'description': 'Definition: (CM/N_people)*100',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate_lower': {'title': None,\n", - " 'description': 'Lower bound of clinical malaria rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate_upper': {'title': None,\n", - " 'description': 'Upper bound of clinical malaria rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Net_retention': {'title': None,\n", - " 'description': 'Number of nets still owned divided by a number of nets previously distributed',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_nets': {'title': None,\n", - " 'description': 'Number of nets found in household or community study arm',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_sleep_nets': {'title': None,\n", - " 'description': 'Number of people that slept under a net the previous night',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Perc_sleep_nets': {'title': None,\n", - " 'description': 'Percent of people that slept under a net the previous night',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False}},\n", - " 'checks': {'check_less_than': {'columns_a': ['Age_lower'],\n", - " 'columns_b': ['Age_upper'],\n", - " 'or_equal': True},\n", - " 'check_greater_than': {'columns_a': 'N_people',\n", - " 'columns_b': ['N_pos', 'CM', 'N_sleep_nets'],\n", - " 'or_equal': True},\n", - " 'check_between': {'columns_target': ['PR', 'CM_rate'],\n", - " 'columns_lower': ['PR_rate_lower', 'CM_rate_lower'],\n", - " 'columns_upper': ['PR_rate_upper', 'CM_rate_upper'],\n", - " 'or_equal': True}},\n", - " 'index': [{'title': 'Observation reference',\n", - " 'description': None,\n", - " 'dtype': 'str',\n", - " 'nullable': False,\n", - " 'checks': {'str_startswith': 'S'},\n", - " 'name': 'observation_ref',\n", - " 'unique': False,\n", - " 'coerce': False},\n", - " {'title': 'ITNCondition reference',\n", - " 'description': None,\n", - " 'dtype': 'str',\n", - " 'nullable': False,\n", - " 'checks': {'str_startswith': 'N'},\n", - " 'name': 'itncondition_ref',\n", - " 'unique': False,\n", - " 'coerce': False}],\n", - " 'dtype': None,\n", - " 'coerce': True,\n", - " 'strict': True,\n", - " 'name': 'ClinicalOutcome',\n", - " 'ordered': False,\n", - " 'unique': None,\n", - " 'report_duplicates': 'all',\n", - " 'unique_column_names': False,\n", - " 'add_missing_columns': False,\n", - " 'title': None,\n", - " 'description': '\\nEpidemiological and clinical outcomes on humans collected from a clinical trial or village trial, if reported in the study.\\nEach clinical outcome should have unique `observation_ref`, `itn_condition_ref`, `Group`, `Age_lower`, and `Age_upper` (if reported).\\n '}}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "json.loads(record.fields[\"extraction\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Custom Field" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonny/Projects/extralit/argilla/src/argilla/client.py:354: UserWarning: Dataset with name 'interactive_chat' not found in workspace 'itn-recalibration'\n", - " warnings.warn(f\"Dataset with name {name!r} not found in workspace {workspace.name!r}\")\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'NoneType' object has no attribute 'delete'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mdatasets(\n\u001b[1;32m 2\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minteractive_chat\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# workspace=\"itn-recalibration\"\u001b[39;00m\n\u001b[1;32m 4\u001b[0m )\n\u001b[0;32m----> 5\u001b[0m \u001b[43mdataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m()\n", - "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'delete'" - ] - } - ], - "source": [ - "dataset = client.datasets(\n", - " name=\"interactive_chat\",\n", - " # workspace=\"itn-recalibration\"\n", - ")\n", - "dataset.delete()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset(id=UUID('92b559e7-8eff-4d4c-85bf-817fd73570e4') inserted_at=datetime.datetime(2024, 12, 2, 21, 33, 33, 529345) updated_at=datetime.datetime(2024, 12, 2, 21, 33, 39, 111530) name='interactive_chat' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 12, 2, 21, 33, 39, 111530))" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "settings = ex.Settings(\n", - " fields=[\n", - " ex.TableField(name=\"chosen\", required=False),\n", - " ],\n", - " questions=[\n", - " ex.TableQuestion(\n", - " \"extraction\", title=\"Correct the table\", required=True\n", - " ),\n", - " ],\n", - ")\n", - "\n", - "dataset = ex.Dataset(\n", - " settings=settings,\n", - " name=f\"interactive_chat\",\n", - ")\n", - "\n", - "dataset.create()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'reference': 'mosqueira2015pilot',\n", - " 'schema': {'fields': [{'name': 'observation_ref',\n", - " 'type': 'any',\n", - " 'extDtype': 'string'},\n", - " {'name': 'itncondition_ref', 'type': 'any', 'extDtype': 'string'},\n", - " {'name': 'N_people', 'type': 'integer'},\n", - " {'name': 'Age_lower', 'type': 'number'},\n", - " {'name': 'Age_upper', 'type': 'number'}],\n", - " 'primaryKey': ['observation_ref', 'itncondition_ref'],\n", - " 'pandas_version': '1.4.0'},\n", - " 'data': [{'observation_ref': 'S01',\n", - " 'itncondition_ref': 'N01',\n", - " 'N_people': 3903,\n", - " 'Age_lower': 0.5,\n", - " 'Age_upper': 14.0},\n", - " {'observation_ref': 'S02',\n", - " 'itncondition_ref': 'N01',\n", - " 'N_people': 3903,\n", - " 'Age_lower': 0.5,\n", - " 'Age_upper': 14.0}],\n", - " 'validation': {'schema_type': 'dataframe',\n", - " 'version': '0.18.3',\n", - " 'columns': {'N_people': {'title': None,\n", - " 'description': 'Number of people in the study arm of the net in question',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': None,\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Age_lower': {'title': None,\n", - " 'description': 'Lower limit of age group in years. For children <1, enter age as a decimal.',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Age_upper': {'title': None,\n", - " 'description': 'Upper limit of age group in years. For children <1, enter age as a decimal.',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_pos': {'title': None,\n", - " 'description': 'Number of people tested to be parasite positive',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR': {'title': None,\n", - " 'description': 'Definition: (N_pos/N_people)*100',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR_rate_lower': {'title': None,\n", - " 'description': 'Lower bound of parasite positivity rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR_rate_upper': {'title': None,\n", - " 'description': 'Upper bound of parasite positivity rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM': {'title': None,\n", - " 'description': 'Number of people with clinical malaria',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': None,\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate': {'title': None,\n", - " 'description': 'Definition: (CM/N_people)*100',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate_lower': {'title': None,\n", - " 'description': 'Lower bound of clinical malaria rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate_upper': {'title': None,\n", - " 'description': 'Upper bound of clinical malaria rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Net_retention': {'title': None,\n", - " 'description': 'Number of nets still owned divided by a number of nets previously distributed',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_nets': {'title': None,\n", - " 'description': 'Number of nets found in household or community study arm',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_sleep_nets': {'title': None,\n", - " 'description': 'Number of people that slept under a net the previous night',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Perc_sleep_nets': {'title': None,\n", - " 'description': 'Percent of people that slept under a net the previous night',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False}},\n", - " 'checks': {'check_less_than': {'columns_a': ['Age_lower'],\n", - " 'columns_b': ['Age_upper'],\n", - " 'or_equal': True},\n", - " 'check_greater_than': {'columns_a': 'N_people',\n", - " 'columns_b': ['N_pos', 'CM', 'N_sleep_nets'],\n", - " 'or_equal': True},\n", - " 'check_between': {'columns_target': ['PR', 'CM_rate'],\n", - " 'columns_lower': ['PR_rate_lower', 'CM_rate_lower'],\n", - " 'columns_upper': ['PR_rate_upper', 'CM_rate_upper'],\n", - " 'or_equal': True}},\n", - " 'index': [{'title': 'Observation reference',\n", - " 'description': None,\n", - " 'dtype': 'str',\n", - " 'nullable': False,\n", - " 'checks': {'str_startswith': 'S'},\n", - " 'name': 'observation_ref',\n", - " 'unique': False,\n", - " 'coerce': False},\n", - " {'title': 'ITNCondition reference',\n", - " 'description': None,\n", - " 'dtype': 'str',\n", - " 'nullable': False,\n", - " 'checks': {'str_startswith': 'N'},\n", - " 'name': 'itncondition_ref',\n", - " 'unique': False,\n", - " 'coerce': False}],\n", - " 'dtype': None,\n", - " 'coerce': True,\n", - " 'strict': True,\n", - " 'name': 'ClinicalOutcome',\n", - " 'ordered': False,\n", - " 'unique': None,\n", - " 'report_duplicates': 'all',\n", - " 'unique_column_names': False,\n", - " 'add_missing_columns': False,\n", - " 'title': None,\n", - " 'description': '\\nEpidemiological and clinical outcomes on humans collected from a clinical trial or village trial, if reported in the study.\\nEach clinical outcome should have unique `observation_ref`, `itn_condition_ref`, `Group`, `Age_lower`, and `Age_upper` (if reported).\\n '}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample_table = record.fields['extraction']\n", - "sample_table" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
DatasetRecords: The provided batch size 256 was normalized. Using value 4.\n",
-                            "
\n" - ], - "text/plain": [ - "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m4\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Sending records...: 100%|███████| 1/1 [00:02<00:00, 2.54s/batch]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetRecords(Dataset(id=UUID('a64a827c-f962-417a-a771-ce53f61c0756') inserted_at=datetime.datetime(2024, 11, 29, 23, 9, 55, 104623) updated_at=datetime.datetime(2024, 11, 29, 23, 9, 58, 696913) name='interactive_chat' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 11, 29, 23, 9, 58, 696913)))" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset.records.log([\n", - " {'chosen': sample_table} \\\n", - " for r in updated_records\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", - "# ds = ds.map(lambda x: {\"messages\": x[\"chosen\"]})\n", - "# dataset.records.log(ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import json\n", + "\n", + "import extralit as ex\n", + "\n", + "client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key=\"extralit.apikey\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "for dataset in client.datasets.list():\n", + " print(dataset.name)\n", + " # dataset.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load extraction dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(id=UUID('3a7abf40-a6b7-4cf6-ac09-d89a8b33ac67') inserted_at=datetime.datetime(2024, 4, 4, 5, 23, 44, 562080) updated_at=datetime.datetime(2024, 11, 15, 0, 35, 14, 753190) name='2-Data-Extractions' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 12, 1, 5, 55, 8, 469548))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = client.datasets(name=\"2-Data-Extractions\", workspace=\"itn-recalibration\")\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Update field" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'Observation'}\n", + "\n", + "\n", + "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'ITNCondition'}\n", + "\n", + "\n", + "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'EntomologicalOutcome'}\n", + "\n", + "\n", + "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'ClinicalOutcome'}\n", + "\n", + "\n" + ] }, - "nbformat": 4, - "nbformat_minor": 4 + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find the record with the specific metadata\n", + "records = dataset.records(\n", + " query=ex.Query(filter=(\"metadata.reference\", \"==\", \"mosqueira2015pilot\"))\n", + ")\n", + "\n", + "# Update the record's extraction field\n", + "updated_records = []\n", + "for record in records:\n", + " print(record.metadata)\n", + " print(type(record.fields[\"extraction\"]))\n", + " record.fields[\"extraction\"] = json.loads(record.fields[\"extraction\"])\n", + " print(type(record.fields[\"extraction\"]))\n", + " updated_records.append(record)\n", + "\n", + "len(updated_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DatasetRecords: The provided batch size 256 was normalized. Using value 4.\n",
+       "
\n" + ], + "text/plain": [ + "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m4\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending records...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00, 4.41s/batch]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetRecords(Dataset(id=UUID('3a7abf40-a6b7-4cf6-ac09-d89a8b33ac67') inserted_at=datetime.datetime(2024, 4, 4, 5, 23, 44, 562080) updated_at=datetime.datetime(2024, 11, 15, 0, 35, 14, 753190) name='2-Data-Extractions' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 11, 21, 18, 7, 47, 105497)))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.records.log(updated_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'reference': 'mosqueira2015pilot',\n", + " 'schema': {'fields': [{'name': 'observation_ref',\n", + " 'type': 'any',\n", + " 'extDtype': 'string'},\n", + " {'name': 'itncondition_ref', 'type': 'any', 'extDtype': 'string'},\n", + " {'name': 'N_people', 'type': 'integer'},\n", + " {'name': 'Age_lower', 'type': 'number'},\n", + " {'name': 'Age_upper', 'type': 'number'}],\n", + " 'primaryKey': ['observation_ref', 'itncondition_ref'],\n", + " 'pandas_version': '1.4.0'},\n", + " 'data': [{'observation_ref': 'S01',\n", + " 'itncondition_ref': 'N01',\n", + " 'N_people': 3903,\n", + " 'Age_lower': 0.5,\n", + " 'Age_upper': 14.0},\n", + " {'observation_ref': 'S02',\n", + " 'itncondition_ref': 'N01',\n", + " 'N_people': 3903,\n", + " 'Age_lower': 0.5,\n", + " 'Age_upper': 14.0}],\n", + " 'validation': {'schema_type': 'dataframe',\n", + " 'version': '0.18.3',\n", + " 'columns': {'N_people': {'title': None,\n", + " 'description': 'Number of people in the study arm of the net in question',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': None,\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Age_lower': {'title': None,\n", + " 'description': 'Lower limit of age group in years. For children <1, enter age as a decimal.',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Age_upper': {'title': None,\n", + " 'description': 'Upper limit of age group in years. For children <1, enter age as a decimal.',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_pos': {'title': None,\n", + " 'description': 'Number of people tested to be parasite positive',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR': {'title': None,\n", + " 'description': 'Definition: (N_pos/N_people)*100',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR_rate_lower': {'title': None,\n", + " 'description': 'Lower bound of parasite positivity rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR_rate_upper': {'title': None,\n", + " 'description': 'Upper bound of parasite positivity rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM': {'title': None,\n", + " 'description': 'Number of people with clinical malaria',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': None,\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate': {'title': None,\n", + " 'description': 'Definition: (CM/N_people)*100',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate_lower': {'title': None,\n", + " 'description': 'Lower bound of clinical malaria rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate_upper': {'title': None,\n", + " 'description': 'Upper bound of clinical malaria rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Net_retention': {'title': None,\n", + " 'description': 'Number of nets still owned divided by a number of nets previously distributed',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_nets': {'title': None,\n", + " 'description': 'Number of nets found in household or community study arm',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_sleep_nets': {'title': None,\n", + " 'description': 'Number of people that slept under a net the previous night',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Perc_sleep_nets': {'title': None,\n", + " 'description': 'Percent of people that slept under a net the previous night',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False}},\n", + " 'checks': {'check_less_than': {'columns_a': ['Age_lower'],\n", + " 'columns_b': ['Age_upper'],\n", + " 'or_equal': True},\n", + " 'check_greater_than': {'columns_a': 'N_people',\n", + " 'columns_b': ['N_pos', 'CM', 'N_sleep_nets'],\n", + " 'or_equal': True},\n", + " 'check_between': {'columns_target': ['PR', 'CM_rate'],\n", + " 'columns_lower': ['PR_rate_lower', 'CM_rate_lower'],\n", + " 'columns_upper': ['PR_rate_upper', 'CM_rate_upper'],\n", + " 'or_equal': True}},\n", + " 'index': [{'title': 'Observation reference',\n", + " 'description': None,\n", + " 'dtype': 'str',\n", + " 'nullable': False,\n", + " 'checks': {'str_startswith': 'S'},\n", + " 'name': 'observation_ref',\n", + " 'unique': False,\n", + " 'coerce': False},\n", + " {'title': 'ITNCondition reference',\n", + " 'description': None,\n", + " 'dtype': 'str',\n", + " 'nullable': False,\n", + " 'checks': {'str_startswith': 'N'},\n", + " 'name': 'itncondition_ref',\n", + " 'unique': False,\n", + " 'coerce': False}],\n", + " 'dtype': None,\n", + " 'coerce': True,\n", + " 'strict': True,\n", + " 'name': 'ClinicalOutcome',\n", + " 'ordered': False,\n", + " 'unique': None,\n", + " 'report_duplicates': 'all',\n", + " 'unique_column_names': False,\n", + " 'add_missing_columns': False,\n", + " 'title': None,\n", + " 'description': '\\nEpidemiological and clinical outcomes on humans collected from a clinical trial or village trial, if reported in the study.\\nEach clinical outcome should have unique `observation_ref`, `itn_condition_ref`, `Group`, `Age_lower`, and `Age_upper` (if reported).\\n '}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.loads(record.fields[\"extraction\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom Field" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jonny/Projects/extralit/argilla/src/argilla/client.py:354: UserWarning: Dataset with name 'interactive_chat' not found in workspace 'itn-recalibration'\n", + " warnings.warn(f\"Dataset with name {name!r} not found in workspace {workspace.name!r}\")\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'delete'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mdatasets(\n\u001b[1;32m 2\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minteractive_chat\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# workspace=\"itn-recalibration\"\u001b[39;00m\n\u001b[1;32m 4\u001b[0m )\n\u001b[0;32m----> 5\u001b[0m \u001b[43mdataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m()\n", + "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'delete'" + ] + } + ], + "source": [ + "dataset = client.datasets(\n", + " name=\"interactive_chat\",\n", + " # workspace=\"itn-recalibration\"\n", + ")\n", + "dataset.delete()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(id=UUID('92b559e7-8eff-4d4c-85bf-817fd73570e4') inserted_at=datetime.datetime(2024, 12, 2, 21, 33, 33, 529345) updated_at=datetime.datetime(2024, 12, 2, 21, 33, 39, 111530) name='interactive_chat' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 12, 2, 21, 33, 39, 111530))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "settings = ex.Settings(\n", + " fields=[\n", + " ex.TableField(name=\"chosen\", required=False),\n", + " ],\n", + " questions=[\n", + " ex.TableQuestion(\"extraction\", title=\"Correct the table\", required=True),\n", + " ],\n", + ")\n", + "\n", + "dataset = ex.Dataset(\n", + " settings=settings,\n", + " name=\"interactive_chat\",\n", + ")\n", + "\n", + "dataset.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'reference': 'mosqueira2015pilot',\n", + " 'schema': {'fields': [{'name': 'observation_ref',\n", + " 'type': 'any',\n", + " 'extDtype': 'string'},\n", + " {'name': 'itncondition_ref', 'type': 'any', 'extDtype': 'string'},\n", + " {'name': 'N_people', 'type': 'integer'},\n", + " {'name': 'Age_lower', 'type': 'number'},\n", + " {'name': 'Age_upper', 'type': 'number'}],\n", + " 'primaryKey': ['observation_ref', 'itncondition_ref'],\n", + " 'pandas_version': '1.4.0'},\n", + " 'data': [{'observation_ref': 'S01',\n", + " 'itncondition_ref': 'N01',\n", + " 'N_people': 3903,\n", + " 'Age_lower': 0.5,\n", + " 'Age_upper': 14.0},\n", + " {'observation_ref': 'S02',\n", + " 'itncondition_ref': 'N01',\n", + " 'N_people': 3903,\n", + " 'Age_lower': 0.5,\n", + " 'Age_upper': 14.0}],\n", + " 'validation': {'schema_type': 'dataframe',\n", + " 'version': '0.18.3',\n", + " 'columns': {'N_people': {'title': None,\n", + " 'description': 'Number of people in the study arm of the net in question',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': None,\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Age_lower': {'title': None,\n", + " 'description': 'Lower limit of age group in years. For children <1, enter age as a decimal.',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Age_upper': {'title': None,\n", + " 'description': 'Upper limit of age group in years. For children <1, enter age as a decimal.',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_pos': {'title': None,\n", + " 'description': 'Number of people tested to be parasite positive',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR': {'title': None,\n", + " 'description': 'Definition: (N_pos/N_people)*100',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR_rate_lower': {'title': None,\n", + " 'description': 'Lower bound of parasite positivity rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR_rate_upper': {'title': None,\n", + " 'description': 'Upper bound of parasite positivity rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM': {'title': None,\n", + " 'description': 'Number of people with clinical malaria',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': None,\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate': {'title': None,\n", + " 'description': 'Definition: (CM/N_people)*100',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate_lower': {'title': None,\n", + " 'description': 'Lower bound of clinical malaria rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate_upper': {'title': None,\n", + " 'description': 'Upper bound of clinical malaria rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Net_retention': {'title': None,\n", + " 'description': 'Number of nets still owned divided by a number of nets previously distributed',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_nets': {'title': None,\n", + " 'description': 'Number of nets found in household or community study arm',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_sleep_nets': {'title': None,\n", + " 'description': 'Number of people that slept under a net the previous night',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Perc_sleep_nets': {'title': None,\n", + " 'description': 'Percent of people that slept under a net the previous night',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False}},\n", + " 'checks': {'check_less_than': {'columns_a': ['Age_lower'],\n", + " 'columns_b': ['Age_upper'],\n", + " 'or_equal': True},\n", + " 'check_greater_than': {'columns_a': 'N_people',\n", + " 'columns_b': ['N_pos', 'CM', 'N_sleep_nets'],\n", + " 'or_equal': True},\n", + " 'check_between': {'columns_target': ['PR', 'CM_rate'],\n", + " 'columns_lower': ['PR_rate_lower', 'CM_rate_lower'],\n", + " 'columns_upper': ['PR_rate_upper', 'CM_rate_upper'],\n", + " 'or_equal': True}},\n", + " 'index': [{'title': 'Observation reference',\n", + " 'description': None,\n", + " 'dtype': 'str',\n", + " 'nullable': False,\n", + " 'checks': {'str_startswith': 'S'},\n", + " 'name': 'observation_ref',\n", + " 'unique': False,\n", + " 'coerce': False},\n", + " {'title': 'ITNCondition reference',\n", + " 'description': None,\n", + " 'dtype': 'str',\n", + " 'nullable': False,\n", + " 'checks': {'str_startswith': 'N'},\n", + " 'name': 'itncondition_ref',\n", + " 'unique': False,\n", + " 'coerce': False}],\n", + " 'dtype': None,\n", + " 'coerce': True,\n", + " 'strict': True,\n", + " 'name': 'ClinicalOutcome',\n", + " 'ordered': False,\n", + " 'unique': None,\n", + " 'report_duplicates': 'all',\n", + " 'unique_column_names': False,\n", + " 'add_missing_columns': False,\n", + " 'title': None,\n", + " 'description': '\\nEpidemiological and clinical outcomes on humans collected from a clinical trial or village trial, if reported in the study.\\nEach clinical outcome should have unique `observation_ref`, `itn_condition_ref`, `Group`, `Age_lower`, and `Age_upper` (if reported).\\n '}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_table = record.fields[\"extraction\"]\n", + "sample_table" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DatasetRecords: The provided batch size 256 was normalized. Using value 4.\n",
+       "
\n" + ], + "text/plain": [ + "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m4\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending records...: 100%|███████| 1/1 [00:02<00:00, 2.54s/batch]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetRecords(Dataset(id=UUID('a64a827c-f962-417a-a771-ce53f61c0756') inserted_at=datetime.datetime(2024, 11, 29, 23, 9, 55, 104623) updated_at=datetime.datetime(2024, 11, 29, 23, 9, 58, 696913) name='interactive_chat' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 11, 29, 23, 9, 58, 696913)))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.records.log([{\"chosen\": sample_table} for r in updated_records])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", + "# ds = ds.map(lambda x: {\"messages\": x[\"chosen\"]})\n", + "# dataset.records.log(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } \ No newline at end of file diff --git a/examples/deployments/k8s/helm/postgres-helm.yaml b/examples/deployments/k8s/helm/postgres-helm.yaml index e628fe12f..af2a87cdd 100644 --- a/examples/deployments/k8s/helm/postgres-helm.yaml +++ b/examples/deployments/k8s/helm/postgres-helm.yaml @@ -21,7 +21,7 @@ postgresql: enabled: true storageClass: "local-path" size: 1Gi - nodeAffinity: + nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 1 preference: diff --git a/examples/deployments/k8s/helm/weaviate-helm.yaml b/examples/deployments/k8s/helm/weaviate-helm.yaml index 72eb18cbb..4ee0befec 100644 --- a/examples/deployments/k8s/helm/weaviate-helm.yaml +++ b/examples/deployments/k8s/helm/weaviate-helm.yaml @@ -25,15 +25,15 @@ args: - 'http' - '--config-file' - '/weaviate-config/conf.yaml' - - --read-timeout=60s + - --read-timeout=60s - --write-timeout=60s # below is an example that can be used to set an arbitrary nofile limit at # startup: # -# command: +# command: # - "/bin/sh" -# args: +# args: # - "-c" # - "ulimit -n 65535 && /bin/weaviate --host 0.0.0.0 --port 8080 --scheme http --config-file /weaviate-config/conf.yaml" @@ -53,7 +53,7 @@ initContainers: repo: alpine tag: latest pullPolicy: IfNotPresent - + extraInitContainers: {} # - image: some-image # name: some-name @@ -99,7 +99,7 @@ serviceAccountName: # Kubernetes Cluster domain name, used for resolving intra-cluster requests, i.e # between instances of weaviate. # Note: The final '.' on the end of the hostname makes it a FQDN, and is required for -# DNS to resolve in all kubernetes environments. +# DNS to resolve in all kubernetes environments. # See https://github.com/weaviate/weaviate-helm/issues/175 for details. clusterDomain: cluster.local. @@ -287,7 +287,7 @@ env: PROMETHEUS_MONITORING_ENABLED: false PROMETHEUS_MONITORING_GROUP: false - # Set a MEM limit for the Weaviate Pod so it can help you both increase GC-related + # Set a MEM limit for the Weaviate Pod so it can help you both increase GC-related # performance as well as avoid GC-related out-of-memory (“OOM”) situations # GOMEMLIMIT: 6GiB @@ -335,13 +335,13 @@ backups: envconfig: # Configure folder where backups should be saved BACKUP_FILESYSTEM_PATH: /tmp/backups - + s3: enabled: false # If one is using AWS EKS and has already configured K8s Service Account # that holds the AWS credentials one can pass a name of that service account # here using this setting. - # NOTE: the root `serviceAccountName` config has priority over this one, and + # NOTE: the root `serviceAccountName` config has priority over this one, and # if the root one is set this one will NOT overwrite it. This one is here for # backwards compatibility. serviceAccountName: @@ -350,17 +350,17 @@ backups: # Configure bucket where backups should be saved, this setting is mandatory BACKUP_S3_BUCKET: weaviate-backups - # Optional setting. Defaults to empty string. + # Optional setting. Defaults to empty string. # Set this option if you want to save backups to a given location # inside the bucket # BACKUP_S3_PATH: path/inside/bucket - # Optional setting. Defaults to AWS S3 (s3.amazonaws.com). + # Optional setting. Defaults to AWS S3 (s3.amazonaws.com). # Set this option if you have a MinIO storage configured in your environment # and want to use it instead of the AWS S3. # BACKUP_S3_ENDPOINT: custom.minio.endpoint.address - # Optional setting. Defaults to true. + # Optional setting. Defaults to true. # Set this option if you don't want to use SSL. # BACKUP_S3_USE_SSL: true @@ -373,7 +373,7 @@ backups: # You can pass the User credentials (access-key id and access-secret-key) in 2 ways: # 1. by setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY plain values in the `secrets` section below # this chart will create a kubernetes secret for you with these key-values pairs - # 2. create Kubernetes secret/s with AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY keys and their respective values + # 2. create Kubernetes secret/s with AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY keys and their respective values # Set the Key and the secret where it is set in `envSecrets` section below secrets: {} # AWS_ACCESS_KEY_ID: access-key-id (plain text) @@ -421,7 +421,7 @@ backups: # Configure container where backups should be saved, this setting is mandatory BACKUP_AZURE_CONTAINER: weaviate-backups - # Optional setting. Defaults to empty string. + # Optional setting. Defaults to empty string. # Set this option if you want to save backups to a given location # inside the container # BACKUP_AZURE_PATH: path/inside/container @@ -432,7 +432,7 @@ backups: # 1. by setting the AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY # or AZURE_STORAGE_CONNECTION_STRING plain values in the `secrets` section below # this chart will create a kubernetes secret for you with these key-values pairs - # 2. create Kubernetes secret/s with AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY + # 2. create Kubernetes secret/s with AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY # or AZURE_STORAGE_CONNECTION_STRING and their respective values # Set the Key and the secret where it is set in `envSecrets` section below secrets: {} @@ -580,7 +580,7 @@ modules: # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ # applies to passageQueryService below securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -650,13 +650,13 @@ modules: # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ # applies to passageQueryService below securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. # NOTE: if not set the root `serviceAccountName` config will be used. serviceAccountName: - + # You can guide where the pods are scheduled on a per-module basis, # as well as for Weaviate overall. Each module accepts nodeSelector, # tolerations, and affinity configuration. If it is set on a per- @@ -721,7 +721,7 @@ modules: # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ # applies to passageQueryService below securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -742,7 +742,7 @@ modules: # These models run only on CPU only and on x86_64 arch # The ML model is containerized in a Weaviate compatible way. # If you want to run a different model that published ones you can follow the - # tutorial from here on how to create such a container: https://github.com/weaviate/t2v-gpt4all-models + # tutorial from here on how to create such a container: https://github.com/weaviate/t2v-gpt4all-models text2vec-gpt4all: # Enable deployment of this module @@ -751,7 +751,7 @@ modules: # You can set directly an inference URL of this module without deploying it with this release. # You can do so by setting a value for the `inferenceUrl` here AND by setting the `enable` to `false` inferenceUrl: {} - + # The configuration below is ignored if enabled==false tag: all-MiniLM-L6-v2 repo: semitechnologies/gpt4all-inference @@ -783,7 +783,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1133,7 +1133,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1309,7 +1309,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1326,7 +1326,7 @@ modules: affinity: # The reranker-cohere module uses Cohere API - # to dynamically compute a score for the relevance + # to dynamically compute a score for the relevance # of the query with each of the initial search results. # More information about Cohere API can be found here: https://docs.cohere.com/docs/rerank-guide reranker-cohere: @@ -1339,7 +1339,7 @@ modules: apiKey: '' # The reranker-voyageai module uses VoaygeAI API - # to dynamically compute a score for the relevance + # to dynamically compute a score for the relevance # of the query with each of the initial search results. # More information about Cohere API can be found here: https://www.voyageai.com/ reranker-voyageai: @@ -1351,7 +1351,7 @@ modules: # an environment variable apiKey: '' - # The reranker-transformers module uses Cross-Encoders for + # The reranker-transformers module uses Cross-Encoders for # sentence pair scoring and sentence pair classification tasks. # More information about Cross-Encoders can be found here: # https://www.sbert.net/examples/applications/cross-encoder/README.html @@ -1411,7 +1411,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1468,7 +1468,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1542,7 +1542,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1615,7 +1615,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. diff --git a/examples/deployments/k8s/k3d/k3d-config.yaml b/examples/deployments/k8s/k3d/k3d-config.yaml index b6523ea4b..e64ed2226 100644 --- a/examples/deployments/k8s/k3d/k3d-config.yaml +++ b/examples/deployments/k8s/k3d/k3d-config.yaml @@ -13,4 +13,3 @@ k3d: kubeAPI: host: "0.0.0.0" hostPort: "6443" - \ No newline at end of file diff --git a/examples/deployments/k8s/kind/tilt-local-dev-storage-policy.yaml b/examples/deployments/k8s/kind/tilt-local-dev-storage-policy.yaml index b75c774d9..b55ab0269 100644 --- a/examples/deployments/k8s/kind/tilt-local-dev-storage-policy.yaml +++ b/examples/deployments/k8s/kind/tilt-local-dev-storage-policy.yaml @@ -21,7 +21,7 @@ spec: storageClassName: local-path persistentVolumeReclaimPolicy: Retain local: - path: "/usr/share/elasticsearch/data" + path: "/usr/share/elasticsearch/data" nodeAffinity: required: nodeSelectorTerms: @@ -47,12 +47,12 @@ spec: persistentVolumeReclaimPolicy: Retain local: path: "/var/lib/postgresql/data" - nodeAffinity: + nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname - operator: In + operator: In values: - kind-control-plane --- diff --git a/examples/deployments/k8s/minio-dev.yaml b/examples/deployments/k8s/minio-dev.yaml index a4c0ef744..19db3a796 100644 --- a/examples/deployments/k8s/minio-dev.yaml +++ b/examples/deployments/k8s/minio-dev.yaml @@ -4,7 +4,7 @@ # The `spec.containers[0].args` contains the command run on the pod # The `/data` directory corresponds to the `spec.containers[0].volumeMounts[0].mountPath` # That mount path corresponds to a Kubernetes HostPath which binds `/data` to a local drive or volume on the worker node where the pod runs -# +# apiVersion: v1 kind: Pod metadata: @@ -33,7 +33,7 @@ spec: command: - /bin/bash - -c - args: + args: - minio server /data --console-address :9090 env: - name: MINIO_ACCESS_KEY_FILE diff --git a/examples/document_extraction/setup_workspace.ipynb b/examples/document_extraction/setup_workspace.ipynb index 0d630cf95..46859279a 100644 --- a/examples/document_extraction/setup_workspace.ipynb +++ b/examples/document_extraction/setup_workspace.ipynb @@ -61,7 +61,7 @@ "from pathlib import Path\n", "\n", "# Connect to Extralit using default credentials\n", - "client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key='extralit.apikey')\n", + "client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key=\"extralit.apikey\")\n", "\n", "print(f\"Successfully connected to Extralit at {client.api_url}\")" ] @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "64b5f09b", "metadata": {}, "outputs": [ @@ -99,8 +99,8 @@ " # Create the workspace\n", " workspace = new_workspace.create()\n", "\n", - " print(f\"Workspace '{workspace_name}' created successfully with ID: {created_workspace.id}\")\n", - "except Exception as e:\n", + " print(f\"Workspace '{workspace_name}' created successfully with ID: {workspace.id}\")\n", + "except Exception:\n", " print(f\"Workspace '{workspace_name}' already exists. Using the existing workspace.\")\n", " workspace = client.workspaces(workspace_name)" ] @@ -117,29 +117,47 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "d4df67d6", "metadata": {}, "outputs": [], "source": [ "# List all workspaces\n", - "dataset = client.datasets(\"imdb\")\n", + "dataset = client.datasets(\"papers-ocr-benchmarks_dataset\")\n", "dataset.settings.to_json(\"./datasets/imdb/settings.json\")" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "77a5690b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Settings(guidelines=None, allow_extra_metadata=False, distribution=OverlapTaskDistribution(min_submitted=1), fields=[], questions=[], vectors=[], metadata=[])" + "Settings(guidelines=None, allow_extra_metadata=True, distribution=OverlapTaskDistribution(min_submitted=1), fields=[TextField(name=reference, title=reference, description=None, type=text, required=False) \n", + ", TextField(name=type, title=type, description=None, type=text, required=False) \n", + ", TextField(name=title, title=title, description=None, type=text, required=False) \n", + ", TextField(name=issn, title=issn, description=None, type=text, required=False) \n", + ", TextField(name=url, title=url, description=None, type=text, required=False) \n", + ", TextField(name=doi, title=doi, description=None, type=text, required=False) \n", + ", TextField(name=language, title=language, description=None, type=text, required=False) \n", + ", TextField(name=urldate, title=urldate, description=None, type=text, required=False) \n", + ", TextField(name=journal, title=journal, description=None, type=text, required=False) \n", + ", TextField(name=authors, title=authors, description=None, type=text, required=False) \n", + ", TextField(name=month, title=month, description=None, type=text, required=False) \n", + ", TextField(name=note, title=note, description=None, type=text, required=False) \n", + ", TextField(name=pages, title=pages, description=None, type=text, required=False) \n", + ", TextField(name=filePaths, title=filePaths, description=None, type=text, required=False) \n", + ", TextField(name=abstract, title=abstract, description=None, type=text, required=False) \n", + ", TextField(name=copyright, title=copyright, description=None, type=text, required=False) \n", + ", TextField(name=editor, title=editor, description=None, type=text, required=False) \n", + "], questions=[RatingQuestion(name=rating_0, title=rating_0, description=None, type=rating, required=True) \n", + "], vectors=[], metadata=[TermsMetadataProperty(name=reference, title=reference, visible_for_annotators=True), TermsMetadataProperty(name=doi, title=doi, visible_for_annotators=True)])" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -276,18 +294,20 @@ "# Create empty PDF files - in reality, these would be your actual PDFs\n", "with open(pdf_file1, \"wb\") as f:\n", " f.write(b\"%PDF-1.5\\n%Example Document 1\")\n", - " \n", + "\n", "with open(pdf_file2, \"wb\") as f:\n", " f.write(b\"%PDF-1.5\\n%Example Document 2\")\n", "\n", "# Create a reference dataframe with metadata for the PDFs\n", - "references_df = pd.DataFrame({\n", - " \"reference\": [\"smith2023first\", \"johnson2022analysis\"],\n", - " \"file_path\": [str(pdf_file1), str(pdf_file2)],\n", - " \"title\": [\"Study on Sample Data\", \"Analysis of Experimental Results\"],\n", - " \"authors\": [\"Smith, J.\", \"Johnson, A.\"],\n", - " \"year\": [2023, 2022]\n", - "})\n", + "references_df = pd.DataFrame(\n", + " {\n", + " \"reference\": [\"smith2023first\", \"johnson2022analysis\"],\n", + " \"file_path\": [str(pdf_file1), str(pdf_file2)],\n", + " \"title\": [\"Study on Sample Data\", \"Analysis of Experimental Results\"],\n", + " \"authors\": [\"Smith, J.\", \"Johnson, A.\"],\n", + " \"year\": [2023, 2022],\n", + " }\n", + ")\n", "\n", "# Save the dataframe to a temporary CSV file\n", "references_csv = Path(temp_dir) / \"references.csv\"\n", @@ -328,13 +348,15 @@ "# Import the documents into the workspace\n", "# For demonstration purposes, we'll use the extralit client directly\n", "# Initialize the extralit client with the same credentials\n", - "extralit_client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key='extralit.apikey')\n", + "extralit_client = ex.Extralit(\n", + " api_url=\"http://localhost:6900/\", api_key=\"extralit.apikey\"\n", + ")\n", "\n", "# Import the documents\n", "result = extralit_client.import_documents(\n", " workspace=workspace_name,\n", " papers=str(references_csv),\n", - " metadatas=[\"title\", \"authors\", \"year\"]\n", + " metadatas=[\"title\", \"authors\", \"year\"],\n", ")\n", "\n", "print(f\"Imported {len(result)} documents into workspace '{workspace_name}'\")" @@ -357,37 +379,45 @@ "metadata": {}, "outputs": [], "source": [ + "from extralit.extraction.models.schema import SchemaStructure\n", + "\n", + "\n", "# Define a simple schema using Pandera\n", "class Publication(pa.DataFrameModel):\n", " \"\"\"\n", " General information about the publication, extracted once per paper.\n", " \"\"\"\n", + "\n", " reference: Index[str] = pa.Field(unique=True, check_name=True)\n", " title: Series[str] = pa.Field()\n", " authors: Series[str] = pa.Field()\n", " publication_year: Series[int] = pa.Field(ge=1900, le=2100)\n", " doi: Series[str] = pa.Field(nullable=True)\n", - " \n", + "\n", " class Config:\n", - " singleton = {'enabled': True} # Indicates this is a document-level schema\n", + " singleton = {\"enabled\": True} # Indicates this is a document-level schema\n", + "\n", "\n", "# Define a second schema for experimental data\n", "class ExperimentalData(pa.DataFrameModel):\n", " \"\"\"\n", " Experimental data extracted from the paper, may appear multiple times.\n", " \"\"\"\n", + "\n", " experiment_id: Series[str] = pa.Field()\n", " sample_size: Series[int] = pa.Field(gt=0)\n", " study_type: Series[str] = pa.Field()\n", " result_value: Series[float] = pa.Field()\n", " significance: Series[float] = pa.Field(le=1.0, ge=0.0)\n", "\n", + "\n", "# Create a schema structure object\n", - "from extralit.extraction.models.schema import SchemaStructure\n", "\n", "# Save schemas to a temporary JSON file\n", "schema_file = Path(temp_dir) / \"schemas.json\"\n", - "schema_structure = SchemaStructure(schemas={\"Publication\": Publication, \"ExperimentalData\": ExperimentalData})\n", + "schema_structure = SchemaStructure(\n", + " schemas={\"Publication\": Publication, \"ExperimentalData\": ExperimentalData}\n", + ")\n", "schema_structure.to_json(schema_file)\n", "\n", "print(f\"Created schema file at {schema_file}\")" @@ -402,8 +432,7 @@ "source": [ "# Upload the schema to the workspace\n", "result = extralit_client.upload_schemas(\n", - " workspace=workspace_name,\n", - " schemas=str(schema_file)\n", + " workspace=workspace_name, schemas=str(schema_file)\n", ")\n", "\n", "print(f\"Uploaded schemas to workspace '{workspace_name}'\")" @@ -438,7 +467,7 @@ " references=references,\n", " text_ocr=[\"default\"], # Using the default text OCR model\n", " table_ocr=[\"default\"], # Using the default table OCR model\n", - " output_dataset=\"PDF_Preprocessing_Results\"\n", + " output_dataset=\"PDF_Preprocessing_Results\",\n", ")\n", "\n", "print(f\"Preprocessing completed for {len(preprocessing_result)} documents\")" @@ -468,7 +497,7 @@ "extraction_result = extract_data(\n", " workspace=workspace_name,\n", " references=references,\n", - " output_dataset=\"Data_Extraction_Results\"\n", + " output_dataset=\"Data_Extraction_Results\",\n", ")\n", "\n", "print(f\"LLM extractions completed for {len(extraction_result)} documents\")" @@ -508,7 +537,7 @@ "# Export the extracted data\n", "extracted_data = extralit_client.export_data(\n", " workspace=workspace_name,\n", - " output=\"temp_output.csv\" # This will save the data to a CSV file\n", + " output=\"temp_output.csv\", # This will save the data to a CSV file\n", ")\n", "\n", "# Display the extracted data\n", diff --git a/extralit-frontend/components/base/base-render-table/RenderTable.vue b/extralit-frontend/components/base/base-render-table/RenderTable.vue index 916245089..fbfaf645b 100644 --- a/extralit-frontend/components/base/base-render-table/RenderTable.vue +++ b/extralit-frontend/components/base/base-render-table/RenderTable.vue @@ -1,6 +1,6 @@