From 2098b19bb5197b09c8c220185272a5bcb1c5cbfa Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Thu, 14 Aug 2025 13:13:29 -0700 Subject: [PATCH 1/8] ruff autofix --- .devcontainer/noop.txt | 2 +- .github/pull_request_template.md | 12 +- ...-frontend.teardown-all-pr-environments.yml | 2 +- .pre-commit-config.yaml | 34 +- CONTRIBUTING.md | 2 +- examples/custom_field/custom_field.ipynb | 562 +++--- examples/custom_field/table_field.ipynb | 1538 ++++++++--------- .../deployments/k8s/helm/postgres-helm.yaml | 2 +- .../deployments/k8s/helm/weaviate-helm.yaml | 60 +- examples/deployments/k8s/k3d/k3d-config.yaml | 1 - .../kind/tilt-local-dev-storage-policy.yaml | 6 +- examples/deployments/k8s/minio-dev.yaml | 4 +- .../document_extraction/setup_workspace.ipynb | 55 +- .../base/base-render-table/RenderTable.vue | 62 +- .../base-render-table/renderTable.test.ts | 10 +- .../base/base-render-table/tableUtils.ts | 12 +- .../useLLMExtractionViewModel.ts | 20 +- .../useSchemaTableViewModel.ts | 10 +- .../base/base-render-table/validatorUtils.ts | 20 +- .../container/questions/QuestionsForm.vue | 12 +- .../get-extraction-completion-use-case.ts | 18 +- extralit-server/pyproject.toml | 38 +- .../src/extralit_server/__init__.py | 23 +- .../src/extralit_server/__main__.py | 21 +- extralit-server/src/extralit_server/_app.py | 27 +- .../src/extralit_server/alembic/env.py | 3 +- ...9ee58fbb4_create_workspaces_users_table.py | 20 +- ...913727_fix_suggestions_type_enum_values.py | 20 +- ...4d74_add_status_column_to_records_table.py | 23 +- .../3a8e2f9b5dea_create_questions_table.py | 20 +- .../3fc3c0839959_create_suggestions_table.py | 20 +- ...37_add_metadata_column_to_records_table.py | 20 +- ...d_distribution_column_to_datasets_table.py | 20 +- .../580a6553186f_add_datasets_users_table.py | 20 +- ...0_add_metadata_column_to_datasets_table.py | 23 +- .../6ed1b8bf8e08_create_webhooks_table.py | 23 +- .../74694870197c_create_users_table.py | 20 +- ...dded_document_model_and_updated_record_.py | 3 +- ...0ab5b42d9_create_vectors_settings_table.py | 20 +- ...f8b57a_create_metadata_properties_table.py | 20 +- ...d6b33203390_create_import_history_table.py | 3 +- .../82a5a88a3fa5_create_workspaces_table.py | 20 +- ..._add_last_activity_at_to_datasets_table.py | 20 +- .../8be56284dac0_create_records_table.py | 20 +- .../ae5522b4c674_create_fields_table.py | 20 +- ...extra_metadata_column_to_datasets_table.py | 20 +- .../b9099dc08489_create_datasets_table.py | 20 +- .../bda6fe24314e_create_vectors_table.py | 20 +- ...change_suggestions_score_column_to_json.py | 20 +- ...67_update_responses_user_id_foreign_key.py | 23 +- .../e402e9d9245e_create_responses_table.py | 20 +- .../src/extralit_server/api/__init__.py | 20 +- .../extralit_server/api/errors/__init__.py | 20 +- .../extralit_server/api/errors/v1/__init__.py | 20 +- .../api/errors/v1/exception_handlers.py | 28 +- .../extralit_server/api/handlers/__init__.py | 20 +- .../api/handlers/v1/__init__.py | 20 +- .../api/handlers/v1/authentication.py | 22 +- .../api/handlers/v1/datasets/__init__.py | 20 +- .../api/handlers/v1/datasets/datasets.py | 106 +- .../api/handlers/v1/datasets/questions.py | 31 +- .../api/handlers/v1/datasets/records.py | 96 +- .../api/handlers/v1/datasets/records_bulk.py | 33 +- .../api/handlers/v1/documents.py | 66 +- .../extralit_server/api/handlers/v1/fields.py | 29 +- .../extralit_server/api/handlers/v1/files.py | 27 +- .../api/handlers/v1/imports.py | 60 +- .../extralit_server/api/handlers/v1/info.py | 4 +- .../extralit_server/api/handlers/v1/jobs.py | 35 +- .../api/handlers/v1/metadata_properties.py | 35 +- .../extralit_server/api/handlers/v1/models.py | 2 +- .../extralit_server/api/handlers/v1/oauth2.py | 14 +- .../api/handlers/v1/questions.py | 29 +- .../api/handlers/v1/records.py | 43 +- .../api/handlers/v1/responses.py | 33 +- .../api/handlers/v1/settings.py | 20 +- .../api/handlers/v1/suggestions.py | 27 +- .../extralit_server/api/handlers/v1/users.py | 49 +- .../api/handlers/v1/vectors_settings.py | 29 +- .../api/handlers/v1/webhooks.py | 63 +- .../api/handlers/v1/workspaces.py | 46 +- .../extralit_server/api/policies/__init__.py | 20 +- .../api/policies/v1/__init__.py | 36 +- .../api/policies/v1/commons.py | 22 +- .../api/policies/v1/dataset_policy.py | 23 +- .../api/policies/v1/field_policy.py | 20 +- .../api/policies/v1/file_policy.py | 30 +- .../api/policies/v1/job_policy.py | 20 +- .../policies/v1/metadata_property_policy.py | 20 +- .../api/policies/v1/question_policy.py | 20 +- .../api/policies/v1/record_policy.py | 20 +- .../api/policies/v1/response_policy.py | 20 +- .../api/policies/v1/suggestion_policy.py | 20 +- .../api/policies/v1/user_policy.py | 20 +- .../api/policies/v1/vector_settings_policy.py | 20 +- .../api/policies/v1/webhook_policy.py | 21 +- .../api/policies/v1/workspace_policy.py | 20 +- .../api/policies/v1/workspace_user_policy.py | 20 +- .../src/extralit_server/api/routes.py | 28 +- .../extralit_server/api/schemas/__init__.py | 20 +- .../src/extralit_server/api/schemas/base.py | 26 +- .../api/schemas/v1/__init__.py | 20 +- .../extralit_server/api/schemas/v1/chat.py | 1 + .../extralit_server/api/schemas/v1/commons.py | 26 +- .../api/schemas/v1/datasets.py | 46 +- .../api/schemas/v1/document/preprocessing.py | 13 +- .../api/schemas/v1/document/segments.py | 38 +- .../api/schemas/v1/documents.py | 42 +- .../extralit_server/api/schemas/v1/fields.py | 66 +- .../extralit_server/api/schemas/v1/files.py | 35 +- .../extralit_server/api/schemas/v1/imports.py | 41 +- .../extralit_server/api/schemas/v1/info.py | 20 +- .../extralit_server/api/schemas/v1/jobs.py | 22 +- .../api/schemas/v1/metadata_properties.py | 67 +- .../extralit_server/api/schemas/v1/oauth2.py | 23 +- .../api/schemas/v1/questions.py | 127 +- .../extralit_server/api/schemas/v1/records.py | 105 +- .../api/schemas/v1/records_bulk.py | 31 +- .../api/schemas/v1/responses.py | 72 +- .../api/schemas/v1/settings.py | 17 +- .../api/schemas/v1/suggestions.py | 60 +- .../extralit_server/api/schemas/v1/users.py | 24 +- .../api/schemas/v1/vector_settings.py | 31 +- .../extralit_server/api/schemas/v1/vectors.py | 23 +- .../api/schemas/v1/webhooks.py | 52 +- .../api/schemas/v1/workspaces.py | 28 +- .../src/extralit_server/bulk/__init__.py | 20 +- .../src/extralit_server/bulk/records_bulk.py | 39 +- .../extralit_server/cli/database/__init__.py | 20 +- .../extralit_server/cli/database/migrate.py | 24 +- .../extralit_server/cli/database/revisions.py | 20 +- .../cli/database/users/__init__.py | 20 +- .../cli/database/users/create.py | 20 +- .../cli/database/users/create_default.py | 3 +- .../cli/database/users/migrate.py | 12 +- .../cli/database/users/update.py | 20 +- .../cli/database/users/utils.py | 20 +- .../src/extralit_server/cli/database/utils.py | 20 +- .../cli/search_engine/__init__.py | 20 +- .../cli/search_engine/__main__.py | 2 +- .../cli/search_engine/reindex.py | 35 +- .../src/extralit_server/cli/worker.py | 26 +- .../src/extralit_server/contexts/__init__.py | 20 +- .../src/extralit_server/contexts/accounts.py | 32 +- .../src/extralit_server/contexts/datasets.py | 55 +- .../extralit_server/contexts/distribution.py | 36 +- .../contexts/document/analysis.py | 16 +- .../contexts/document/margin.py | 41 +- .../contexts/document/preprocessing.py | 9 +- .../src/extralit_server/contexts/files.py | 94 +- .../src/extralit_server/contexts/hub.py | 57 +- .../src/extralit_server/contexts/imports.py | 69 +- .../src/extralit_server/contexts/questions.py | 20 +- .../src/extralit_server/contexts/records.py | 30 +- .../src/extralit_server/contexts/search.py | 31 +- .../src/extralit_server/contexts/settings.py | 5 +- .../src/extralit_server/contexts/webhooks.py | 24 +- .../src/extralit_server/database.py | 13 +- extralit-server/src/extralit_server/enums.py | 20 +- .../src/extralit_server/errors/__init__.py | 22 +- .../src/extralit_server/errors/base_errors.py | 14 +- .../extralit_server/errors/error_handler.py | 4 +- .../extralit_server/errors/future/__init__.py | 22 +- .../errors/future/base_errors.py | 4 +- .../src/extralit_server/helpers.py | 21 +- .../extralit_server/integrations/__init__.py | 20 +- .../integrations/huggingface/__init__.py | 20 +- .../integrations/huggingface/spaces.py | 33 +- .../src/extralit_server/jobs/__init__.py | 3 +- .../src/extralit_server/jobs/dataset_jobs.py | 25 +- .../src/extralit_server/jobs/document_jobs.py | 25 +- .../src/extralit_server/jobs/hub_jobs.py | 28 +- .../src/extralit_server/jobs/import_jobs.py | 24 +- .../src/extralit_server/jobs/queues.py | 21 +- .../src/extralit_server/jobs/webhook_jobs.py | 35 +- .../src/extralit_server/logging.py | 5 +- .../src/extralit_server/models/__init__.py | 24 +- .../src/extralit_server/models/base.py | 25 +- .../src/extralit_server/models/database.py | 101 +- .../models/metadata_properties.py | 43 +- .../src/extralit_server/models/mixins.py | 60 +- .../src/extralit_server/models/suggestions.py | 20 +- .../extralit_server/search_engine/__init__.py | 24 +- .../src/extralit_server/search_engine/base.py | 112 +- .../extralit_server/search_engine/commons.py | 107 +- .../search_engine/elasticsearch.py | 56 +- .../search_engine/opensearch.py | 56 +- .../src/extralit_server/security/__init__.py | 23 +- .../security/authentication/__init__.py | 25 +- .../security/authentication/db/__init__.py | 22 +- .../authentication/db/api_key_backend.py | 23 +- .../authentication/db/bearer_token_backend.py | 23 +- .../security/authentication/jwt.py | 20 +- .../authentication/oauth2/__init__.py | 26 +- .../authentication/oauth2/_backends.py | 16 +- .../authentication/oauth2/auth_backend.py | 25 +- .../authentication/oauth2/provider.py | 44 +- .../authentication/oauth2/settings.py | 27 +- .../security/authentication/provider.py | 44 +- .../security/authentication/userinfo.py | 28 +- .../src/extralit_server/security/settings.py | 1 - .../src/extralit_server/settings.py | 35 +- .../src/extralit_server/static_rewrite.py | 24 +- .../src/extralit_server/telemetry/__init__.py | 24 +- .../src/extralit_server/telemetry/_client.py | 9 +- .../src/extralit_server/telemetry/_helpers.py | 2 +- .../src/extralit_server/use_cases/__init__.py | 20 +- .../use_cases/responses/__init__.py | 20 +- .../responses/upsert_responses_in_bulk.py | 25 +- extralit-server/src/extralit_server/utils.py | 42 +- .../src/extralit_server/utils/__init__.py | 20 +- .../src/extralit_server/utils/_fastapi.py | 27 +- .../src/extralit_server/utils/params.py | 42 +- .../src/extralit_server/utils/str_enum.py | 20 +- .../extralit_server/validators/__init__.py | 20 +- .../extralit_server/validators/datasets.py | 22 +- .../extralit_server/validators/questions.py | 25 +- .../src/extralit_server/validators/records.py | 42 +- .../validators/response_values.py | 49 +- .../extralit_server/validators/responses.py | 13 +- .../extralit_server/validators/suggestions.py | 20 +- .../src/extralit_server/validators/users.py | 20 +- .../src/extralit_server/validators/vectors.py | 23 +- .../extralit_server/validators/webhooks.py | 24 +- .../src/extralit_server/webhooks/__init__.py | 20 +- .../extralit_server/webhooks/v1/__init__.py | 20 +- .../extralit_server/webhooks/v1/commons.py | 33 +- .../extralit_server/webhooks/v1/datasets.py | 27 +- .../src/extralit_server/webhooks/v1/enums.py | 20 +- .../src/extralit_server/webhooks/v1/event.py | 23 +- .../src/extralit_server/webhooks/v1/ping.py | 4 +- .../extralit_server/webhooks/v1/records.py | 29 +- .../extralit_server/webhooks/v1/responses.py | 29 +- .../extralit_server/webhooks/v1/schemas.py | 45 +- extralit-server/tests/__init__.py | 21 +- extralit-server/tests/conftest.py | 28 +- extralit-server/tests/database.py | 23 +- extralit-server/tests/factories.py | 19 +- extralit-server/tests/unit/__init__.py | 21 +- extralit-server/tests/unit/api/__init__.py | 20 +- .../tests/unit/api/handlers/v1/__init__.py | 20 +- .../handlers/v1/authentication/__init__.py | 20 +- .../v1/authentication/test_create_token.py | 22 +- .../unit/api/handlers/v1/datasets/__init__.py | 20 +- .../fields/test_create_dataset_field.py | 31 +- .../fields/test_list_dataset_fields.py | 27 +- .../v1/datasets/questions/__init__.py | 20 +- .../questions/test_create_dataset_question.py | 24 +- .../questions/test_list_dataset_questions.py | 22 +- .../handlers/v1/datasets/records/__init__.py | 20 +- .../datasets/records/records_bulk/__init__.py | 20 +- .../test_create_dataset_records_bulk.py | 28 +- .../records_bulk/test_dataset_records_bulk.py | 29 +- ...est_dataset_records_bulk_with_responses.py | 26 +- ...t_dataset_records_bulk_with_suggestions.py | 28 +- .../test_dataset_records_bulk_with_vectors.py | 24 +- .../test_update_dataset_records_in_bulk.py | 24 +- .../test_upsert_dataset_records_bulk.py | 16 +- .../records/test_delete_dataset_records.py | 27 +- .../v1/datasets/test_create_dataset.py | 37 +- .../v1/datasets/test_create_dataset_field.py | 20 +- ...test_create_dataset_metadata_properties.py | 31 +- .../datasets/test_create_dataset_question.py | 27 +- .../test_create_dataset_vector_settings.py | 22 +- .../v1/datasets/test_delete_dataset.py | 29 +- .../v1/datasets/test_export_dataset_to_hub.py | 13 +- .../v1/datasets/test_get_dataset_progress.py | 7 +- .../test_get_dataset_users_progress.py | 4 +- .../test_list_current_user_datasets.py | 31 +- ...aset_records_search_suggestions_options.py | 22 +- .../v1/datasets/test_publish_dataset.py | 31 +- .../handlers/v1/datasets/test_questions.py | 8 +- ...est_search_current_user_dataset_records.py | 26 +- .../datasets/test_search_dataset_records.py | 26 +- .../v1/datasets/test_update_dataset.py | 34 +- .../handlers/v1/fields/test_update_field.py | 30 +- .../unit/api/handlers/v1/info/__init__.py | 20 +- .../api/handlers/v1/info/test_get_status.py | 23 +- .../api/handlers/v1/info/test_get_version.py | 23 +- .../api/handlers/v1/questions/__init__.py | 20 +- .../v1/questions/test_update_question.py | 22 +- .../unit/api/handlers/v1/records/__init__.py | 20 +- .../v1/records/test_create_record_response.py | 35 +- .../handlers/v1/records/test_delete_record.py | 27 +- .../handlers/v1/records/test_update_record.py | 7 +- .../v1/records/test_upsert_suggestion.py | 27 +- .../api/handlers/v1/responses/__init__.py | 20 +- ...test_create_current_user_responses_bulk.py | 18 +- .../v1/responses/test_delete_response.py | 34 +- .../v1/responses/test_update_response.py | 35 +- .../unit/api/handlers/v1/settings/__init__.py | 20 +- .../handlers/v1/settings/test_get_settings.py | 6 +- .../api/handlers/v1/test_bulk_documents.py | 21 +- .../unit/api/handlers/v1/test_datasets.py | 50 +- .../unit/api/handlers/v1/test_documents.py | 59 +- .../tests/unit/api/handlers/v1/test_fields.py | 4 +- .../tests/unit/api/handlers/v1/test_files.py | 10 +- .../unit/api/handlers/v1/test_imports.py | 52 +- .../handlers/v1/test_list_dataset_records.py | 37 +- .../handlers/v1/test_metadata_properties.py | 52 +- .../tests/unit/api/handlers/v1/test_models.py | 13 +- .../unit/api/handlers/v1/test_questions.py | 14 +- .../unit/api/handlers/v1/test_records.py | 39 +- .../unit/api/handlers/v1/test_responses.py | 34 +- .../unit/api/handlers/v1/test_suggestions.py | 24 +- .../tests/unit/api/handlers/v1/test_users.py | 22 +- .../api/handlers/v1/test_vectors_settings.py | 22 +- .../unit/api/handlers/v1/test_workspaces.py | 4 +- .../unit/api/handlers/v1/users/__init__.py | 20 +- .../api/handlers/v1/users/test_create_user.py | 30 +- .../api/handlers/v1/users/test_delete_user.py | 26 +- .../v1/users/test_get_current_user.py | 23 +- .../api/handlers/v1/users/test_get_user.py | 24 +- .../api/handlers/v1/users/test_list_users.py | 24 +- .../api/handlers/v1/users/test_update_user.py | 11 +- .../unit/api/handlers/v1/webhooks/__init__.py | 20 +- .../v1/webhooks/test_create_webhook.py | 34 +- .../v1/webhooks/test_delete_webhook.py | 29 +- .../v1/webhooks/test_list_webhooks.py | 24 +- .../handlers/v1/webhooks/test_ping_webhook.py | 9 +- .../v1/webhooks/test_update_webhook.py | 39 +- .../api/handlers/v1/workspaces/__init__.py | 20 +- .../v1/workspaces/test_create_workspace.py | 17 +- .../workspaces/test_create_workspace_user.py | 26 +- .../workspaces/test_delete_workspace_user.py | 26 +- .../workspaces/test_list_workspace_users.py | 24 +- .../tests/unit/api/schemas/__init__.py | 20 +- .../schemas/v1/records/test_record_create.py | 24 +- .../schemas/v1/records/test_record_upsert.py | 20 +- .../responses/test_response_value_create.py | 1 + .../tests/unit/api/schemas/v1/test_commons.py | 26 +- extralit-server/tests/unit/api/test_docs.py | 21 +- .../tests/unit/api/test_not_found_routes.py | 20 +- .../tests/unit/api/v0/test_workspaces.py | 14 + extralit-server/tests/unit/cli/__init__.py | 20 +- extralit-server/tests/unit/cli/conftest.py | 23 +- .../tests/unit/cli/database/__init__.py | 20 +- .../tests/unit/cli/database/users/__init__.py | 20 +- .../unit/cli/database/users/test_create.py | 20 +- .../cli/database/users/test_create_default.py | 22 +- .../unit/cli/database/users/test_migrate.py | 5 +- .../unit/cli/database/users/test_update.py | 22 +- .../tests/unit/cli/search_engine/__init__.py | 20 +- .../unit/cli/search_engine/test_reindex.py | 20 +- .../tests/unit/commons/__init__.py | 20 +- extralit-server/tests/unit/conftest.py | 14 +- .../tests/unit/contexts/__init__.py | 20 +- .../unit/contexts/hub/test_hub_dataset.py | 20 +- .../contexts/hub/test_hub_dataset_exporter.py | 25 +- .../tests/unit/contexts/search/__init__.py | 20 +- .../test_search_records_query_validator.py | 26 +- .../tests/unit/contexts/test_imports.py | 17 +- .../tests/unit/daos/test_datasets.py | 14 + .../models/test_dataset_user_model.py | 22 +- .../unit/database/models/test_field_model.py | 46 +- extralit-server/tests/unit/errors/__init__.py | 20 +- extralit-server/tests/unit/jobs/__init__.py | 20 +- .../tests/unit/jobs/test_document_jobs.py | 7 +- .../tests/unit/jobs/webhook_jobs/__init__.py | 20 +- .../test_enqueue_notify_events.py | 27 +- extralit-server/tests/unit/models/__init__.py | 20 +- .../tests/unit/models/test_import_history.py | 5 +- .../tests/unit/models/test_webhook.py | 20 +- .../tests/unit/search_engine/__init__.py | 20 +- .../tests/unit/search_engine/conftest.py | 23 +- .../tests/unit/search_engine/test_commons.py | 61 +- .../unit/search_engine/test_elastisearch.py | 26 +- .../unit/search_engine/test_opensearch.py | 26 +- .../tests/unit/security/__init__.py | 20 +- .../unit/security/authentication/__init__.py | 20 +- .../authentication/oauth2/__init__.py | 20 +- .../oauth2/backends/__init__.py | 20 +- .../authentication/oauth2/test_settings.py | 20 +- .../security/authentication/test_userinfo.py | 20 +- .../tests/unit/security/test_model.py | 20 +- .../tests/unit/telemetry/__init__.py | 20 +- .../unit/telemetry/test_telemetry_helpers.py | 25 +- .../tests/unit/test_api_telemetry.py | 24 +- extralit-server/tests/unit/test_app.py | 9 +- extralit-server/tests/unit/test_database.py | 23 +- extralit-server/tests/unit/test_logging.py | 1 + extralit-server/tests/unit/test_utils.py | 14 +- extralit-server/tests/unit/utils/__init__.py | 20 +- .../tests/unit/utils/test_fastapi_utils.py | 22 +- .../tests/unit/validators/__init__.py | 20 +- .../unit/validators/test_records_bulk.py | 28 +- .../tests/unit/webhooks/__init__.py | 20 +- .../tests/unit/webhooks/v1/__init__.py | 20 +- .../webhooks/v1/test_notify_ping_event.py | 8 +- .../integrations/llamaindex_rag_github.ipynb | 7 +- extralit/docs/scripts/gen_popular_issues.py | 6 +- .../docs/tutorials/image_classification.ipynb | 57 +- .../docs/tutorials/image_preference.ipynb | 34 +- .../docs/tutorials/token_classification.ipynb | 8 +- extralit/pyproject.toml | 26 + extralit/src/extralit/_api/_base.py | 2 +- extralit/src/extralit/_api/_client.py | 11 +- extralit/src/extralit/_api/_datasets.py | 5 +- extralit/src/extralit/_api/_fields.py | 2 +- extralit/src/extralit/_api/_http/_helpers.py | 1 - extralit/src/extralit/_api/_metadata.py | 2 +- extralit/src/extralit/_api/_questions.py | 3 +- extralit/src/extralit/_api/_records.py | 4 +- extralit/src/extralit/_api/_token.py | 4 +- extralit/src/extralit/_api/_vectors.py | 2 +- extralit/src/extralit/_api/_workspaces.py | 44 +- extralit/src/extralit/_exceptions/__init__.py | 4 +- extralit/src/extralit/_exceptions/_hub.py | 2 +- .../src/extralit/_helpers/_dataclasses.py | 2 +- extralit/src/extralit/_helpers/_deploy.py | 2 +- extralit/src/extralit/_helpers/_media.py | 2 +- extralit/src/extralit/_models/_base.py | 2 +- extralit/src/extralit/_models/_dataset.py | 5 +- .../src/extralit/_models/_record/_record.py | 6 +- .../src/extralit/_models/_record/_response.py | 4 +- .../extralit/_models/_record/_suggestion.py | 2 +- extralit/src/extralit/_models/_schema.py | 5 +- extralit/src/extralit/_models/_search.py | 2 +- .../src/extralit/_models/_settings/_fields.py | 4 +- .../extralit/_models/_settings/_metadata.py | 2 +- .../extralit/_models/_settings/_questions.py | 4 +- .../_models/_settings/_task_distribution.py | 4 +- .../extralit/_models/_settings/_vectors.py | 4 +- extralit/src/extralit/_models/_user.py | 4 +- extralit/src/extralit/_models/_webhook.py | 2 +- extralit/src/extralit/_models/_workspace.py | 7 +- extralit/src/extralit/cli/app.py | 7 +- extralit/src/extralit/cli/callback.py | 4 +- .../src/extralit/cli/datasets/__main__.py | 11 +- .../src/extralit/cli/documents/__init__.py | 14 + .../src/extralit/cli/documents/__main__.py | 7 +- extralit/src/extralit/cli/documents/add.py | 4 +- extralit/src/extralit/cli/documents/delete.py | 4 +- .../src/extralit/cli/documents/import_bib.py | 17 +- .../extralit/cli/documents/import_history.py | 4 +- extralit/src/extralit/cli/documents/list.py | 4 +- .../src/extralit/cli/extraction/__main__.py | 15 +- .../src/extralit/cli/extraction/status.py | 14 + extralit/src/extralit/cli/files/__main__.py | 7 +- extralit/src/extralit/cli/files/delete.py | 4 +- extralit/src/extralit/cli/files/download.py | 4 +- extralit/src/extralit/cli/files/list.py | 4 +- extralit/src/extralit/cli/files/upload.py | 4 +- extralit/src/extralit/cli/info/__main__.py | 6 +- extralit/src/extralit/cli/login/__main__.py | 3 +- extralit/src/extralit/cli/logout/__main__.py | 3 +- extralit/src/extralit/cli/rich.py | 5 +- extralit/src/extralit/cli/schemas/__main__.py | 7 +- extralit/src/extralit/cli/schemas/download.py | 4 +- extralit/src/extralit/cli/schemas/upload.py | 12 +- .../src/extralit/cli/training/__main__.py | 7 +- extralit/src/extralit/cli/typer_ext.py | 3 +- extralit/src/extralit/cli/users/__main__.py | 8 +- extralit/src/extralit/cli/whoami/__main__.py | 3 +- .../src/extralit/cli/workspaces/__main__.py | 6 +- extralit/src/extralit/client/login.py | 6 +- extralit/src/extralit/client/resources.py | 12 +- extralit/src/extralit/datasets/__init__.py | 2 +- extralit/src/extralit/datasets/_io/_disk.py | 4 +- extralit/src/extralit/datasets/_io/_hub.py | 19 +- extralit/src/extralit/datasets/_resource.py | 2 +- extralit/src/extralit/documents/__init__.py | 2 +- extralit/src/extralit/records/__init__.py | 4 +- .../src/extralit/records/_dataset_records.py | 10 +- .../src/extralit/records/_io/_datasets.py | 11 +- extralit/src/extralit/records/_io/_generic.py | 2 +- extralit/src/extralit/records/_io/_json.py | 4 +- .../src/extralit/records/_mapping/_mapper.py | 19 +- extralit/src/extralit/records/_resource.py | 5 +- extralit/src/extralit/records/_search.py | 21 +- extralit/src/extralit/responses.py | 5 +- extralit/src/extralit/settings/__init__.py | 2 +- extralit/src/extralit/settings/_field.py | 17 +- extralit/src/extralit/settings/_io/_hub.py | 25 +- extralit/src/extralit/settings/_metadata.py | 6 +- extralit/src/extralit/settings/_question.py | 30 +- extralit/src/extralit/settings/_resource.py | 17 +- .../extralit/settings/_task_distribution.py | 2 +- extralit/src/extralit/settings/_templates.py | 12 +- extralit/src/extralit/settings/_vector.py | 2 +- extralit/src/extralit/users/_resource.py | 4 +- extralit/src/extralit/webhooks/__init__.py | 12 +- extralit/src/extralit/webhooks/_event.py | 12 +- extralit/src/extralit/webhooks/_handler.py | 3 +- extralit/src/extralit/webhooks/_helpers.py | 4 +- extralit/src/extralit/workspaces/_resource.py | 17 +- .../tests/integration/test_cli_commands.py | 6 +- .../integration/test_dataset_workspace.py | 2 +- .../integration/test_ranking_questions.py | 2 +- extralit/tests/unit/cli/schemas/__init__.py | 14 + .../tests/unit/cli/schemas/test_delete.py | 14 + .../tests/unit/cli/schemas/test_upload.py | 14 + 492 files changed, 6460 insertions(+), 6425 deletions(-) diff --git a/.devcontainer/noop.txt b/.devcontainer/noop.txt index dde8dc3c1..848dd9852 100644 --- a/.devcontainer/noop.txt +++ b/.devcontainer/noop.txt @@ -1,3 +1,3 @@ This file copied into the container along with environment.yml* from the parent -folder. This file is included to prevents the Dockerfile COPY instruction from +folder. This file is included to prevents the Dockerfile COPY instruction from failing if no environment.yml is found. \ No newline at end of file diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index d41bc9db3..36966927d 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,4 +1,4 @@ - ## Related Tickets & Documents @@ -28,7 +28,7 @@ Closes # - [ ] Documentation Update ## Steps to QA - 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mrg\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[1;32m 6\u001b[0m client \u001b[38;5;241m=\u001b[39m rg\u001b[38;5;241m.\u001b[39mExtralit()\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/__init__.py:15\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_version\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__ \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mclient\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mworkspaces\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/client.py:22\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING, List, Optional, Union, overload\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UUID\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _api\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceAPI\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DEFAULT_HTTP_CONFIG\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_api/__init__.py:15\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_datasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_http\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_workspaces\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_api/_datasets.py:21\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceAPI\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_exceptions\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m api_error_handler\n\u001b[0;32m---> 21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DatasetModel\n\u001b[1;32m 23\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDatasetsAPI\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_dataset_progress\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UserProgressModel, DatasetProgressModel\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_models/__init__.py:17\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# We skip the flake8 check because we are importing all the models and the import order is important\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_resource\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceModel\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_workspace\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WorkspaceModel\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_user\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UserModel, Role\n", - "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_models/_resource.py:19\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Optional\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UUID\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpydantic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseModel, field_serializer\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mResourceModel\u001b[39;00m(BaseModel):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Base model for all resources (DatasetModel, WorkspaceModel, UserModel, etc.)\"\"\"\u001b[39;00m\n", - "\u001b[0;31mImportError\u001b[0m: cannot import name 'field_serializer' from 'pydantic' (/Users/jonny/micromamba/envs/extralit/lib/python3.9/site-packages/pydantic/__init__.cpython-39-darwin.so)" - ] - } - ], - "source": [ - "from datetime import datetime\n", - "\n", - "import extralit as ex\n", - "from datasets import load_dataset\n", - "\n", - "client = ex.Extralit()" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [], - "source": [ - "for dataset in client.datasets.list():\n", - " dataset.delete()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Chat Field\n" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ben/code/argilla/argilla/src/argilla/datasets/_resource.py:203: UserWarning: Workspace not provided. Using default workspace: argilla id: 735cae0d-eb08-45c3-ad79-0a11ad4dd2c2\n", - " warnings.warn(f\"Workspace not provided. Using default workspace: {workspace.name} id: {workspace.id}\")\n" - ] - }, - { - "data": { - "text/plain": [ - "Dataset(id=UUID('ee5fc998-b475-45a8-86e7-7ff427d43268') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 148167) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527) name='static_chat_20240823124650' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527))" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "settings = ex.Settings(\n", - " fields=[\n", - " ex.ChatField(\n", - " name=\"chosen\",\n", - " ),\n", - " ex.ChatField(\n", - " name=\"rejected\",\n", - " ),\n", - " ],\n", - " questions=[\n", - " ex.RatingQuestion(\"rating\", title=\"How would you rate the conversation?\", required=True, values=[1, 2, 3, 4, 5]),\n", - " ex.TextQuestion(\"improved_chosen\", title=\"Rewrite the chosen conversation\", required=False),\n", - " ],\n", - ")\n", - "\n", - "dataset = ex.Dataset(\n", - " settings=settings,\n", - " name=f\"static_chat_{datetime.now().strftime('%Y%m%d%H%M%S')}\",\n", - ")\n", - "\n", - "dataset.create()" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ben/code/argilla/argilla/src/argilla/records/_mapping/_mapper.py:89: UserWarning: Keys ['source', 'chosen_rating', 'chosen_model', 'rejected_rating', 'rejected_model'] in data are not present in the mapping and will be ignored.\n", - " warnings.warn(f\"Keys {unknown_keys} in data are not present in the mapping and will be ignored.\")\n" - ] - }, - { - "data": { - "text/html": [ - "
DatasetRecords: The provided batch size 256 was normalized. Using value 100.\n",
-                            "
\n" - ], - "text/plain": [ - "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m100\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Sending records...: 100%|██████████| 1/1 [00:00<00:00, 3.45batch/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetRecords(Dataset(id=UUID('ee5fc998-b475-45a8-86e7-7ff427d43268') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 148167) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527) name='static_chat_20240823124650' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527)))" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", - "dataset.records.log(ds)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Custom Field" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset(id=UUID('620fb219-73cb-42c6-bad0-456880a93ab9') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 58, 842638) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418) name='interactive_chat_20240823124658' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418))" - ] - }, - "execution_count": 110, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "html_template_path = \"interactive_chat.html\"\n", - "\n", - "settings = ex.Settings(\n", - " fields=[\n", - " ex.CustomField(name=\"chosen\", template=html_template_path, required=False),\n", - " ex.ChatField(\n", - " name=\"rejected\",\n", - " ),\n", - " ],\n", - " questions=[\n", - " ex.RatingQuestion(\n", - " \"rating\", title=\"How would you rate the conversation?\", required=True, values=[1, 2, 3, 4, 5]\n", - " ),\n", - " ex.TextQuestion(\n", - " \"improved_chosen\", title=\"Rewrite the chosen conversation\", required=True\n", - " ),\n", - " ],\n", - ")\n", - "\n", - "dataset = ex.Dataset(\n", - " settings=settings,\n", - " name=f\"interactive_chat_{datetime.now().strftime('%Y%m%d%H%M%S')}\",\n", - ")\n", - "\n", - "dataset.create()" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ben/code/argilla/argilla/src/argilla/records/_mapping/_mapper.py:89: UserWarning: Keys ['source', 'chosen_rating', 'chosen_model', 'rejected_rating', 'rejected_model', 'messages'] in data are not present in the mapping and will be ignored.\n", - " warnings.warn(f\"Keys {unknown_keys} in data are not present in the mapping and will be ignored.\")\n" - ] - }, - { - "data": { - "text/html": [ - "
DatasetRecords: The provided batch size 256 was normalized. Using value 100.\n",
-                            "
\n" - ], - "text/plain": [ - "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m100\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Sending records...: 100%|██████████| 1/1 [00:00<00:00, 3.32batch/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetRecords(Dataset(id=UUID('620fb219-73cb-42c6-bad0-456880a93ab9') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 58, 842638) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418) name='interactive_chat_20240823124658' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418)))" - ] - }, - "execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", - "ds = ds.map(lambda x: {\"messages\": x[\"chosen\"]})\n", - "dataset.records.log(ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.19" - } + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "ImportError", + "evalue": "cannot import name 'field_serializer' from 'pydantic' (/Users/jonny/micromamba/envs/extralit/lib/python3.9/site-packages/pydantic/__init__.cpython-39-darwin.so)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatetime\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m datetime\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mrg\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_dataset\n\u001b[1;32m 6\u001b[0m client \u001b[38;5;241m=\u001b[39m rg\u001b[38;5;241m.\u001b[39mExtralit()\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/__init__.py:15\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_version\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m __version__ \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mclient\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mdatasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mworkspaces\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa\u001b[39;00m\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/client.py:22\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m TYPE_CHECKING, List, Optional, Union, overload\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UUID\n\u001b[0;32m---> 22\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _api\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceAPI\n\u001b[1;32m 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_client\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DEFAULT_HTTP_CONFIG\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_api/__init__.py:15\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# See the License for the specific language governing permissions and\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;66;03m# limitations under the License.\u001b[39;00m\n\u001b[0;32m---> 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_datasets\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_http\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_workspaces\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;241m*\u001b[39m \u001b[38;5;66;03m# noqa 403\u001b[39;00m\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_api/_datasets.py:21\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_base\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceAPI\n\u001b[1;32m 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_exceptions\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_api\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m api_error_handler\n\u001b[0;32m---> 21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DatasetModel\n\u001b[1;32m 23\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDatasetsAPI\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 25\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_dataset_progress\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UserProgressModel, DatasetProgressModel\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_models/__init__.py:17\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Copyright 2024-present, Extralit, Inc.\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# Licensed under the Apache License, Version 2.0 (the \"License\");\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;66;03m# We skip the flake8 check because we are importing all the models and the import order is important\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;66;03m# flake8: noqa\u001b[39;00m\n\u001b[0;32m---> 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_resource\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ResourceModel\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_workspace\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m WorkspaceModel\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01margilla\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_models\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_user\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UserModel, Role\n", + "File \u001b[0;32m~/Projects/extralit/argilla/src/argilla/_models/_resource.py:19\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtyping\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Optional\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01muuid\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m UUID\n\u001b[0;32m---> 19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpydantic\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseModel, field_serializer\n\u001b[1;32m 22\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mResourceModel\u001b[39;00m(BaseModel):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Base model for all resources (DatasetModel, WorkspaceModel, UserModel, etc.)\"\"\"\u001b[39;00m\n", + "\u001b[0;31mImportError\u001b[0m: cannot import name 'field_serializer' from 'pydantic' (/Users/jonny/micromamba/envs/extralit/lib/python3.9/site-packages/pydantic/__init__.cpython-39-darwin.so)" + ] + } + ], + "source": [ + "from datetime import datetime\n", + "\n", + "import extralit as ex\n", + "from datasets import load_dataset\n", + "\n", + "client = ex.Extralit()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "for dataset in client.datasets.list():\n", + " dataset.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Chat Field\n" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ben/code/argilla/argilla/src/argilla/datasets/_resource.py:203: UserWarning: Workspace not provided. Using default workspace: argilla id: 735cae0d-eb08-45c3-ad79-0a11ad4dd2c2\n", + " warnings.warn(f\"Workspace not provided. Using default workspace: {workspace.name} id: {workspace.id}\")\n" + ] }, - "nbformat": 4, - "nbformat_minor": 2 + { + "data": { + "text/plain": [ + "Dataset(id=UUID('ee5fc998-b475-45a8-86e7-7ff427d43268') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 148167) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527) name='static_chat_20240823124650' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527))" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "settings = ex.Settings(\n", + " fields=[\n", + " ex.ChatField(\n", + " name=\"chosen\",\n", + " ),\n", + " ex.ChatField(\n", + " name=\"rejected\",\n", + " ),\n", + " ],\n", + " questions=[\n", + " ex.RatingQuestion(\n", + " \"rating\",\n", + " title=\"How would you rate the conversation?\",\n", + " required=True,\n", + " values=[1, 2, 3, 4, 5],\n", + " ),\n", + " ex.TextQuestion(\n", + " \"improved_chosen\", title=\"Rewrite the chosen conversation\", required=False\n", + " ),\n", + " ],\n", + ")\n", + "\n", + "dataset = ex.Dataset(\n", + " settings=settings,\n", + " name=f\"static_chat_{datetime.now().strftime('%Y%m%d%H%M%S')}\",\n", + ")\n", + "\n", + "dataset.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ben/code/argilla/argilla/src/argilla/records/_mapping/_mapper.py:89: UserWarning: Keys ['source', 'chosen_rating', 'chosen_model', 'rejected_rating', 'rejected_model'] in data are not present in the mapping and will be ignored.\n", + " warnings.warn(f\"Keys {unknown_keys} in data are not present in the mapping and will be ignored.\")\n" + ] + }, + { + "data": { + "text/html": [ + "
DatasetRecords: The provided batch size 256 was normalized. Using value 100.\n",
+       "
\n" + ], + "text/plain": [ + "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m100\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending records...: 100%|██████████| 1/1 [00:00<00:00, 3.45batch/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetRecords(Dataset(id=UUID('ee5fc998-b475-45a8-86e7-7ff427d43268') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 148167) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527) name='static_chat_20240823124650' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 50, 291527)))" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", + "dataset.records.log(ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom Field" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(id=UUID('620fb219-73cb-42c6-bad0-456880a93ab9') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 58, 842638) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418) name='interactive_chat_20240823124658' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418))" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "html_template_path = \"interactive_chat.html\"\n", + "\n", + "settings = ex.Settings(\n", + " fields=[\n", + " ex.CustomField(name=\"chosen\", template=html_template_path, required=False),\n", + " ex.ChatField(\n", + " name=\"rejected\",\n", + " ),\n", + " ],\n", + " questions=[\n", + " ex.RatingQuestion(\n", + " \"rating\",\n", + " title=\"How would you rate the conversation?\",\n", + " required=True,\n", + " values=[1, 2, 3, 4, 5],\n", + " ),\n", + " ex.TextQuestion(\n", + " \"improved_chosen\", title=\"Rewrite the chosen conversation\", required=True\n", + " ),\n", + " ],\n", + ")\n", + "\n", + "dataset = ex.Dataset(\n", + " settings=settings,\n", + " name=f\"interactive_chat_{datetime.now().strftime('%Y%m%d%H%M%S')}\",\n", + ")\n", + "\n", + "dataset.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ben/code/argilla/argilla/src/argilla/records/_mapping/_mapper.py:89: UserWarning: Keys ['source', 'chosen_rating', 'chosen_model', 'rejected_rating', 'rejected_model', 'messages'] in data are not present in the mapping and will be ignored.\n", + " warnings.warn(f\"Keys {unknown_keys} in data are not present in the mapping and will be ignored.\")\n" + ] + }, + { + "data": { + "text/html": [ + "
DatasetRecords: The provided batch size 256 was normalized. Using value 100.\n",
+       "
\n" + ], + "text/plain": [ + "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m100\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending records...: 100%|██████████| 1/1 [00:00<00:00, 3.32batch/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetRecords(Dataset(id=UUID('620fb219-73cb-42c6-bad0-456880a93ab9') inserted_at=datetime.datetime(2024, 8, 23, 10, 46, 58, 842638) updated_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418) name='interactive_chat_20240823124658' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('735cae0d-eb08-45c3-ad79-0a11ad4dd2c2') last_activity_at=datetime.datetime(2024, 8, 23, 10, 46, 59, 10418)))" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", + "ds = ds.map(lambda x: {\"messages\": x[\"chosen\"]})\n", + "dataset.records.log(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } \ No newline at end of file diff --git a/examples/custom_field/table_field.ipynb b/examples/custom_field/table_field.ipynb index 5644f404b..440d8c335 100644 --- a/examples/custom_field/table_field.ipynb +++ b/examples/custom_field/table_field.ipynb @@ -1,776 +1,768 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - " \n", - "from datetime import datetime\n", - "import json\n", - "\n", - "import extralit as ex\n", - "from datasets import load_dataset\n", - "\n", - "client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key='extralit.apikey')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "for dataset in client.datasets.list():\n", - " print(dataset.name)\n", - " # dataset.delete()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load extraction dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset(id=UUID('3a7abf40-a6b7-4cf6-ac09-d89a8b33ac67') inserted_at=datetime.datetime(2024, 4, 4, 5, 23, 44, 562080) updated_at=datetime.datetime(2024, 11, 15, 0, 35, 14, 753190) name='2-Data-Extractions' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 12, 1, 5, 55, 8, 469548))" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset = client.datasets(\n", - " name=\"2-Data-Extractions\",\n", - " workspace=\"itn-recalibration\"\n", - ")\n", - "dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jp-MarkdownHeadingCollapsed": true - }, - "source": [ - "## Update field" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'Observation'}\n", - "\n", - "\n", - "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'ITNCondition'}\n", - "\n", - "\n", - "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'EntomologicalOutcome'}\n", - "\n", - "\n", - "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'ClinicalOutcome'}\n", - "\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "4" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Find the record with the specific metadata\n", - "records = dataset.records(query=ex.Query(filter=(\"metadata.reference\", \"==\", \"mosqueira2015pilot\")))\n", - "\n", - "# Update the record's extraction field\n", - "updated_records = []\n", - "for record in records:\n", - " print(record.metadata)\n", - " print(type(record.fields[\"extraction\"]))\n", - " record.fields[\"extraction\"] = json.loads(record.fields[\"extraction\"])\n", - " print(type(record.fields[\"extraction\"]))\n", - " updated_records.append(record)\n", - "\n", - "len(updated_records)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
DatasetRecords: The provided batch size 256 was normalized. Using value 4.\n",
-                            "
\n" - ], - "text/plain": [ - "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m4\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Sending records...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00, 4.41s/batch]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetRecords(Dataset(id=UUID('3a7abf40-a6b7-4cf6-ac09-d89a8b33ac67') inserted_at=datetime.datetime(2024, 4, 4, 5, 23, 44, 562080) updated_at=datetime.datetime(2024, 11, 15, 0, 35, 14, 753190) name='2-Data-Extractions' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 11, 21, 18, 7, 47, 105497)))" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset.records.log(updated_records)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'reference': 'mosqueira2015pilot',\n", - " 'schema': {'fields': [{'name': 'observation_ref',\n", - " 'type': 'any',\n", - " 'extDtype': 'string'},\n", - " {'name': 'itncondition_ref', 'type': 'any', 'extDtype': 'string'},\n", - " {'name': 'N_people', 'type': 'integer'},\n", - " {'name': 'Age_lower', 'type': 'number'},\n", - " {'name': 'Age_upper', 'type': 'number'}],\n", - " 'primaryKey': ['observation_ref', 'itncondition_ref'],\n", - " 'pandas_version': '1.4.0'},\n", - " 'data': [{'observation_ref': 'S01',\n", - " 'itncondition_ref': 'N01',\n", - " 'N_people': 3903,\n", - " 'Age_lower': 0.5,\n", - " 'Age_upper': 14.0},\n", - " {'observation_ref': 'S02',\n", - " 'itncondition_ref': 'N01',\n", - " 'N_people': 3903,\n", - " 'Age_lower': 0.5,\n", - " 'Age_upper': 14.0}],\n", - " 'validation': {'schema_type': 'dataframe',\n", - " 'version': '0.18.3',\n", - " 'columns': {'N_people': {'title': None,\n", - " 'description': 'Number of people in the study arm of the net in question',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': None,\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Age_lower': {'title': None,\n", - " 'description': 'Lower limit of age group in years. For children <1, enter age as a decimal.',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Age_upper': {'title': None,\n", - " 'description': 'Upper limit of age group in years. For children <1, enter age as a decimal.',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_pos': {'title': None,\n", - " 'description': 'Number of people tested to be parasite positive',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR': {'title': None,\n", - " 'description': 'Definition: (N_pos/N_people)*100',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR_rate_lower': {'title': None,\n", - " 'description': 'Lower bound of parasite positivity rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR_rate_upper': {'title': None,\n", - " 'description': 'Upper bound of parasite positivity rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM': {'title': None,\n", - " 'description': 'Number of people with clinical malaria',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': None,\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate': {'title': None,\n", - " 'description': 'Definition: (CM/N_people)*100',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate_lower': {'title': None,\n", - " 'description': 'Lower bound of clinical malaria rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate_upper': {'title': None,\n", - " 'description': 'Upper bound of clinical malaria rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Net_retention': {'title': None,\n", - " 'description': 'Number of nets still owned divided by a number of nets previously distributed',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_nets': {'title': None,\n", - " 'description': 'Number of nets found in household or community study arm',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_sleep_nets': {'title': None,\n", - " 'description': 'Number of people that slept under a net the previous night',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Perc_sleep_nets': {'title': None,\n", - " 'description': 'Percent of people that slept under a net the previous night',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False}},\n", - " 'checks': {'check_less_than': {'columns_a': ['Age_lower'],\n", - " 'columns_b': ['Age_upper'],\n", - " 'or_equal': True},\n", - " 'check_greater_than': {'columns_a': 'N_people',\n", - " 'columns_b': ['N_pos', 'CM', 'N_sleep_nets'],\n", - " 'or_equal': True},\n", - " 'check_between': {'columns_target': ['PR', 'CM_rate'],\n", - " 'columns_lower': ['PR_rate_lower', 'CM_rate_lower'],\n", - " 'columns_upper': ['PR_rate_upper', 'CM_rate_upper'],\n", - " 'or_equal': True}},\n", - " 'index': [{'title': 'Observation reference',\n", - " 'description': None,\n", - " 'dtype': 'str',\n", - " 'nullable': False,\n", - " 'checks': {'str_startswith': 'S'},\n", - " 'name': 'observation_ref',\n", - " 'unique': False,\n", - " 'coerce': False},\n", - " {'title': 'ITNCondition reference',\n", - " 'description': None,\n", - " 'dtype': 'str',\n", - " 'nullable': False,\n", - " 'checks': {'str_startswith': 'N'},\n", - " 'name': 'itncondition_ref',\n", - " 'unique': False,\n", - " 'coerce': False}],\n", - " 'dtype': None,\n", - " 'coerce': True,\n", - " 'strict': True,\n", - " 'name': 'ClinicalOutcome',\n", - " 'ordered': False,\n", - " 'unique': None,\n", - " 'report_duplicates': 'all',\n", - " 'unique_column_names': False,\n", - " 'add_missing_columns': False,\n", - " 'title': None,\n", - " 'description': '\\nEpidemiological and clinical outcomes on humans collected from a clinical trial or village trial, if reported in the study.\\nEach clinical outcome should have unique `observation_ref`, `itn_condition_ref`, `Group`, `Age_lower`, and `Age_upper` (if reported).\\n '}}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "json.loads(record.fields[\"extraction\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Test dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Custom Field" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jonny/Projects/extralit/argilla/src/argilla/client.py:354: UserWarning: Dataset with name 'interactive_chat' not found in workspace 'itn-recalibration'\n", - " warnings.warn(f\"Dataset with name {name!r} not found in workspace {workspace.name!r}\")\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "'NoneType' object has no attribute 'delete'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mdatasets(\n\u001b[1;32m 2\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minteractive_chat\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# workspace=\"itn-recalibration\"\u001b[39;00m\n\u001b[1;32m 4\u001b[0m )\n\u001b[0;32m----> 5\u001b[0m \u001b[43mdataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m()\n", - "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'delete'" - ] - } - ], - "source": [ - "dataset = client.datasets(\n", - " name=\"interactive_chat\",\n", - " # workspace=\"itn-recalibration\"\n", - ")\n", - "dataset.delete()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset(id=UUID('92b559e7-8eff-4d4c-85bf-817fd73570e4') inserted_at=datetime.datetime(2024, 12, 2, 21, 33, 33, 529345) updated_at=datetime.datetime(2024, 12, 2, 21, 33, 39, 111530) name='interactive_chat' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 12, 2, 21, 33, 39, 111530))" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "settings = ex.Settings(\n", - " fields=[\n", - " ex.TableField(name=\"chosen\", required=False),\n", - " ],\n", - " questions=[\n", - " ex.TableQuestion(\n", - " \"extraction\", title=\"Correct the table\", required=True\n", - " ),\n", - " ],\n", - ")\n", - "\n", - "dataset = ex.Dataset(\n", - " settings=settings,\n", - " name=f\"interactive_chat\",\n", - ")\n", - "\n", - "dataset.create()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'reference': 'mosqueira2015pilot',\n", - " 'schema': {'fields': [{'name': 'observation_ref',\n", - " 'type': 'any',\n", - " 'extDtype': 'string'},\n", - " {'name': 'itncondition_ref', 'type': 'any', 'extDtype': 'string'},\n", - " {'name': 'N_people', 'type': 'integer'},\n", - " {'name': 'Age_lower', 'type': 'number'},\n", - " {'name': 'Age_upper', 'type': 'number'}],\n", - " 'primaryKey': ['observation_ref', 'itncondition_ref'],\n", - " 'pandas_version': '1.4.0'},\n", - " 'data': [{'observation_ref': 'S01',\n", - " 'itncondition_ref': 'N01',\n", - " 'N_people': 3903,\n", - " 'Age_lower': 0.5,\n", - " 'Age_upper': 14.0},\n", - " {'observation_ref': 'S02',\n", - " 'itncondition_ref': 'N01',\n", - " 'N_people': 3903,\n", - " 'Age_lower': 0.5,\n", - " 'Age_upper': 14.0}],\n", - " 'validation': {'schema_type': 'dataframe',\n", - " 'version': '0.18.3',\n", - " 'columns': {'N_people': {'title': None,\n", - " 'description': 'Number of people in the study arm of the net in question',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': None,\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Age_lower': {'title': None,\n", - " 'description': 'Lower limit of age group in years. For children <1, enter age as a decimal.',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Age_upper': {'title': None,\n", - " 'description': 'Upper limit of age group in years. For children <1, enter age as a decimal.',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_pos': {'title': None,\n", - " 'description': 'Number of people tested to be parasite positive',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR': {'title': None,\n", - " 'description': 'Definition: (N_pos/N_people)*100',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR_rate_lower': {'title': None,\n", - " 'description': 'Lower bound of parasite positivity rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'PR_rate_upper': {'title': None,\n", - " 'description': 'Upper bound of parasite positivity rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM': {'title': None,\n", - " 'description': 'Number of people with clinical malaria',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': None,\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate': {'title': None,\n", - " 'description': 'Definition: (CM/N_people)*100',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate_lower': {'title': None,\n", - " 'description': 'Lower bound of clinical malaria rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'CM_rate_upper': {'title': None,\n", - " 'description': 'Upper bound of clinical malaria rate',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Net_retention': {'title': None,\n", - " 'description': 'Number of nets still owned divided by a number of nets previously distributed',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_nets': {'title': None,\n", - " 'description': 'Number of nets found in household or community study arm',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'N_sleep_nets': {'title': None,\n", - " 'description': 'Number of people that slept under a net the previous night',\n", - " 'dtype': 'int64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False},\n", - " 'Perc_sleep_nets': {'title': None,\n", - " 'description': 'Percent of people that slept under a net the previous night',\n", - " 'dtype': 'float64',\n", - " 'nullable': True,\n", - " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", - " 'unique': False,\n", - " 'coerce': False,\n", - " 'required': True,\n", - " 'regex': False}},\n", - " 'checks': {'check_less_than': {'columns_a': ['Age_lower'],\n", - " 'columns_b': ['Age_upper'],\n", - " 'or_equal': True},\n", - " 'check_greater_than': {'columns_a': 'N_people',\n", - " 'columns_b': ['N_pos', 'CM', 'N_sleep_nets'],\n", - " 'or_equal': True},\n", - " 'check_between': {'columns_target': ['PR', 'CM_rate'],\n", - " 'columns_lower': ['PR_rate_lower', 'CM_rate_lower'],\n", - " 'columns_upper': ['PR_rate_upper', 'CM_rate_upper'],\n", - " 'or_equal': True}},\n", - " 'index': [{'title': 'Observation reference',\n", - " 'description': None,\n", - " 'dtype': 'str',\n", - " 'nullable': False,\n", - " 'checks': {'str_startswith': 'S'},\n", - " 'name': 'observation_ref',\n", - " 'unique': False,\n", - " 'coerce': False},\n", - " {'title': 'ITNCondition reference',\n", - " 'description': None,\n", - " 'dtype': 'str',\n", - " 'nullable': False,\n", - " 'checks': {'str_startswith': 'N'},\n", - " 'name': 'itncondition_ref',\n", - " 'unique': False,\n", - " 'coerce': False}],\n", - " 'dtype': None,\n", - " 'coerce': True,\n", - " 'strict': True,\n", - " 'name': 'ClinicalOutcome',\n", - " 'ordered': False,\n", - " 'unique': None,\n", - " 'report_duplicates': 'all',\n", - " 'unique_column_names': False,\n", - " 'add_missing_columns': False,\n", - " 'title': None,\n", - " 'description': '\\nEpidemiological and clinical outcomes on humans collected from a clinical trial or village trial, if reported in the study.\\nEach clinical outcome should have unique `observation_ref`, `itn_condition_ref`, `Group`, `Age_lower`, and `Age_upper` (if reported).\\n '}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample_table = record.fields['extraction']\n", - "sample_table" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
DatasetRecords: The provided batch size 256 was normalized. Using value 4.\n",
-                            "
\n" - ], - "text/plain": [ - "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m4\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Sending records...: 100%|███████| 1/1 [00:02<00:00, 2.54s/batch]\n" - ] - }, - { - "data": { - "text/plain": [ - "DatasetRecords(Dataset(id=UUID('a64a827c-f962-417a-a771-ce53f61c0756') inserted_at=datetime.datetime(2024, 11, 29, 23, 9, 55, 104623) updated_at=datetime.datetime(2024, 11, 29, 23, 9, 58, 696913) name='interactive_chat' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 11, 29, 23, 9, 58, 696913)))" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dataset.records.log([\n", - " {'chosen': sample_table} \\\n", - " for r in updated_records\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", - "# ds = ds.map(lambda x: {\"messages\": x[\"chosen\"]})\n", - "# dataset.records.log(ds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.14" - } + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import json\n", + "\n", + "import extralit as ex\n", + "\n", + "client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key=\"extralit.apikey\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "for dataset in client.datasets.list():\n", + " print(dataset.name)\n", + " # dataset.delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load extraction dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(id=UUID('3a7abf40-a6b7-4cf6-ac09-d89a8b33ac67') inserted_at=datetime.datetime(2024, 4, 4, 5, 23, 44, 562080) updated_at=datetime.datetime(2024, 11, 15, 0, 35, 14, 753190) name='2-Data-Extractions' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 12, 1, 5, 55, 8, 469548))" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset = client.datasets(name=\"2-Data-Extractions\", workspace=\"itn-recalibration\")\n", + "dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Update field" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'Observation'}\n", + "\n", + "\n", + "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'ITNCondition'}\n", + "\n", + "\n", + "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'EntomologicalOutcome'}\n", + "\n", + "\n", + "{'reference': 'mosqueira2015pilot', 'pmid': '25959771', 'doc_id': '276c32ef-26d2-40cb-b808-b764018cd2ea', 'type': 'ClinicalOutcome'}\n", + "\n", + "\n" + ] }, - "nbformat": 4, - "nbformat_minor": 4 + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Find the record with the specific metadata\n", + "records = dataset.records(\n", + " query=ex.Query(filter=(\"metadata.reference\", \"==\", \"mosqueira2015pilot\"))\n", + ")\n", + "\n", + "# Update the record's extraction field\n", + "updated_records = []\n", + "for record in records:\n", + " print(record.metadata)\n", + " print(type(record.fields[\"extraction\"]))\n", + " record.fields[\"extraction\"] = json.loads(record.fields[\"extraction\"])\n", + " print(type(record.fields[\"extraction\"]))\n", + " updated_records.append(record)\n", + "\n", + "len(updated_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DatasetRecords: The provided batch size 256 was normalized. Using value 4.\n",
+       "
\n" + ], + "text/plain": [ + "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m4\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending records...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00, 4.41s/batch]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetRecords(Dataset(id=UUID('3a7abf40-a6b7-4cf6-ac09-d89a8b33ac67') inserted_at=datetime.datetime(2024, 4, 4, 5, 23, 44, 562080) updated_at=datetime.datetime(2024, 11, 15, 0, 35, 14, 753190) name='2-Data-Extractions' status='ready' guidelines=None allow_extra_metadata=True distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 11, 21, 18, 7, 47, 105497)))" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.records.log(updated_records)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'reference': 'mosqueira2015pilot',\n", + " 'schema': {'fields': [{'name': 'observation_ref',\n", + " 'type': 'any',\n", + " 'extDtype': 'string'},\n", + " {'name': 'itncondition_ref', 'type': 'any', 'extDtype': 'string'},\n", + " {'name': 'N_people', 'type': 'integer'},\n", + " {'name': 'Age_lower', 'type': 'number'},\n", + " {'name': 'Age_upper', 'type': 'number'}],\n", + " 'primaryKey': ['observation_ref', 'itncondition_ref'],\n", + " 'pandas_version': '1.4.0'},\n", + " 'data': [{'observation_ref': 'S01',\n", + " 'itncondition_ref': 'N01',\n", + " 'N_people': 3903,\n", + " 'Age_lower': 0.5,\n", + " 'Age_upper': 14.0},\n", + " {'observation_ref': 'S02',\n", + " 'itncondition_ref': 'N01',\n", + " 'N_people': 3903,\n", + " 'Age_lower': 0.5,\n", + " 'Age_upper': 14.0}],\n", + " 'validation': {'schema_type': 'dataframe',\n", + " 'version': '0.18.3',\n", + " 'columns': {'N_people': {'title': None,\n", + " 'description': 'Number of people in the study arm of the net in question',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': None,\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Age_lower': {'title': None,\n", + " 'description': 'Lower limit of age group in years. For children <1, enter age as a decimal.',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Age_upper': {'title': None,\n", + " 'description': 'Upper limit of age group in years. For children <1, enter age as a decimal.',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_pos': {'title': None,\n", + " 'description': 'Number of people tested to be parasite positive',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR': {'title': None,\n", + " 'description': 'Definition: (N_pos/N_people)*100',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR_rate_lower': {'title': None,\n", + " 'description': 'Lower bound of parasite positivity rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR_rate_upper': {'title': None,\n", + " 'description': 'Upper bound of parasite positivity rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM': {'title': None,\n", + " 'description': 'Number of people with clinical malaria',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': None,\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate': {'title': None,\n", + " 'description': 'Definition: (CM/N_people)*100',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate_lower': {'title': None,\n", + " 'description': 'Lower bound of clinical malaria rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate_upper': {'title': None,\n", + " 'description': 'Upper bound of clinical malaria rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Net_retention': {'title': None,\n", + " 'description': 'Number of nets still owned divided by a number of nets previously distributed',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_nets': {'title': None,\n", + " 'description': 'Number of nets found in household or community study arm',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_sleep_nets': {'title': None,\n", + " 'description': 'Number of people that slept under a net the previous night',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Perc_sleep_nets': {'title': None,\n", + " 'description': 'Percent of people that slept under a net the previous night',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False}},\n", + " 'checks': {'check_less_than': {'columns_a': ['Age_lower'],\n", + " 'columns_b': ['Age_upper'],\n", + " 'or_equal': True},\n", + " 'check_greater_than': {'columns_a': 'N_people',\n", + " 'columns_b': ['N_pos', 'CM', 'N_sleep_nets'],\n", + " 'or_equal': True},\n", + " 'check_between': {'columns_target': ['PR', 'CM_rate'],\n", + " 'columns_lower': ['PR_rate_lower', 'CM_rate_lower'],\n", + " 'columns_upper': ['PR_rate_upper', 'CM_rate_upper'],\n", + " 'or_equal': True}},\n", + " 'index': [{'title': 'Observation reference',\n", + " 'description': None,\n", + " 'dtype': 'str',\n", + " 'nullable': False,\n", + " 'checks': {'str_startswith': 'S'},\n", + " 'name': 'observation_ref',\n", + " 'unique': False,\n", + " 'coerce': False},\n", + " {'title': 'ITNCondition reference',\n", + " 'description': None,\n", + " 'dtype': 'str',\n", + " 'nullable': False,\n", + " 'checks': {'str_startswith': 'N'},\n", + " 'name': 'itncondition_ref',\n", + " 'unique': False,\n", + " 'coerce': False}],\n", + " 'dtype': None,\n", + " 'coerce': True,\n", + " 'strict': True,\n", + " 'name': 'ClinicalOutcome',\n", + " 'ordered': False,\n", + " 'unique': None,\n", + " 'report_duplicates': 'all',\n", + " 'unique_column_names': False,\n", + " 'add_missing_columns': False,\n", + " 'title': None,\n", + " 'description': '\\nEpidemiological and clinical outcomes on humans collected from a clinical trial or village trial, if reported in the study.\\nEach clinical outcome should have unique `observation_ref`, `itn_condition_ref`, `Group`, `Age_lower`, and `Age_upper` (if reported).\\n '}}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.loads(record.fields[\"extraction\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom Field" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jonny/Projects/extralit/argilla/src/argilla/client.py:354: UserWarning: Dataset with name 'interactive_chat' not found in workspace 'itn-recalibration'\n", + " warnings.warn(f\"Dataset with name {name!r} not found in workspace {workspace.name!r}\")\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'NoneType' object has no attribute 'delete'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m client\u001b[38;5;241m.\u001b[39mdatasets(\n\u001b[1;32m 2\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minteractive_chat\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# workspace=\"itn-recalibration\"\u001b[39;00m\n\u001b[1;32m 4\u001b[0m )\n\u001b[0;32m----> 5\u001b[0m \u001b[43mdataset\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdelete\u001b[49m()\n", + "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'delete'" + ] + } + ], + "source": [ + "dataset = client.datasets(\n", + " name=\"interactive_chat\",\n", + " # workspace=\"itn-recalibration\"\n", + ")\n", + "dataset.delete()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset(id=UUID('92b559e7-8eff-4d4c-85bf-817fd73570e4') inserted_at=datetime.datetime(2024, 12, 2, 21, 33, 33, 529345) updated_at=datetime.datetime(2024, 12, 2, 21, 33, 39, 111530) name='interactive_chat' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 12, 2, 21, 33, 39, 111530))" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "settings = ex.Settings(\n", + " fields=[\n", + " ex.TableField(name=\"chosen\", required=False),\n", + " ],\n", + " questions=[\n", + " ex.TableQuestion(\"extraction\", title=\"Correct the table\", required=True),\n", + " ],\n", + ")\n", + "\n", + "dataset = ex.Dataset(\n", + " settings=settings,\n", + " name=\"interactive_chat\",\n", + ")\n", + "\n", + "dataset.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'reference': 'mosqueira2015pilot',\n", + " 'schema': {'fields': [{'name': 'observation_ref',\n", + " 'type': 'any',\n", + " 'extDtype': 'string'},\n", + " {'name': 'itncondition_ref', 'type': 'any', 'extDtype': 'string'},\n", + " {'name': 'N_people', 'type': 'integer'},\n", + " {'name': 'Age_lower', 'type': 'number'},\n", + " {'name': 'Age_upper', 'type': 'number'}],\n", + " 'primaryKey': ['observation_ref', 'itncondition_ref'],\n", + " 'pandas_version': '1.4.0'},\n", + " 'data': [{'observation_ref': 'S01',\n", + " 'itncondition_ref': 'N01',\n", + " 'N_people': 3903,\n", + " 'Age_lower': 0.5,\n", + " 'Age_upper': 14.0},\n", + " {'observation_ref': 'S02',\n", + " 'itncondition_ref': 'N01',\n", + " 'N_people': 3903,\n", + " 'Age_lower': 0.5,\n", + " 'Age_upper': 14.0}],\n", + " 'validation': {'schema_type': 'dataframe',\n", + " 'version': '0.18.3',\n", + " 'columns': {'N_people': {'title': None,\n", + " 'description': 'Number of people in the study arm of the net in question',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': None,\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Age_lower': {'title': None,\n", + " 'description': 'Lower limit of age group in years. For children <1, enter age as a decimal.',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Age_upper': {'title': None,\n", + " 'description': 'Upper limit of age group in years. For children <1, enter age as a decimal.',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_pos': {'title': None,\n", + " 'description': 'Number of people tested to be parasite positive',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR': {'title': None,\n", + " 'description': 'Definition: (N_pos/N_people)*100',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR_rate_lower': {'title': None,\n", + " 'description': 'Lower bound of parasite positivity rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'PR_rate_upper': {'title': None,\n", + " 'description': 'Upper bound of parasite positivity rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM': {'title': None,\n", + " 'description': 'Number of people with clinical malaria',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': None,\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate': {'title': None,\n", + " 'description': 'Definition: (CM/N_people)*100',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate_lower': {'title': None,\n", + " 'description': 'Lower bound of clinical malaria rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'CM_rate_upper': {'title': None,\n", + " 'description': 'Upper bound of clinical malaria rate',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Net_retention': {'title': None,\n", + " 'description': 'Number of nets still owned divided by a number of nets previously distributed',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_nets': {'title': None,\n", + " 'description': 'Number of nets found in household or community study arm',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'N_sleep_nets': {'title': None,\n", + " 'description': 'Number of people that slept under a net the previous night',\n", + " 'dtype': 'int64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False},\n", + " 'Perc_sleep_nets': {'title': None,\n", + " 'description': 'Percent of people that slept under a net the previous night',\n", + " 'dtype': 'float64',\n", + " 'nullable': True,\n", + " 'checks': {'greater_than_or_equal_to': 0, 'less_than_or_equal_to': 100},\n", + " 'unique': False,\n", + " 'coerce': False,\n", + " 'required': True,\n", + " 'regex': False}},\n", + " 'checks': {'check_less_than': {'columns_a': ['Age_lower'],\n", + " 'columns_b': ['Age_upper'],\n", + " 'or_equal': True},\n", + " 'check_greater_than': {'columns_a': 'N_people',\n", + " 'columns_b': ['N_pos', 'CM', 'N_sleep_nets'],\n", + " 'or_equal': True},\n", + " 'check_between': {'columns_target': ['PR', 'CM_rate'],\n", + " 'columns_lower': ['PR_rate_lower', 'CM_rate_lower'],\n", + " 'columns_upper': ['PR_rate_upper', 'CM_rate_upper'],\n", + " 'or_equal': True}},\n", + " 'index': [{'title': 'Observation reference',\n", + " 'description': None,\n", + " 'dtype': 'str',\n", + " 'nullable': False,\n", + " 'checks': {'str_startswith': 'S'},\n", + " 'name': 'observation_ref',\n", + " 'unique': False,\n", + " 'coerce': False},\n", + " {'title': 'ITNCondition reference',\n", + " 'description': None,\n", + " 'dtype': 'str',\n", + " 'nullable': False,\n", + " 'checks': {'str_startswith': 'N'},\n", + " 'name': 'itncondition_ref',\n", + " 'unique': False,\n", + " 'coerce': False}],\n", + " 'dtype': None,\n", + " 'coerce': True,\n", + " 'strict': True,\n", + " 'name': 'ClinicalOutcome',\n", + " 'ordered': False,\n", + " 'unique': None,\n", + " 'report_duplicates': 'all',\n", + " 'unique_column_names': False,\n", + " 'add_missing_columns': False,\n", + " 'title': None,\n", + " 'description': '\\nEpidemiological and clinical outcomes on humans collected from a clinical trial or village trial, if reported in the study.\\nEach clinical outcome should have unique `observation_ref`, `itn_condition_ref`, `Group`, `Age_lower`, and `Age_upper` (if reported).\\n '}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sample_table = record.fields[\"extraction\"]\n", + "sample_table" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
DatasetRecords: The provided batch size 256 was normalized. Using value 4.\n",
+       "
\n" + ], + "text/plain": [ + "DatasetRecords: The provided batch size \u001b[1;36m256\u001b[0m was normalized. Using value \u001b[1;36m4\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Sending records...: 100%|███████| 1/1 [00:02<00:00, 2.54s/batch]\n" + ] + }, + { + "data": { + "text/plain": [ + "DatasetRecords(Dataset(id=UUID('a64a827c-f962-417a-a771-ce53f61c0756') inserted_at=datetime.datetime(2024, 11, 29, 23, 9, 55, 104623) updated_at=datetime.datetime(2024, 11, 29, 23, 9, 58, 696913) name='interactive_chat' status='ready' guidelines=None allow_extra_metadata=False distribution=OverlapTaskDistributionModel(strategy='overlap', min_submitted=1) workspace_id=UUID('e9e4e699-a6f9-4482-b5dd-e45874bd87eb') last_activity_at=datetime.datetime(2024, 11, 29, 23, 9, 58, 696913)))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset.records.log([{\"chosen\": sample_table} for r in updated_records])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ds = load_dataset(\"argilla/Capybara-Preferences\", split=\"train[:100]\")\n", + "# ds = ds.map(lambda x: {\"messages\": x[\"chosen\"]})\n", + "# dataset.records.log(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } \ No newline at end of file diff --git a/examples/deployments/k8s/helm/postgres-helm.yaml b/examples/deployments/k8s/helm/postgres-helm.yaml index e628fe12f..af2a87cdd 100644 --- a/examples/deployments/k8s/helm/postgres-helm.yaml +++ b/examples/deployments/k8s/helm/postgres-helm.yaml @@ -21,7 +21,7 @@ postgresql: enabled: true storageClass: "local-path" size: 1Gi - nodeAffinity: + nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 1 preference: diff --git a/examples/deployments/k8s/helm/weaviate-helm.yaml b/examples/deployments/k8s/helm/weaviate-helm.yaml index 72eb18cbb..4ee0befec 100644 --- a/examples/deployments/k8s/helm/weaviate-helm.yaml +++ b/examples/deployments/k8s/helm/weaviate-helm.yaml @@ -25,15 +25,15 @@ args: - 'http' - '--config-file' - '/weaviate-config/conf.yaml' - - --read-timeout=60s + - --read-timeout=60s - --write-timeout=60s # below is an example that can be used to set an arbitrary nofile limit at # startup: # -# command: +# command: # - "/bin/sh" -# args: +# args: # - "-c" # - "ulimit -n 65535 && /bin/weaviate --host 0.0.0.0 --port 8080 --scheme http --config-file /weaviate-config/conf.yaml" @@ -53,7 +53,7 @@ initContainers: repo: alpine tag: latest pullPolicy: IfNotPresent - + extraInitContainers: {} # - image: some-image # name: some-name @@ -99,7 +99,7 @@ serviceAccountName: # Kubernetes Cluster domain name, used for resolving intra-cluster requests, i.e # between instances of weaviate. # Note: The final '.' on the end of the hostname makes it a FQDN, and is required for -# DNS to resolve in all kubernetes environments. +# DNS to resolve in all kubernetes environments. # See https://github.com/weaviate/weaviate-helm/issues/175 for details. clusterDomain: cluster.local. @@ -287,7 +287,7 @@ env: PROMETHEUS_MONITORING_ENABLED: false PROMETHEUS_MONITORING_GROUP: false - # Set a MEM limit for the Weaviate Pod so it can help you both increase GC-related + # Set a MEM limit for the Weaviate Pod so it can help you both increase GC-related # performance as well as avoid GC-related out-of-memory (“OOM”) situations # GOMEMLIMIT: 6GiB @@ -335,13 +335,13 @@ backups: envconfig: # Configure folder where backups should be saved BACKUP_FILESYSTEM_PATH: /tmp/backups - + s3: enabled: false # If one is using AWS EKS and has already configured K8s Service Account # that holds the AWS credentials one can pass a name of that service account # here using this setting. - # NOTE: the root `serviceAccountName` config has priority over this one, and + # NOTE: the root `serviceAccountName` config has priority over this one, and # if the root one is set this one will NOT overwrite it. This one is here for # backwards compatibility. serviceAccountName: @@ -350,17 +350,17 @@ backups: # Configure bucket where backups should be saved, this setting is mandatory BACKUP_S3_BUCKET: weaviate-backups - # Optional setting. Defaults to empty string. + # Optional setting. Defaults to empty string. # Set this option if you want to save backups to a given location # inside the bucket # BACKUP_S3_PATH: path/inside/bucket - # Optional setting. Defaults to AWS S3 (s3.amazonaws.com). + # Optional setting. Defaults to AWS S3 (s3.amazonaws.com). # Set this option if you have a MinIO storage configured in your environment # and want to use it instead of the AWS S3. # BACKUP_S3_ENDPOINT: custom.minio.endpoint.address - # Optional setting. Defaults to true. + # Optional setting. Defaults to true. # Set this option if you don't want to use SSL. # BACKUP_S3_USE_SSL: true @@ -373,7 +373,7 @@ backups: # You can pass the User credentials (access-key id and access-secret-key) in 2 ways: # 1. by setting the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY plain values in the `secrets` section below # this chart will create a kubernetes secret for you with these key-values pairs - # 2. create Kubernetes secret/s with AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY keys and their respective values + # 2. create Kubernetes secret/s with AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY keys and their respective values # Set the Key and the secret where it is set in `envSecrets` section below secrets: {} # AWS_ACCESS_KEY_ID: access-key-id (plain text) @@ -421,7 +421,7 @@ backups: # Configure container where backups should be saved, this setting is mandatory BACKUP_AZURE_CONTAINER: weaviate-backups - # Optional setting. Defaults to empty string. + # Optional setting. Defaults to empty string. # Set this option if you want to save backups to a given location # inside the container # BACKUP_AZURE_PATH: path/inside/container @@ -432,7 +432,7 @@ backups: # 1. by setting the AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY # or AZURE_STORAGE_CONNECTION_STRING plain values in the `secrets` section below # this chart will create a kubernetes secret for you with these key-values pairs - # 2. create Kubernetes secret/s with AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY + # 2. create Kubernetes secret/s with AZURE_STORAGE_ACCOUNT and AZURE_STORAGE_KEY # or AZURE_STORAGE_CONNECTION_STRING and their respective values # Set the Key and the secret where it is set in `envSecrets` section below secrets: {} @@ -580,7 +580,7 @@ modules: # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ # applies to passageQueryService below securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -650,13 +650,13 @@ modules: # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ # applies to passageQueryService below securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. # NOTE: if not set the root `serviceAccountName` config will be used. serviceAccountName: - + # You can guide where the pods are scheduled on a per-module basis, # as well as for Weaviate overall. Each module accepts nodeSelector, # tolerations, and affinity configuration. If it is set on a per- @@ -721,7 +721,7 @@ modules: # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ # applies to passageQueryService below securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -742,7 +742,7 @@ modules: # These models run only on CPU only and on x86_64 arch # The ML model is containerized in a Weaviate compatible way. # If you want to run a different model that published ones you can follow the - # tutorial from here on how to create such a container: https://github.com/weaviate/t2v-gpt4all-models + # tutorial from here on how to create such a container: https://github.com/weaviate/t2v-gpt4all-models text2vec-gpt4all: # Enable deployment of this module @@ -751,7 +751,7 @@ modules: # You can set directly an inference URL of this module without deploying it with this release. # You can do so by setting a value for the `inferenceUrl` here AND by setting the `enable` to `false` inferenceUrl: {} - + # The configuration below is ignored if enabled==false tag: all-MiniLM-L6-v2 repo: semitechnologies/gpt4all-inference @@ -783,7 +783,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1133,7 +1133,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1309,7 +1309,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1326,7 +1326,7 @@ modules: affinity: # The reranker-cohere module uses Cohere API - # to dynamically compute a score for the relevance + # to dynamically compute a score for the relevance # of the query with each of the initial search results. # More information about Cohere API can be found here: https://docs.cohere.com/docs/rerank-guide reranker-cohere: @@ -1339,7 +1339,7 @@ modules: apiKey: '' # The reranker-voyageai module uses VoaygeAI API - # to dynamically compute a score for the relevance + # to dynamically compute a score for the relevance # of the query with each of the initial search results. # More information about Cohere API can be found here: https://www.voyageai.com/ reranker-voyageai: @@ -1351,7 +1351,7 @@ modules: # an environment variable apiKey: '' - # The reranker-transformers module uses Cross-Encoders for + # The reranker-transformers module uses Cross-Encoders for # sentence pair scoring and sentence pair classification tasks. # More information about Cross-Encoders can be found here: # https://www.sbert.net/examples/applications/cross-encoder/README.html @@ -1411,7 +1411,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1468,7 +1468,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1542,7 +1542,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. @@ -1615,7 +1615,7 @@ modules: # security Context for the Contextionary Pods. The configurations are the same as setting them # as described here: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ securityContext: {} - + # It is possible to add a ServiceAccount to this module's Pods, it can be # used in cases where the module is in a private registry and you want to # give access to the registry only to this pod. diff --git a/examples/deployments/k8s/k3d/k3d-config.yaml b/examples/deployments/k8s/k3d/k3d-config.yaml index b6523ea4b..e64ed2226 100644 --- a/examples/deployments/k8s/k3d/k3d-config.yaml +++ b/examples/deployments/k8s/k3d/k3d-config.yaml @@ -13,4 +13,3 @@ k3d: kubeAPI: host: "0.0.0.0" hostPort: "6443" - \ No newline at end of file diff --git a/examples/deployments/k8s/kind/tilt-local-dev-storage-policy.yaml b/examples/deployments/k8s/kind/tilt-local-dev-storage-policy.yaml index b75c774d9..b55ab0269 100644 --- a/examples/deployments/k8s/kind/tilt-local-dev-storage-policy.yaml +++ b/examples/deployments/k8s/kind/tilt-local-dev-storage-policy.yaml @@ -21,7 +21,7 @@ spec: storageClassName: local-path persistentVolumeReclaimPolicy: Retain local: - path: "/usr/share/elasticsearch/data" + path: "/usr/share/elasticsearch/data" nodeAffinity: required: nodeSelectorTerms: @@ -47,12 +47,12 @@ spec: persistentVolumeReclaimPolicy: Retain local: path: "/var/lib/postgresql/data" - nodeAffinity: + nodeAffinity: required: nodeSelectorTerms: - matchExpressions: - key: kubernetes.io/hostname - operator: In + operator: In values: - kind-control-plane --- diff --git a/examples/deployments/k8s/minio-dev.yaml b/examples/deployments/k8s/minio-dev.yaml index a4c0ef744..19db3a796 100644 --- a/examples/deployments/k8s/minio-dev.yaml +++ b/examples/deployments/k8s/minio-dev.yaml @@ -4,7 +4,7 @@ # The `spec.containers[0].args` contains the command run on the pod # The `/data` directory corresponds to the `spec.containers[0].volumeMounts[0].mountPath` # That mount path corresponds to a Kubernetes HostPath which binds `/data` to a local drive or volume on the worker node where the pod runs -# +# apiVersion: v1 kind: Pod metadata: @@ -33,7 +33,7 @@ spec: command: - /bin/bash - -c - args: + args: - minio server /data --console-address :9090 env: - name: MINIO_ACCESS_KEY_FILE diff --git a/examples/document_extraction/setup_workspace.ipynb b/examples/document_extraction/setup_workspace.ipynb index 0d630cf95..8ee10d4ed 100644 --- a/examples/document_extraction/setup_workspace.ipynb +++ b/examples/document_extraction/setup_workspace.ipynb @@ -61,7 +61,7 @@ "from pathlib import Path\n", "\n", "# Connect to Extralit using default credentials\n", - "client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key='extralit.apikey')\n", + "client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key=\"extralit.apikey\")\n", "\n", "print(f\"Successfully connected to Extralit at {client.api_url}\")" ] @@ -99,8 +99,8 @@ " # Create the workspace\n", " workspace = new_workspace.create()\n", "\n", - " print(f\"Workspace '{workspace_name}' created successfully with ID: {created_workspace.id}\")\n", - "except Exception as e:\n", + " print(f\"Workspace '{workspace_name}' created successfully with ID: {workspace.id}\")\n", + "except Exception:\n", " print(f\"Workspace '{workspace_name}' already exists. Using the existing workspace.\")\n", " workspace = client.workspaces(workspace_name)" ] @@ -276,18 +276,20 @@ "# Create empty PDF files - in reality, these would be your actual PDFs\n", "with open(pdf_file1, \"wb\") as f:\n", " f.write(b\"%PDF-1.5\\n%Example Document 1\")\n", - " \n", + "\n", "with open(pdf_file2, \"wb\") as f:\n", " f.write(b\"%PDF-1.5\\n%Example Document 2\")\n", "\n", "# Create a reference dataframe with metadata for the PDFs\n", - "references_df = pd.DataFrame({\n", - " \"reference\": [\"smith2023first\", \"johnson2022analysis\"],\n", - " \"file_path\": [str(pdf_file1), str(pdf_file2)],\n", - " \"title\": [\"Study on Sample Data\", \"Analysis of Experimental Results\"],\n", - " \"authors\": [\"Smith, J.\", \"Johnson, A.\"],\n", - " \"year\": [2023, 2022]\n", - "})\n", + "references_df = pd.DataFrame(\n", + " {\n", + " \"reference\": [\"smith2023first\", \"johnson2022analysis\"],\n", + " \"file_path\": [str(pdf_file1), str(pdf_file2)],\n", + " \"title\": [\"Study on Sample Data\", \"Analysis of Experimental Results\"],\n", + " \"authors\": [\"Smith, J.\", \"Johnson, A.\"],\n", + " \"year\": [2023, 2022],\n", + " }\n", + ")\n", "\n", "# Save the dataframe to a temporary CSV file\n", "references_csv = Path(temp_dir) / \"references.csv\"\n", @@ -328,13 +330,15 @@ "# Import the documents into the workspace\n", "# For demonstration purposes, we'll use the extralit client directly\n", "# Initialize the extralit client with the same credentials\n", - "extralit_client = ex.Extralit(api_url=\"http://localhost:6900/\", api_key='extralit.apikey')\n", + "extralit_client = ex.Extralit(\n", + " api_url=\"http://localhost:6900/\", api_key=\"extralit.apikey\"\n", + ")\n", "\n", "# Import the documents\n", "result = extralit_client.import_documents(\n", " workspace=workspace_name,\n", " papers=str(references_csv),\n", - " metadatas=[\"title\", \"authors\", \"year\"]\n", + " metadatas=[\"title\", \"authors\", \"year\"],\n", ")\n", "\n", "print(f\"Imported {len(result)} documents into workspace '{workspace_name}'\")" @@ -357,37 +361,45 @@ "metadata": {}, "outputs": [], "source": [ + "from extralit.extraction.models.schema import SchemaStructure\n", + "\n", + "\n", "# Define a simple schema using Pandera\n", "class Publication(pa.DataFrameModel):\n", " \"\"\"\n", " General information about the publication, extracted once per paper.\n", " \"\"\"\n", + "\n", " reference: Index[str] = pa.Field(unique=True, check_name=True)\n", " title: Series[str] = pa.Field()\n", " authors: Series[str] = pa.Field()\n", " publication_year: Series[int] = pa.Field(ge=1900, le=2100)\n", " doi: Series[str] = pa.Field(nullable=True)\n", - " \n", + "\n", " class Config:\n", - " singleton = {'enabled': True} # Indicates this is a document-level schema\n", + " singleton = {\"enabled\": True} # Indicates this is a document-level schema\n", + "\n", "\n", "# Define a second schema for experimental data\n", "class ExperimentalData(pa.DataFrameModel):\n", " \"\"\"\n", " Experimental data extracted from the paper, may appear multiple times.\n", " \"\"\"\n", + "\n", " experiment_id: Series[str] = pa.Field()\n", " sample_size: Series[int] = pa.Field(gt=0)\n", " study_type: Series[str] = pa.Field()\n", " result_value: Series[float] = pa.Field()\n", " significance: Series[float] = pa.Field(le=1.0, ge=0.0)\n", "\n", + "\n", "# Create a schema structure object\n", - "from extralit.extraction.models.schema import SchemaStructure\n", "\n", "# Save schemas to a temporary JSON file\n", "schema_file = Path(temp_dir) / \"schemas.json\"\n", - "schema_structure = SchemaStructure(schemas={\"Publication\": Publication, \"ExperimentalData\": ExperimentalData})\n", + "schema_structure = SchemaStructure(\n", + " schemas={\"Publication\": Publication, \"ExperimentalData\": ExperimentalData}\n", + ")\n", "schema_structure.to_json(schema_file)\n", "\n", "print(f\"Created schema file at {schema_file}\")" @@ -402,8 +414,7 @@ "source": [ "# Upload the schema to the workspace\n", "result = extralit_client.upload_schemas(\n", - " workspace=workspace_name,\n", - " schemas=str(schema_file)\n", + " workspace=workspace_name, schemas=str(schema_file)\n", ")\n", "\n", "print(f\"Uploaded schemas to workspace '{workspace_name}'\")" @@ -438,7 +449,7 @@ " references=references,\n", " text_ocr=[\"default\"], # Using the default text OCR model\n", " table_ocr=[\"default\"], # Using the default table OCR model\n", - " output_dataset=\"PDF_Preprocessing_Results\"\n", + " output_dataset=\"PDF_Preprocessing_Results\",\n", ")\n", "\n", "print(f\"Preprocessing completed for {len(preprocessing_result)} documents\")" @@ -468,7 +479,7 @@ "extraction_result = extract_data(\n", " workspace=workspace_name,\n", " references=references,\n", - " output_dataset=\"Data_Extraction_Results\"\n", + " output_dataset=\"Data_Extraction_Results\",\n", ")\n", "\n", "print(f\"LLM extractions completed for {len(extraction_result)} documents\")" @@ -508,7 +519,7 @@ "# Export the extracted data\n", "extracted_data = extralit_client.export_data(\n", " workspace=workspace_name,\n", - " output=\"temp_output.csv\" # This will save the data to a CSV file\n", + " output=\"temp_output.csv\", # This will save the data to a CSV file\n", ")\n", "\n", "# Display the extracted data\n", diff --git a/extralit-frontend/components/base/base-render-table/RenderTable.vue b/extralit-frontend/components/base/base-render-table/RenderTable.vue index 916245089..fbfaf645b 100644 --- a/extralit-frontend/components/base/base-render-table/RenderTable.vue +++ b/extralit-frontend/components/base/base-render-table/RenderTable.vue @@ -1,6 +1,6 @@