From e140b08018ae3a701d23f5634e01830c3d0c9669 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Wed, 27 Aug 2025 13:02:45 -0700 Subject: [PATCH 01/24] added GET "/datasets/compatible" --- .../api/handlers/v1/datasets/datasets.py | 24 +++++++++++++ .../api/schemas/v1/datasets.py | 34 +++++++++++++++++-- extralit/src/extralit/records/_resource.py | 5 +-- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/extralit-server/src/extralit_server/api/handlers/v1/datasets/datasets.py b/extralit-server/src/extralit_server/api/handlers/v1/datasets/datasets.py index 8a99162d3..be27ce1f9 100644 --- a/extralit-server/src/extralit_server/api/handlers/v1/datasets/datasets.py +++ b/extralit-server/src/extralit_server/api/handlers/v1/datasets/datasets.py @@ -97,6 +97,30 @@ async def list_current_user_datasets( return Datasets(items=dataset_list) +@router.get("/datasets/compatible", response_model=Datasets) +async def list_compatible_datasets( + *, + db: Annotated[AsyncSession, Depends(get_async_db)], + current_user: Annotated[User, Security(auth.get_current_user)], + column_names: Annotated[list[str], Query(description="List of column names to match against existing datasets")], + workspace_id: Annotated[UUID | None, Query(description="Filter by workspace_id")] = None, +): + await authorize(current_user, DatasetPolicy.list(workspace_id)) + + filters = { + "workspace_id": workspace_id, + "status": DatasetStatus.ready, + } + + dataset_list = await datasets.list_datasets( + db, user=current_user, **{k: v for k, v in filters.items() if v is not None} + ) + + all_datasets = Datasets(items=dataset_list) + + return all_datasets.get_compatible_datasets(column_names) + + @router.get("/datasets/{dataset_id}/fields", response_model=Fields) async def list_dataset_fields( *, diff --git a/extralit-server/src/extralit_server/api/schemas/v1/datasets.py b/extralit-server/src/extralit_server/api/schemas/v1/datasets.py index 240691175..7acc8d1c3 100644 --- a/extralit-server/src/extralit_server/api/schemas/v1/datasets.py +++ b/extralit-server/src/extralit_server/api/schemas/v1/datasets.py @@ -156,6 +156,31 @@ def validate(cls, value) -> dict: class Datasets(BaseModel): items: list[Dataset] + def get_compatible_datasets(self, column_names: list[str]) -> "Datasets": + """ + Filter datasets that have compatible mappings based on overlapping column names + """ + compatible_datasets = [] + column_names_set = set(column_names) + + for dataset in self.items: + # The Dataset schema automatically parses mapping from metadata_ + if not dataset.mapping: + continue + + # Get all source column names from the mapping + mapping_sources = set(dataset.mapping.sources) + + # Calculate overlap - require at least 50% overlap + if mapping_sources and column_names_set: + overlap = len(column_names_set.intersection(mapping_sources)) + compatibility_score = overlap / len(column_names_set) + + if compatibility_score >= 0.5: # At least 50% compatibility + compatible_datasets.append(dataset) + + return Datasets(items=compatible_datasets) + class DatasetCreate(BaseModel): name: DatasetName @@ -188,16 +213,19 @@ class HubDatasetMapping(BaseModel): fields: list[HubDatasetMappingItem] = Field(..., min_length=1) metadata: list[HubDatasetMappingItem] | None = [] suggestions: list[HubDatasetMappingItem] | None = [] - external_id: str | None = None + source_id: str | None = Field( + None, + description="Dataset-level source identifier (format: import:{import_id}, dataset:{dataset_id}, hub:{repo_id})", + ) + target_id: str | None = Field(None, description="Dataset-level target identifier for workflow tracking") @property def sources(self) -> list[str]: fields_sources = [field.source for field in self.fields] metadata_sources = [metadata.source for metadata in self.metadata] suggestions_sources = [suggestion.source for suggestion in self.suggestions] - external_id_source = [self.external_id] if self.external_id else [] - return list(set(fields_sources + metadata_sources + suggestions_sources + external_id_source)) + return list(set(fields_sources + metadata_sources + suggestions_sources)) class HubDataset(BaseModel): diff --git a/extralit/src/extralit/records/_resource.py b/extralit/src/extralit/records/_resource.py index 407ced4a9..e4d5ace1a 100644 --- a/extralit/src/extralit/records/_resource.py +++ b/extralit/src/extralit/records/_resource.py @@ -100,8 +100,9 @@ def __init__( def __repr__(self) -> str: return ( - f"Record(id={self.id},status={self.status},fields={self.fields},metadata={self.metadata}," - f"suggestions={self.suggestions},responses={self.responses})" + f"Record(id={self.id}, status={self.status}, " + f"fields={list(self.fields.keys())}, metadata={self.metadata}, " + f"suggestions={self.suggestions}, responses={self.responses})" ) ############################ From 9f45bc131103b0f9c3b6fec89ff99473c447ef06 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Wed, 27 Aug 2025 13:46:05 -0700 Subject: [PATCH 02/24] Add GetImportCompatibleDatasets use case and integrate into dataset configuration --- ...useDatasetConfigurationNameAndWorkspace.ts | 52 +++++++++++++++++++ extralit-frontend/v1/di/di.ts | 3 ++ ...get-import-compatible-datasets-use-case.ts | 46 ++++++++++++++++ .../src/extralit_server/jobs/import_jobs.py | 29 +++++++++-- 4 files changed, 126 insertions(+), 4 deletions(-) create mode 100644 extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts diff --git a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts index 4a53fd7c7..0534583e5 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts +++ b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts @@ -2,16 +2,68 @@ import { useFetch } from "@nuxtjs/composition-api"; import { useResolve } from "ts-injecty"; import { ref } from "vue-demi"; import { GetWorkspacesUseCase } from "~/v1/domain/usecases/get-workspaces-use-case"; +import { GetImportCompatibleDatasetsUseCase } from "~/v1/domain/usecases/get-import-compatible-datasets-use-case"; +import { Dataset } from "~/v1/domain/entities/dataset/Dataset"; export const useDatasetConfigurationNameAndWorkspace = () => { const workspaces = ref([]); + const compatibleDatasets = ref([]); + const isLoadingCompatibleDatasets = ref(false); + const workflowType = ref<"create" | "append">("create"); + const selectedTargetDataset = ref(null); + const getWorkspacesUseCase = useResolve(GetWorkspacesUseCase); + const getImportCompatibleDatasetsUseCase = useResolve(GetImportCompatibleDatasetsUseCase); useFetch(async () => { workspaces.value = await getWorkspacesUseCase.execute(); }); + const loadCompatibleDatasets = async (columnNames: string[], workspaceId?: string) => { + if (!columnNames.length) { + compatibleDatasets.value = []; + return; + } + + try { + isLoadingCompatibleDatasets.value = true; + compatibleDatasets.value = await getImportCompatibleDatasetsUseCase.execute({ + columnNames, + workspaceId, + }); + } catch (error) { + console.error("Error loading compatible datasets:", error); + compatibleDatasets.value = []; + } finally { + isLoadingCompatibleDatasets.value = false; + } + }; + + const onWorkflowTypeChange = async (columnNames: string[], workspaceId?: string) => { + if (workflowType.value === "append") { + await loadCompatibleDatasets(columnNames, workspaceId); + } else { + compatibleDatasets.value = []; + selectedTargetDataset.value = null; + } + }; + + const getColumnNamesFromImportData = (importData: any): string[] => { + if (!importData?.data?.data?.length) return []; + + // Get column names from the first row of import data + const firstRow = importData.data.data[0]; + return Object.keys(firstRow); + }; + return { workspaces, + compatibleDatasets, + isLoadingCompatibleDatasets, + workflowType, + selectedTargetDataset, + loadCompatibleDatasets, + onWorkflowTypeChange, + getColumnNamesFromImportData, }; }; diff --git a/extralit-frontend/v1/di/di.ts b/extralit-frontend/v1/di/di.ts index 96c46d206..3e77c4406 100644 --- a/extralit-frontend/v1/di/di.ts +++ b/extralit-frontend/v1/di/di.ts @@ -66,6 +66,7 @@ import { UpdateMetadataSettingUseCase } from "@/v1/domain/usecases/dataset-setti import { OAuthLoginUseCase } from "@/v1/domain/usecases/oauth-login-use-case"; import { GetEnvironmentUseCase } from "@/v1/domain/usecases/get-environment-use-case"; import { GetWorkspacesUseCase } from "@/v1/domain/usecases/get-workspaces-use-case"; +import { GetImportCompatibleDatasetsUseCase } from "@/v1/domain/usecases/get-import-compatible-datasets-use-case"; import { GetDatasetQuestionsGroupedUseCase } from "@/v1/domain/usecases/get-dataset-questions-grouped-use-case"; import { GetDatasetFieldsGroupedUseCase } from "@/v1/domain/usecases/get-dataset-fields-grouped-use-case"; import { GetImportAnalysisUseCase } from "@/v1/domain/usecases/get-import-analysis-use-case"; @@ -113,6 +114,8 @@ export const loadDependencyContainer = (context: Context) => { register(GetWorkspacesUseCase).withDependencies(WorkspaceRepository, useWorkspaces).build(), + register(GetImportCompatibleDatasetsUseCase).withDependency(useAxios).build(), + register(GetDatasetsUseCase).withDependencies(DatasetRepository, useDatasets).build(), register(GetDocumentByRecordMetadataUseCase).withDependencies(DocumentRepository, useDocument).build(), diff --git a/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts b/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts new file mode 100644 index 000000000..08a964829 --- /dev/null +++ b/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts @@ -0,0 +1,46 @@ +import { type NuxtAxiosInstance } from "@nuxtjs/axios"; +import { Dataset } from "../entities/dataset/Dataset"; + +export interface GetImportCompatibleDatasetsParams { + columnNames: string[]; + workspaceId?: string; +} + +export class GetImportCompatibleDatasetsUseCase { + constructor(private readonly axios: NuxtAxiosInstance) {} + + async execute(params: GetImportCompatibleDatasetsParams): Promise { + try { + const response = await this.axios.get("/api/v1/datasets/compatible", { + params: { + column_names: params.columnNames, + workspace_id: params.workspaceId, + }, + }); + + return (response.data.items || []).map( + (datasetFromBackend: any) => + new Dataset( + datasetFromBackend.id, + datasetFromBackend.name, + datasetFromBackend.guidelines, + datasetFromBackend.status, + datasetFromBackend.workspace_id, + datasetFromBackend.workspace_name, + datasetFromBackend.allow_extra_metadata, + { + strategy: datasetFromBackend.distribution.strategy, + minSubmitted: datasetFromBackend.distribution.min_submitted, + }, + datasetFromBackend.metadata, + datasetFromBackend.inserted_at, + datasetFromBackend.updated_at, + datasetFromBackend.last_activity_at + ) + ); + } catch (error) { + console.error("Error fetching compatible datasets:", error); + throw new Error("Failed to fetch compatible datasets"); + } + } +} diff --git a/extralit-server/src/extralit_server/jobs/import_jobs.py b/extralit-server/src/extralit_server/jobs/import_jobs.py index 93f900dce..ef418b6d8 100644 --- a/extralit-server/src/extralit_server/jobs/import_jobs.py +++ b/extralit-server/src/extralit_server/jobs/import_jobs.py @@ -100,10 +100,21 @@ def _row_to_record_schema(self, row: dict[str, Any], dataset: Dataset) -> Record ) def _row_external_id(self, row: dict[str, Any]) -> str: - if not self.mapping.external_id: - return f"import_history_{self.import_history.id}_{self._next_row_idx()}" + # Try to create a meaningful external_id from metadata fields, typically "reference" + if row.get("reference"): + return str(row["reference"]) - return str(row.get(self.mapping.external_id, f"import_history_{self.import_history.id}_{self._next_row_idx()}")) + # Create composite key from multiple metadata fields if available + key_parts = [] + for mapping_metadata in self.mapping.metadata or []: + if row.get(mapping_metadata.source): + key_parts.append(f"{mapping_metadata.source}_{row[mapping_metadata.source]}") + + if key_parts: + return "_".join(key_parts) + + # Fallback to sequential ID when no meaningful metadata available + return f"import_history_{self.import_history.id}_{self._next_row_idx()}" def _row_fields(self, row: dict[str, Any], dataset: Dataset) -> dict[str, Any]: fields = {} @@ -191,6 +202,16 @@ async def import_dataset_from_import_history_job(history_id: UUID, dataset_id: U ) async with SearchEngine.get_by_name(settings.search_engine) as search_engine: - parsed_mapping = HubDatasetMapping.model_validate(mapping) + # Add source_id provenance to the mapping + mapping_with_provenance = {**mapping} + mapping_with_provenance["source_id"] = f"import:{history_id}" + mapping_with_provenance["target_id"] = None # Set to None for incoming datasets + + parsed_mapping = HubDatasetMapping.model_validate(mapping_with_provenance) + + # Store the mapping with provenance in dataset metadata for persistence + dataset.metadata_ = dataset.metadata_ or {} + dataset.metadata_["mapping"] = parsed_mapping.model_dump() + await dataset.save(db) await ImportHistoryDataset(import_history, parsed_mapping).import_to(db, search_engine, dataset) From 59b8dcc3636c702a67a1535f3a8237c6b09cd1ab Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Wed, 27 Aug 2025 17:22:36 -0700 Subject: [PATCH 03/24] Enhance dataset creation workflow with update functionality and new dialog. - Added DatasetUpdateDialog component for updating datasets, integrated data source selection, and improved dataset configuration forms. - Updated translations for button labels and added validation for compatible datasets. --- .../configuration/DatasetConfiguration.vue | 8 +- .../DatasetConfigurationDialog.vue | 15 +- .../DatasetConfigurationForm.vue | 67 +++- .../configuration/DatasetUpdateDialog.vue | 364 ++++++++++++++++++ ...useDatasetConfigurationNameAndWorkspace.ts | 15 +- extralit-frontend/translation/en.js | 9 +- ...get-import-compatible-datasets-use-case.ts | 36 +- .../api/handlers/v1/datasets/datasets.py | 18 +- .../api/schemas/v1/datasets.py | 5 + 9 files changed, 474 insertions(+), 63 deletions(-) create mode 100644 extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfiguration.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfiguration.vue index 097658810..e7d2a9dbe 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfiguration.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfiguration.vue @@ -79,7 +79,11 @@ @@ -88,7 +92,7 @@ - + + diff --git a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts index 0534583e5..3f453578c 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts +++ b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts @@ -3,14 +3,14 @@ import { useResolve } from "ts-injecty"; import { ref } from "vue-demi"; import { GetWorkspacesUseCase } from "~/v1/domain/usecases/get-workspaces-use-case"; import { GetImportCompatibleDatasetsUseCase } from "~/v1/domain/usecases/get-import-compatible-datasets-use-case"; -import { Dataset } from "~/v1/domain/entities/dataset/Dataset"; +import { BackendDataset } from "~/v1/infrastructure/types/dataset"; export const useDatasetConfigurationNameAndWorkspace = () => { const workspaces = ref([]); - const compatibleDatasets = ref([]); + const compatibleDatasets = ref([]); const isLoadingCompatibleDatasets = ref(false); const workflowType = ref<"create" | "append">("create"); - const selectedTargetDataset = ref(null); + const selectedTargetDataset = ref(null); const getWorkspacesUseCase = useResolve(GetWorkspacesUseCase); const getImportCompatibleDatasetsUseCase = useResolve(GetImportCompatibleDatasetsUseCase); @@ -48,14 +48,6 @@ export const useDatasetConfigurationNameAndWorkspace = () => { } }; - const getColumnNamesFromImportData = (importData: any): string[] => { - if (!importData?.data?.data?.length) return []; - - // Get column names from the first row of import data - const firstRow = importData.data.data[0]; - return Object.keys(firstRow); - }; - return { workspaces, compatibleDatasets, @@ -64,6 +56,5 @@ export const useDatasetConfigurationNameAndWorkspace = () => { selectedTargetDataset, loadCompatibleDatasets, onWorkflowTypeChange, - getColumnNamesFromImportData, }; }; diff --git a/extralit-frontend/translation/en.js b/extralit-frontend/translation/en.js index 156a781ac..6379d56f5 100644 --- a/extralit-frontend/translation/en.js +++ b/extralit-frontend/translation/en.js @@ -329,13 +329,19 @@ export default { atLeastOneRequired: "At least one required question is needed.", hasInvalidQuestions: "Some questions are invalid", createDataset: "Create the dataset in Extralit", + updateDataset: "Add to the dataset in Extralit", datasetName: "Dataset name", name: "Name", assignWorkspace: "Assign a workspace", selectSplit: "Select a split", recordWarning: "The created dataset will include the first 10K rows and further records can be logged via the python SDK.", - button: "Create dataset", + createButton: "Create dataset", + updateButton: "Update dataset", + sourceField: "From", + targetField: "To", + noMapping: "No mapping", + importSummary: "You are about to add new and update existing records into the dataset.", fields: "Fields", metadata: "Metadata Fields", metadataDescription: "Select fields to include as metadata for filtering and sorting", @@ -355,6 +361,7 @@ export default { none: "None", noWorkspaces: "Please, follow this guide to create a workspace", + noCompatibleDatasets: "No compatible dataset for this import.", }, exportToHub: { dialogTitle: "Push to Hugging Face Hub", diff --git a/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts b/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts index 08a964829..f478134e9 100644 --- a/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts +++ b/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts @@ -1,5 +1,6 @@ import { type NuxtAxiosInstance } from "@nuxtjs/axios"; -import { Dataset } from "../entities/dataset/Dataset"; +import type { Response } from "~/v1/infrastructure/types/api"; +import type { BackendDataset } from "~/v1/infrastructure/types/dataset"; export interface GetImportCompatibleDatasetsParams { columnNames: string[]; @@ -7,37 +8,16 @@ export interface GetImportCompatibleDatasetsParams { } export class GetImportCompatibleDatasetsUseCase { - constructor(private readonly axios: NuxtAxiosInstance) {} + constructor(private readonly axios: NuxtAxiosInstance) { } - async execute(params: GetImportCompatibleDatasetsParams): Promise { + async execute(params: GetImportCompatibleDatasetsParams): Promise { try { - const response = await this.axios.get("/api/v1/datasets/compatible", { - params: { - column_names: params.columnNames, - workspace_id: params.workspaceId, - }, + const { data } = await this.axios.post>("/v1/datasets/compatible", { + column_names: params.columnNames, + workspace_id: params.workspaceId, }); - return (response.data.items || []).map( - (datasetFromBackend: any) => - new Dataset( - datasetFromBackend.id, - datasetFromBackend.name, - datasetFromBackend.guidelines, - datasetFromBackend.status, - datasetFromBackend.workspace_id, - datasetFromBackend.workspace_name, - datasetFromBackend.allow_extra_metadata, - { - strategy: datasetFromBackend.distribution.strategy, - minSubmitted: datasetFromBackend.distribution.min_submitted, - }, - datasetFromBackend.metadata, - datasetFromBackend.inserted_at, - datasetFromBackend.updated_at, - datasetFromBackend.last_activity_at - ) - ); + return data.items || [] } catch (error) { console.error("Error fetching compatible datasets:", error); throw new Error("Failed to fetch compatible datasets"); diff --git a/extralit-server/src/extralit_server/api/handlers/v1/datasets/datasets.py b/extralit-server/src/extralit_server/api/handlers/v1/datasets/datasets.py index be27ce1f9..e2c2281b2 100644 --- a/extralit-server/src/extralit_server/api/handlers/v1/datasets/datasets.py +++ b/extralit-server/src/extralit_server/api/handlers/v1/datasets/datasets.py @@ -21,9 +21,7 @@ from extralit_server.api.policies.v1 import DatasetPolicy, MetadataPropertyPolicy, authorize, is_authorized from extralit_server.api.schemas.v1.datasets import ( - Dataset as DatasetSchema, -) -from extralit_server.api.schemas.v1.datasets import ( + CompatibleDatasetsRequest, DatasetCreate, DatasetMetrics, DatasetProgress, @@ -34,6 +32,9 @@ ImportHistoryDataset, UsersProgress, ) +from extralit_server.api.schemas.v1.datasets import ( + Dataset as DatasetSchema, +) from extralit_server.api.schemas.v1.fields import Field, FieldCreate, Fields from extralit_server.api.schemas.v1.jobs import Job as JobSchema from extralit_server.api.schemas.v1.metadata_properties import ( @@ -97,18 +98,17 @@ async def list_current_user_datasets( return Datasets(items=dataset_list) -@router.get("/datasets/compatible", response_model=Datasets) +@router.post("/datasets/compatible", response_model=Datasets) async def list_compatible_datasets( *, + request: CompatibleDatasetsRequest, db: Annotated[AsyncSession, Depends(get_async_db)], current_user: Annotated[User, Security(auth.get_current_user)], - column_names: Annotated[list[str], Query(description="List of column names to match against existing datasets")], - workspace_id: Annotated[UUID | None, Query(description="Filter by workspace_id")] = None, ): - await authorize(current_user, DatasetPolicy.list(workspace_id)) + await authorize(current_user, DatasetPolicy.list(request.workspace_id)) filters = { - "workspace_id": workspace_id, + "workspace_id": request.workspace_id, "status": DatasetStatus.ready, } @@ -118,7 +118,7 @@ async def list_compatible_datasets( all_datasets = Datasets(items=dataset_list) - return all_datasets.get_compatible_datasets(column_names) + return all_datasets.get_compatible_datasets(request.column_names) @router.get("/datasets/{dataset_id}/fields", response_model=Fields) diff --git a/extralit-server/src/extralit_server/api/schemas/v1/datasets.py b/extralit-server/src/extralit_server/api/schemas/v1/datasets.py index 7acc8d1c3..1a4ba3bce 100644 --- a/extralit-server/src/extralit_server/api/schemas/v1/datasets.py +++ b/extralit-server/src/extralit_server/api/schemas/v1/datasets.py @@ -246,3 +246,8 @@ class HubDatasetExport(BaseModel): class ImportHistoryDataset(BaseModel): history_id: UUID = Field(..., description="The ID of the import history to import from") mapping: HubDatasetMapping = Field(..., description="The mapping configuration for the import") + + +class CompatibleDatasetsRequest(BaseModel): + column_names: list[str] = Field(..., description="List of column names to match against existing datasets") + workspace_id: UUID | None = Field(None, description="Filter by workspace_id") From 346487c7f1030f91dc9ed254c5a47a8b72a4ce5c Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Thu, 28 Aug 2025 11:04:15 -0700 Subject: [PATCH 04/24] latest --- .../dataset-creation/configuration/DatasetUpdateDialog.vue | 7 +++---- .../useDatasetConfigurationNameAndWorkspace.ts | 2 +- extralit-frontend/package.json | 2 +- extralit-frontend/translation/en.js | 2 +- .../v1/domain/usecases/create-dataset-use-case.ts | 2 +- 5 files changed, 7 insertions(+), 8 deletions(-) diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue index 778a27bae..37908459b 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue @@ -93,13 +93,14 @@ diff --git a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts index 3f453578c..36bed599f 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts +++ b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts @@ -4,6 +4,7 @@ import { ref } from "vue-demi"; import { GetWorkspacesUseCase } from "~/v1/domain/usecases/get-workspaces-use-case"; import { GetImportCompatibleDatasetsUseCase } from "~/v1/domain/usecases/get-import-compatible-datasets-use-case"; import { BackendDataset } from "~/v1/infrastructure/types/dataset"; +import { DatasetCreation } from "~/v1/domain/entities/hub/DatasetCreation"; export const useDatasetConfigurationNameAndWorkspace = () => { const workspaces = ref([]); @@ -32,7 +33,6 @@ export const useDatasetConfigurationNameAndWorkspace = () => { workspaceId, }); } catch (error) { - console.error("Error loading compatible datasets:", error); compatibleDatasets.value = []; } finally { isLoadingCompatibleDatasets.value = false; diff --git a/extralit-frontend/package.json b/extralit-frontend/package.json index 4aaccac37..72973c92d 100644 --- a/extralit-frontend/package.json +++ b/extralit-frontend/package.json @@ -128,4 +128,4 @@ "engines": { "node": ">=18.16.1" } -} +} \ No newline at end of file diff --git a/extralit-frontend/translation/en.js b/extralit-frontend/translation/en.js index 6379d56f5..0f352e73c 100644 --- a/extralit-frontend/translation/en.js +++ b/extralit-frontend/translation/en.js @@ -350,7 +350,7 @@ export default { requiredField: "Required field", requiredQuestion: "Required question", select: "Select", - mapToColumn: "Map to column", + mapToColumn: "Fields mapping", applyToaAField: "Annotate spans on:", subset: "Subset", selectSubset: "Your can create a dataset from only one subset.", diff --git a/extralit-frontend/v1/domain/usecases/create-dataset-use-case.ts b/extralit-frontend/v1/domain/usecases/create-dataset-use-case.ts index 753094037..8cebe2c63 100644 --- a/extralit-frontend/v1/domain/usecases/create-dataset-use-case.ts +++ b/extralit-frontend/v1/domain/usecases/create-dataset-use-case.ts @@ -55,7 +55,7 @@ export class CreateDatasetUseCase { const progress = await this.datasetRepository.getProgress(datasetCreated); - if (progress.hasAtLeastTenRecord) { + if (progress.total) { break; } From eafa6f6488961cc6760f64149cdb02ee1fb7b256 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Thu, 28 Aug 2025 11:57:21 -0700 Subject: [PATCH 05/24] Implement dataset update functionality and improve error handling. - Introduced UpdateDatasetUseCase for handling dataset updates. - Enhanced DatasetConfigurationForm and DatasetUpdateDialog to support source and target dataset selection. - Added error handling and validation for dataset updates in the relevant components. - Updated useDatasetConfigurationForm to include the new update method. --- .../container/mode/useDocumentViewModel.ts | 6 ----- .../DatasetConfigurationForm.vue | 9 ++++--- .../configuration/DatasetUpdateDialog.vue | 11 +++++++- .../useDatasetConfigurationForm.ts | 25 +++++++++++++++++++ ...useDatasetConfigurationNameAndWorkspace.ts | 14 +++++++++++ extralit-frontend/v1/di/di.ts | 3 +++ .../usecases/update-dataset-use-case.ts | 12 +++++++++ 7 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 extralit-frontend/v1/domain/usecases/update-dataset-use-case.ts diff --git a/extralit-frontend/components/features/annotation/container/mode/useDocumentViewModel.ts b/extralit-frontend/components/features/annotation/container/mode/useDocumentViewModel.ts index fe52361cf..ace51bd5f 100644 --- a/extralit-frontend/components/features/annotation/container/mode/useDocumentViewModel.ts +++ b/extralit-frontend/components/features/annotation/container/mode/useDocumentViewModel.ts @@ -35,12 +35,6 @@ export const useDocumentViewModel = (props: { record: any }) => { await getDocument.setDocument(params); } catch (e) { - const identifier = metadata?.pmid || metadata?.doi || metadata?.doc_id || metadata?.reference || "unknown"; - console.error(`Error fetching document with identifier "${identifier}":`, e); - notification.notify({ - message: `Error fetching document with identifier "${identifier}"`, - type: "danger", - }); clearDocument(); } }; diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue index c86abf3c4..ee5940629 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue @@ -161,9 +161,12 @@ export default { this.isUpdateWorkflow = false; this.visibleDatasetUpdateDialog = false; }, - updateDataset() { - console.log('Updating dataset', this.dataset) - this.create(this.dataset); + updateDataset(updateData) { + this.closeUpdateDialog(); + + // Use the source dataset from the dialog event, or fall back to this.dataset + const sourceDataset = updateData.source || this.dataset; + this.update(sourceDataset, updateData.targetDataset.id); }, generateName(type: string, number: string | number): string { const typeName = this.$t(`config.questionId.${type}`); diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue index 37908459b..26c1fb9b2 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetUpdateDialog.vue @@ -173,8 +173,17 @@ export default { return; } + // Convert BackendDataset to target dataset info with workspace + const targetDataset = this.convertBackendDatasetToTargetInfo(this.selectedTargetDataset, this.selectedWorkspace); + + if (!targetDataset) { + this.validationError = "Missing target dataset or workspace information"; + return; + } + this.$emit("update-dataset", { - dataset: this.selectedTargetDataset, + targetDataset, + source: this.dataset, }); }, async onWorkspaceChange() { diff --git a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationForm.ts b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationForm.ts index 96e9dd3d0..0bde411d2 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationForm.ts +++ b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationForm.ts @@ -3,6 +3,7 @@ import { ref } from "vue-demi"; import { availableFieldTypes } from "~/v1/domain/entities/hub/FieldCreation"; import { availableQuestionTypes } from "~/v1/domain/entities/hub/QuestionCreation"; import { CreateDatasetUseCase } from "~/v1/domain/usecases/create-dataset-use-case"; +import { UpdateDatasetUseCase } from "~/v1/domain/usecases/update-dataset-use-case"; import { useRoutes } from "~/v1/infrastructure/services"; import { DatasetCreation } from "~/v1/domain/entities/hub/DatasetCreation"; import { ImportHistoryDetails } from "~/v1/domain/entities/import/ImportHistoryDetails"; @@ -11,6 +12,7 @@ export const useDatasetConfigurationForm = () => { const isLoading = ref(false); const { goToFeedbackTaskAnnotationPage } = useRoutes(); const createDatasetUseCase = useResolve(CreateDatasetUseCase); + const updateDatasetUseCase = useResolve(UpdateDatasetUseCase); const create = async (dataset: DatasetCreation, importData?: ImportHistoryDetails) => { isLoading.value = true; @@ -39,10 +41,33 @@ export const useDatasetConfigurationForm = () => { } }; + const update = async (dataset: DatasetCreation, targetDatasetId: string) => { + isLoading.value = true; + + try { + const jobId = await updateDatasetUseCase.execute(dataset, targetDatasetId); + + if (!jobId) { + console.error("Failed to start dataset update job"); + return; + } + + console.log("Dataset update job started with ID:", jobId); + + goToFeedbackTaskAnnotationPage(targetDatasetId); + } catch (error) { + console.error("Failed to update dataset:", error); + throw error; + } finally { + isLoading.value = false; + } + }; + return { availableFieldTypes, availableQuestionTypes, create, + update, isLoading, }; }; diff --git a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts index 36bed599f..fc5abbd29 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts +++ b/extralit-frontend/components/features/dataset-creation/configuration/useDatasetConfigurationNameAndWorkspace.ts @@ -48,6 +48,19 @@ export const useDatasetConfigurationNameAndWorkspace = () => { } }; + const convertBackendDatasetToTargetInfo = (backendDataset: BackendDataset | null, selectedWorkspace: any) => { + if (!backendDataset || !selectedWorkspace) return null; + + return { + id: backendDataset.id, + name: backendDataset.name, + workspace: { + id: selectedWorkspace.id, + name: selectedWorkspace.name, + }, + }; + }; + return { workspaces, compatibleDatasets, @@ -56,5 +69,6 @@ export const useDatasetConfigurationNameAndWorkspace = () => { selectedTargetDataset, loadCompatibleDatasets, onWorkflowTypeChange, + convertBackendDatasetToTargetInfo, }; }; diff --git a/extralit-frontend/v1/di/di.ts b/extralit-frontend/v1/di/di.ts index 3e77c4406..6a819c978 100644 --- a/extralit-frontend/v1/di/di.ts +++ b/extralit-frontend/v1/di/di.ts @@ -77,6 +77,7 @@ import { GetImportHistoryDetailsUseCase } from "@/v1/domain/usecases/get-import- import { GetJobStatusUseCase } from "@/v1/domain/usecases/get-job-status-use-case"; import { LoadUserUseCase } from "@/v1/domain/usecases/load-user-use-case"; import { CreateDatasetUseCase } from "@/v1/domain/usecases/create-dataset-use-case"; +import { UpdateDatasetUseCase } from "@/v1/domain/usecases/update-dataset-use-case"; import { GetFirstRecordFromHub } from "@/v1/domain/usecases/get-first-record-from-hub"; import { ExportDatasetToHubUseCase } from "@/v1/domain/usecases/export-dataset-to-hub-use-case"; import { AuthLoginUseCase } from "@/v1/domain/usecases/auth-login-use-case"; @@ -210,6 +211,8 @@ export const loadDependencyContainer = (context: Context) => { .withDependencies(DatasetRepository, WorkspaceRepository, QuestionRepository, FieldRepository, MetadataRepository) .build(), + register(UpdateDatasetUseCase).withDependency(DatasetRepository).build(), + register(GetFirstRecordFromHub).withDependency(HubRepository).build(), register(ExportDatasetToHubUseCase).withDependencies(DatasetRepository, useLocalStorage).build(), diff --git a/extralit-frontend/v1/domain/usecases/update-dataset-use-case.ts b/extralit-frontend/v1/domain/usecases/update-dataset-use-case.ts new file mode 100644 index 000000000..39f2ca604 --- /dev/null +++ b/extralit-frontend/v1/domain/usecases/update-dataset-use-case.ts @@ -0,0 +1,12 @@ +import { IDatasetRepository, JobId } from "../services/IDatasetRepository"; +import { DatasetCreation } from "../entities/hub/DatasetCreation"; + + +export class UpdateDatasetUseCase { + constructor(private readonly datasetRepository: IDatasetRepository) { } + + async execute(dataset: DatasetCreation, targetDatasetId: string): Promise { + + return await this.datasetRepository.import(targetDatasetId, dataset); + } +} \ No newline at end of file From 99ad0bd61a5eac070972c89c03f08073c3c83b32 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Thu, 28 Aug 2025 12:35:47 -0700 Subject: [PATCH 06/24] refactor --- .pre-commit-config.yaml | 2 +- .../dataset-creation/configuration/DatasetConfigurationForm.vue | 1 + .../tests/unit/contexts/{ => documents}/test_imports.py | 0 3 files changed, 2 insertions(+), 1 deletion(-) rename extralit-server/tests/unit/contexts/{ => documents}/test_imports.py (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 618ac975f..a21ad8b7f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -75,7 +75,7 @@ repos: hooks: - id: frontend-lint name: "Lint and fix extralit-frontend files" - entry: bash -c 'cd extralit-frontend && npx eslint --fix "${@#extralit-frontend/}" || true' + entry: bash -c 'cd extralit-frontend && npx eslint --fix --cache "$@" || true' language: system files: '^extralit-frontend/.*\.(js|ts|vue)$' pass_filenames: true diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue index ee5940629..e8f0450a9 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue @@ -326,6 +326,7 @@ export default { &__button-area { display: flex; + gap: 1rem; .button { width: 100%; diff --git a/extralit-server/tests/unit/contexts/test_imports.py b/extralit-server/tests/unit/contexts/documents/test_imports.py similarity index 100% rename from extralit-server/tests/unit/contexts/test_imports.py rename to extralit-server/tests/unit/contexts/documents/test_imports.py From 9308c883c45f5ef722251409da2540933ad18611 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Thu, 28 Aug 2025 12:40:00 -0700 Subject: [PATCH 07/24] Refactor dataset creation components and introduce DatasetCreateDialog. - Renamed DatasetConfigurationDialog to DatasetCreateDialog for clarity. - Added new DatasetCreateDialog component to handle dataset creation with improved UI and validation. - Updated useDatasetConfigurationNameAndWorkspace to remove unused imports. --- .../DatasetConfigurationForm.vue | 2 +- ...tionDialog.vue => DatasetCreateDialog.vue} | 0 ...useDatasetConfigurationNameAndWorkspace.ts | 1 - ...get-import-compatible-datasets-use-case.ts | 1 - .../tests/unit/contexts/test_records_bulk.py | 302 ++++++++++++++++++ 5 files changed, 303 insertions(+), 3 deletions(-) rename extralit-frontend/components/features/dataset-creation/configuration/{DatasetConfigurationDialog.vue => DatasetCreateDialog.vue} (100%) create mode 100644 extralit-server/tests/unit/contexts/test_records_bulk.py diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue index e8f0450a9..9932e1407 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue @@ -72,7 +72,7 @@ /> - { const workspaces = ref([]); diff --git a/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts b/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts index f478134e9..175e2cb37 100644 --- a/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts +++ b/extralit-frontend/v1/domain/usecases/get-import-compatible-datasets-use-case.ts @@ -19,7 +19,6 @@ export class GetImportCompatibleDatasetsUseCase { return data.items || [] } catch (error) { - console.error("Error fetching compatible datasets:", error); throw new Error("Failed to fetch compatible datasets"); } } diff --git a/extralit-server/tests/unit/contexts/test_records_bulk.py b/extralit-server/tests/unit/contexts/test_records_bulk.py new file mode 100644 index 000000000..8d5df94ca --- /dev/null +++ b/extralit-server/tests/unit/contexts/test_records_bulk.py @@ -0,0 +1,302 @@ +# Copyright 2024-present, Extralit Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from extralit_server.api.schemas.v1.records import RecordUpsert +from extralit_server.api.schemas.v1.records_bulk import RecordsBulkUpsert +from extralit_server.contexts.records_bulk import UpsertRecordsBulk +from extralit_server.enums import DatasetStatus +from extralit_server.models import Record +from extralit_server.search_engine import SearchEngine +from tests.factories import DatasetFactory, RecordFactory, TextFieldFactory + + +@pytest.mark.asyncio +class TestUpsertRecordsBulk: + async def test_upsert_records_bulk_with_duplicate_external_ids( + self, db: AsyncSession, mock_search_engine: SearchEngine + ): + """Tests that records with the same external_id are updated instead of creating duplicates.""" + dataset = await DatasetFactory.create(status=DatasetStatus.ready) + await TextFieldFactory.create(name="text-field", dataset=dataset) + + # Create initial record with external_id + await RecordFactory.create(fields={"text-field": "original value"}, external_id="duplicate-id", dataset=dataset) + + # Verify we have 1 record initially + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 1 + + # Create UpsertRecordsBulk instance + upsert_bulk = UpsertRecordsBulk(db, mock_search_engine) + + # Attempt to upsert records with the same external_id + bulk_upsert = RecordsBulkUpsert( + items=[ + RecordUpsert(external_id="duplicate-id", fields={"text-field": "updated value 1"}), + RecordUpsert(external_id="duplicate-id", fields={"text-field": "updated value 2"}), + ] + ) + + await upsert_bulk.upsert_records_bulk(dataset, bulk_upsert) + + # Verify we still have only 1 record (no duplicates created) + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 1 + + # Verify the record was updated with the last value + record = (await db.execute(select(Record))).scalar_one() + assert record.external_id == "duplicate-id" + assert record.fields["text-field"] == "updated value 2" + + async def test_upsert_records_bulk_with_reference_metadata_external_id( + self, db: AsyncSession, mock_search_engine: SearchEngine + ): + """Tests that external_id from metadata (like reference field) is properly used for deduplication.""" + dataset = await DatasetFactory.create(status=DatasetStatus.ready) + await TextFieldFactory.create(name="text-field", dataset=dataset) + + # Create initial record with external_id from reference metadata + await RecordFactory.create( + fields={"text-field": "original value"}, + external_id="ref_123456", + metadata_={"reference": "123456", "doi": "10.1000/sample"}, + dataset=dataset, + ) + + # Verify we have 1 record initially + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 1 + + # Create UpsertRecordsBulk instance + upsert_bulk = UpsertRecordsBulk(db, mock_search_engine) + + # Attempt to upsert record with same external_id (simulating ImportHistory workflow) + bulk_upsert = RecordsBulkUpsert( + items=[ + RecordUpsert( + external_id="ref_123456", # Same external_id generated from reference + fields={"text-field": "updated from import"}, + metadata={"reference": "123456", "doi": "10.1000/sample", "pmid": "987654"}, + ) + ] + ) + + await upsert_bulk.upsert_records_bulk(dataset, bulk_upsert) + + # Verify we still have only 1 record (deduplication worked) + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 1 + + # Verify the record was updated with new field and metadata + record = (await db.execute(select(Record))).scalar_one() + assert record.external_id == "ref_123456" + assert record.fields["text-field"] == "updated from import" + assert record.metadata_["reference"] == "123456" + assert record.metadata_["pmid"] == "987654" # New metadata added + + async def test_upsert_records_bulk_updates_existing_records_with_matching_external_id( + self, db: AsyncSession, mock_search_engine: SearchEngine + ): + """Tests that existing records with matching external_id have their fields updated from the upsert.""" + dataset = await DatasetFactory.create(status=DatasetStatus.ready) + await TextFieldFactory.create(name="title", dataset=dataset) + await TextFieldFactory.create(name="content", dataset=dataset) + + # Create initial record + original_record = await RecordFactory.create( + fields={"title": "Original Title", "content": "Original Content"}, + external_id="update-test-123", + metadata_={"source": "initial"}, + dataset=dataset, + ) + + # Store the original record ID and timestamps + original_id = original_record.id + original_inserted_at = original_record.inserted_at + + # Verify we have 1 record initially + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 1 + + # Create UpsertRecordsBulk instance + upsert_bulk = UpsertRecordsBulk(db, mock_search_engine) + + # Upsert record with same external_id but different field values + bulk_upsert = RecordsBulkUpsert( + items=[ + RecordUpsert( + external_id="update-test-123", + fields={"title": "Updated Title", "content": "Updated Content"}, + metadata={"source": "updated", "version": "2.0"}, + ) + ] + ) + + await upsert_bulk.upsert_records_bulk(dataset, bulk_upsert) + + # Verify we still have only 1 record (existing record was updated) + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 1 + + # Verify the same record was updated, not replaced + updated_record = (await db.execute(select(Record))).scalar_one() + assert updated_record.id == original_id # Same record ID + assert updated_record.external_id == "update-test-123" + assert updated_record.inserted_at == original_inserted_at # Insert time preserved + assert updated_record.updated_at > original_inserted_at # Update time changed + + # Verify field values were updated + assert updated_record.fields["title"] == "Updated Title" + assert updated_record.fields["content"] == "Updated Content" + + # Verify metadata was updated + assert updated_record.metadata_["source"] == "updated" + assert updated_record.metadata_["version"] == "2.0" + + async def test_upsert_records_bulk_preserves_different_external_ids( + self, db: AsyncSession, mock_search_engine: SearchEngine + ): + """Tests that records with different external_ids are both preserved.""" + dataset = await DatasetFactory.create(status=DatasetStatus.ready) + await TextFieldFactory.create(name="document", dataset=dataset) + + # Create initial records with different external_ids + record_1 = await RecordFactory.create(fields={"document": "Document 1"}, external_id="doc_001", dataset=dataset) + + record_2 = await RecordFactory.create(fields={"document": "Document 2"}, external_id="doc_002", dataset=dataset) + + # Verify we have 2 records initially + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 2 + + # Create UpsertRecordsBulk instance + upsert_bulk = UpsertRecordsBulk(db, mock_search_engine) + + # Upsert with different external_ids (no conflicts) + bulk_upsert = RecordsBulkUpsert( + items=[ + RecordUpsert( + external_id="doc_003", # New external_id + fields={"document": "Document 3"}, + ), + RecordUpsert( + external_id="doc_001", # Existing external_id (should update) + fields={"document": "Document 1 Updated"}, + ), + ] + ) + + await upsert_bulk.upsert_records_bulk(dataset, bulk_upsert) + + # Verify we have 3 records total (1 new, 1 updated, 1 preserved) + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 3 + + # Get all records ordered by external_id + records = (await db.execute(select(Record).order_by(Record.external_id))).scalars().all() + + # Verify doc_001 was updated + assert records[0].external_id == "doc_001" + assert records[0].fields["document"] == "Document 1 Updated" + assert records[0].id == record_1.id # Same record ID + + # Verify doc_002 was preserved unchanged + assert records[1].external_id == "doc_002" + assert records[1].fields["document"] == "Document 2" + assert records[1].id == record_2.id # Same record ID + + # Verify doc_003 was created as new record + assert records[2].external_id == "doc_003" + assert records[2].fields["document"] == "Document 3" + assert records[2].id != record_1.id and records[2].id != record_2.id # New record ID + + async def test_upsert_records_bulk_mixed_new_and_duplicate_external_ids( + self, db: AsyncSession, mock_search_engine: SearchEngine + ): + """Tests handling of batch with both new external_ids and duplicate external_ids.""" + dataset = await DatasetFactory.create(status=DatasetStatus.ready) + await TextFieldFactory.create(name="title", dataset=dataset) + + # Create initial records + existing_record_1 = await RecordFactory.create( + fields={"title": "Existing Paper 1"}, + external_id="paper_001", + metadata_={"reference": "001"}, + dataset=dataset, + ) + + existing_record_2 = await RecordFactory.create( + fields={"title": "Existing Paper 2"}, + external_id="paper_002", + metadata_={"reference": "002"}, + dataset=dataset, + ) + + # Verify we have 2 records initially + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 2 + + # Create UpsertRecordsBulk instance + upsert_bulk = UpsertRecordsBulk(db, mock_search_engine) + + # Upsert batch with mix of new, duplicate, and updating external_ids + bulk_upsert = RecordsBulkUpsert( + items=[ + RecordUpsert( + external_id="paper_003", # New external_id + fields={"title": "New Paper 3"}, + metadata={"reference": "003"}, + ), + RecordUpsert( + external_id="paper_001", # Existing external_id (update) + fields={"title": "Updated Paper 1"}, + metadata={"reference": "001", "updated": True}, + ), + RecordUpsert( + external_id="paper_004", # Another new external_id + fields={"title": "New Paper 4"}, + metadata={"reference": "004"}, + ), + RecordUpsert( + external_id="paper_001", # Duplicate in same batch (second update) + fields={"title": "Paper 1 Final Version"}, + metadata={"reference": "001", "final": True}, + ), + ] + ) + + await upsert_bulk.upsert_records_bulk(dataset, bulk_upsert) + + # Verify final record count: 2 existing + 2 new = 4 total + # (paper_001 updated twice, paper_002 unchanged, paper_003 and paper_004 new) + assert (await db.execute(select(func.count(Record.id)))).scalar_one() == 4 + + # Get all records ordered by external_id + records = (await db.execute(select(Record).order_by(Record.external_id))).scalars().all() + + # Verify paper_001 was updated with the last values from batch + assert records[0].external_id == "paper_001" + assert records[0].fields["title"] == "Paper 1 Final Version" + assert records[0].metadata_["final"] is True + assert records[0].id == existing_record_1.id # Same record ID + + # Verify paper_002 was unchanged (not in upsert batch) + assert records[1].external_id == "paper_002" + assert records[1].fields["title"] == "Existing Paper 2" + assert records[1].id == existing_record_2.id # Same record ID + + # Verify paper_003 was created as new record + assert records[2].external_id == "paper_003" + assert records[2].fields["title"] == "New Paper 3" + assert records[2].metadata_["reference"] == "003" + + # Verify paper_004 was created as new record + assert records[3].external_id == "paper_004" + assert records[3].fields["title"] == "New Paper 4" + assert records[3].metadata_["reference"] == "004" From 41eaf77739e69913c7cfb690ac14f74c3507bd62 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Thu, 28 Aug 2025 14:19:59 -0700 Subject: [PATCH 08/24] refactoring --- .../configuration/DatasetConfiguration.vue | 2 +- .../import/ImportHistoryDatasetBuilder.ts | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfiguration.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfiguration.vue index e7d2a9dbe..7f5f4a37b 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfiguration.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfiguration.vue @@ -106,7 +106,7 @@ export default { dataSource: { type: String, default: "hub", - validator: (value) => ["hub", "import"].includes(value), + validator: (value: string) => ["hub", "import"].includes(value), }, importData: { type: [ImportHistoryDetails, Object], diff --git a/extralit-frontend/v1/domain/entities/import/ImportHistoryDatasetBuilder.ts b/extralit-frontend/v1/domain/entities/import/ImportHistoryDatasetBuilder.ts index 96f9b498d..f9b573e12 100644 --- a/extralit-frontend/v1/domain/entities/import/ImportHistoryDatasetBuilder.ts +++ b/extralit-frontend/v1/domain/entities/import/ImportHistoryDatasetBuilder.ts @@ -15,12 +15,12 @@ export interface ImportHistoryFeature { name: string; } +export const METADATA_FIELDS = ["reference", "doi", "pmid"] as const; + export class ImportHistoryDatasetBuilder { private readonly importHistoryData: ImportHistoryDetailsResponse; private readonly datasetName: string; - // Fields that should be treated as metadata rather than dataset fields - private static readonly METADATA_FIELDS = ["reference", "doi", "imdb"] as const; constructor(importHistoryData: ImportHistoryDetailsResponse) { this.importHistoryData = importHistoryData; @@ -60,7 +60,7 @@ export class ImportHistoryDatasetBuilder { }; // Ensure metadata fields are properly mapped - ImportHistoryDatasetBuilder.METADATA_FIELDS.forEach((metadataField) => { + METADATA_FIELDS.forEach((metadataField) => { if (this.availableFields.includes(metadataField)) { const hasMapping = mappings.metadata.some((m) => m.target === metadataField); if (!hasMapping) { @@ -102,7 +102,7 @@ export class ImportHistoryDatasetBuilder { const features = this.extractFeaturesFromSchema(); // Ensure metadata fields are included in features if they exist in the data - ImportHistoryDatasetBuilder.METADATA_FIELDS.forEach((metadataField) => { + METADATA_FIELDS.forEach((metadataField) => { if (this.availableFields.includes(metadataField) && !features[metadataField]) { features[metadataField] = { dtype: "string", @@ -144,7 +144,7 @@ export class ImportHistoryDatasetBuilder { // Only create metadata for specific fields that should be treated as metadata this.importHistoryData.data.schema.fields.forEach((field) => { - if (ImportHistoryDatasetBuilder.METADATA_FIELDS.includes(field.name as any)) { + if (METADATA_FIELDS.includes(field.name as any)) { const metadataType = this.inferMetadataType(field.name); if (metadataType) { const metadata = MetadataCreation.from(field.name, metadataType); @@ -161,7 +161,7 @@ export class ImportHistoryDatasetBuilder { if (!hasReferenceMetadata) { const referenceSource = this.availableFields.includes("reference") ? "reference" : "id"; // Only add if the reference source is one of our metadata fields - if (ImportHistoryDatasetBuilder.METADATA_FIELDS.includes(referenceSource as any)) { + if (METADATA_FIELDS.includes(referenceSource as any)) { const referenceMetadata = MetadataCreation.from(referenceSource, "terms"); if (referenceMetadata) { (subset as any).metadata.push(referenceMetadata); @@ -177,10 +177,10 @@ export class ImportHistoryDatasetBuilder { private hasReferenceField(): boolean { return ( this.importHistoryData.data.schema.fields.some((field) => - ImportHistoryDatasetBuilder.METADATA_FIELDS.includes(field.name as any) + METADATA_FIELDS.includes(field.name as any) ) || this.importHistoryData.data.data.some((record) => - ImportHistoryDatasetBuilder.METADATA_FIELDS.some((field) => field in record) + METADATA_FIELDS.some((field) => field in record) ) ); } @@ -252,7 +252,7 @@ export class ImportHistoryDatasetBuilder { const metadata: Record = { ...record.metadata }; // Only include specific fields as metadata - ImportHistoryDatasetBuilder.METADATA_FIELDS.forEach((metadataField) => { + METADATA_FIELDS.forEach((metadataField) => { if (record[metadataField] !== undefined) { metadata[metadataField] = record[metadataField]; } @@ -293,7 +293,7 @@ export class ImportHistoryDatasetBuilder { if (!field) return "no mapping"; // Skip fields that should be treated as metadata - if (ImportHistoryDatasetBuilder.METADATA_FIELDS.includes(fieldName as any)) { + if (METADATA_FIELDS.includes(fieldName as any)) { return "no mapping"; } @@ -317,7 +317,7 @@ export class ImportHistoryDatasetBuilder { */ inferMetadataType(fieldName: string): MetadataTypes | "terms" | null { // Only return metadata types for fields that should be treated as metadata - if (!ImportHistoryDatasetBuilder.METADATA_FIELDS.includes(fieldName as any)) { + if (!METADATA_FIELDS.includes(fieldName as any)) { return null; } @@ -432,7 +432,7 @@ export class ImportHistoryDatasetBuilder { } // Skip fields that should be treated as metadata - if (ImportHistoryDatasetBuilder.METADATA_FIELDS.includes(fieldName as any)) { + if (METADATA_FIELDS.includes(fieldName as any)) { return; } From e70e9b499ec4c24b77ab5b12a6a36f84b6096a00 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Thu, 28 Aug 2025 14:52:25 -0700 Subject: [PATCH 09/24] Enhance error handling in AxiosErrorHandler and DocumentRepository. - Prioritize specific error messages in AxiosErrorHandler based on business logic, detailed messages, and generic HTTP status messages. - Update DocumentRepository to include a new error constant for listing documents and adjust error handling accordingly. - Modify error detail in documents.py to provide more specific feedback when no documents are found. --- .../repositories/AxiosErrorHandler.ts | 31 ++++++++++++------- .../repositories/DocumentRepository.ts | 5 +-- .../api/handlers/v1/documents.py | 2 +- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/extralit-frontend/v1/infrastructure/repositories/AxiosErrorHandler.ts b/extralit-frontend/v1/infrastructure/repositories/AxiosErrorHandler.ts index 1805d6329..d36574e20 100644 --- a/extralit-frontend/v1/infrastructure/repositories/AxiosErrorHandler.ts +++ b/extralit-frontend/v1/infrastructure/repositories/AxiosErrorHandler.ts @@ -12,31 +12,40 @@ export const loadErrorHandler = (context: Context) => { notification.clear(); - const errorHandledKey = `validations.http.${status}.message`; - const handledTranslatedError = t(errorHandledKey); - - if (handledTranslatedError !== errorHandledKey) { - notification.notify({ - message: handledTranslatedError, - type: "danger", - }); - } - + // Prioritize specific error messages over generic HTTP status messages + // 1. Business logic errors (highest priority) if (data.code) { const errorHandledKey = `validations.businessLogic.${data.code}.message`; const handledTranslatedError = t(errorHandledKey); if (handledTranslatedError !== errorHandledKey) { + console.log("handledTranslatedError", errorHandledKey); notification.notify({ message: handledTranslatedError, type: "danger", }); + throw error; } - } else if (data.detail && typeof data.detail === "string") { + } + + // 2. Detailed error messages (medium priority) + if (data.detail && typeof data.detail === "string") { notification.notify({ message: data.detail.toString(), type: "danger", }); + throw error; + } + + // 3. Generic HTTP status messages (fallback) + const errorHandledKey = `validations.http.${status}.message`; + const handledTranslatedError = t(errorHandledKey); + + if (handledTranslatedError !== errorHandledKey) { + notification.notify({ + message: handledTranslatedError, + type: "danger", + }); } throw error; diff --git a/extralit-frontend/v1/infrastructure/repositories/DocumentRepository.ts b/extralit-frontend/v1/infrastructure/repositories/DocumentRepository.ts index 3b2e50911..49698e25c 100644 --- a/extralit-frontend/v1/infrastructure/repositories/DocumentRepository.ts +++ b/extralit-frontend/v1/infrastructure/repositories/DocumentRepository.ts @@ -3,11 +3,12 @@ import { Document, Segment, Segments } from "@/v1/domain/entities/document/Docum const DOCUMENT_API_ERRORS = { ERROR_FETCHING_DOCUMENT: "ERROR_FETCHING_DOCUMENT", + ERROR_LISTING_DOCUMENTS: "ERROR_LISTING_DOCUMENTS", ERROR_FETCHING_SEGMENTS: "ERROR_FETCHING_SEGMENTS", }; export class DocumentRepository { - constructor(private readonly axios: NuxtAxiosInstance) {} + constructor(private readonly axios: NuxtAxiosInstance) { } async getDocuments(params: { workspace_id: string; @@ -54,7 +55,7 @@ export class DocumentRepository { return data; } catch (error) { throw { - response: DOCUMENT_API_ERRORS.ERROR_FETCHING_DOCUMENT, + response: DOCUMENT_API_ERRORS.ERROR_LISTING_DOCUMENTS, }; } } diff --git a/extralit-server/src/extralit_server/api/handlers/v1/documents.py b/extralit-server/src/extralit_server/api/handlers/v1/documents.py index 8ff6d2a0f..16f14c832 100644 --- a/extralit-server/src/extralit_server/api/handlers/v1/documents.py +++ b/extralit-server/src/extralit_server/api/handlers/v1/documents.py @@ -142,7 +142,7 @@ async def get_document( if not documents: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"No documents found with given criteria in workspace {workspace_id}", + detail=f"No documents found with reference {reference}", ) # TODO disable due to CORS restrictions from frontend From 14fdc0a82ea1f03d9f7426f7a535b24e1411dca3 Mon Sep 17 00:00:00 2001 From: JonnyTran Date: Thu, 28 Aug 2025 14:54:31 -0700 Subject: [PATCH 10/24] Refactor dataset configuration components to support TypeScript. - Updated DatasetConfigurationForm, DatasetConfigurationMetadataSelector, and DatasetCreateDialog to use TypeScript for improved type safety. - Enhanced validator functions in DatasetConfigurationForm and DatasetCreateDialog to specify parameter types. --- .../configuration/DatasetConfigurationForm.vue | 2 +- .../configuration/DatasetConfigurationMetadataSelector.vue | 2 +- .../dataset-creation/configuration/DatasetCreateDialog.vue | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue index 9932e1407..311d3a136 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationForm.vue @@ -105,7 +105,7 @@ export default { dataSource: { type: String, default: "hub", - validator: (value) => ["hub", "import"].includes(value), + validator: (value: string) => ["hub", "import"].includes(value), }, }, data() { diff --git a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationMetadataSelector.vue b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationMetadataSelector.vue index 08e46c00d..26f58b4e6 100644 --- a/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationMetadataSelector.vue +++ b/extralit-frontend/components/features/dataset-creation/configuration/DatasetConfigurationMetadataSelector.vue @@ -17,7 +17,7 @@ -