From 275ac4d0a3818d6ec7b380cedfd158b27cdb3457 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Thu, 30 Oct 2025 10:00:13 -0500
Subject: [PATCH 01/28] Add shorter WebSocket timeout for local dev

---
 geoapi/routes/websockets.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/geoapi/routes/websockets.py b/geoapi/routes/websockets.py
index 7eb2238f..8881c946 100644
--- a/geoapi/routes/websockets.py
+++ b/geoapi/routes/websockets.py
@@ -1,5 +1,7 @@
 from litestar import WebSocket, websocket
 from litestar.channels import ChannelsPlugin
+
+from geoapi.settings import settings
 from geoapi.utils.decorators import not_anonymous_guard
 from websockets.exceptions import ConnectionClosedOK, ConnectionClosedError
 import logging
@@ -12,6 +14,10 @@ async def websocket_handler(socket: WebSocket, channels: ChannelsPlugin) -> None
     await socket.accept()
     user = socket.scope.get("user")
 
+    if settings.APP_ENV == "local":
+        # Short timeout so that hot reload is faster when code changes
+        socket._send_timeout = 4
+
     try:
         async with channels.start_subscription(
             [f"notifications-{user.id}"]

From e0153ef1258dca4d217732b570653f40ff267b26 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Thu, 30 Oct 2025 16:50:38 -0500
Subject: [PATCH 02/28] Add original_system to feature asset
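
The new original_system column records which Tapis system a source file came
from, complementing the existing original_path. A minimal sketch of the updated
call shape (values are illustrative; they mirror the image_feature_fixture
added in this patch):

    feature = FeaturesService.fromImage(
        db_session,
        project_id,
        image_file_obj,
        metadata={},
        original_system="project-7890",       # hypothetical Tapis system id
        original_path="/images/IMG_001.jpg",  # hypothetical source path
    )
    assert feature.assets[0].original_system == "project-7890"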
---
 geoapi/models/feature.py                    |  3 +-
 geoapi/routes/projects.py                   |  3 +-
 geoapi/services/features.py                 | 79 ++++++++++++++-----
 geoapi/tasks/external_data.py               |  4 +-
 .../tests/api_tests/test_feature_service.py | 30 ++++++-
 .../tests/api_tests/test_projects_routes.py | 10 ++-
 geoapi/tests/conftest.py                    | 11 ++-
 7 files changed, 113 insertions(+), 27 deletions(-)

diff --git a/geoapi/models/feature.py b/geoapi/models/feature.py
index c60489c7..82a0621c 100644
--- a/geoapi/models/feature.py
+++ b/geoapi/models/feature.py
@@ -64,9 +64,10 @@ class FeatureAsset(Base):
     uuid = mapped_column(UUID(as_uuid=True), default=uuid.uuid4, nullable=False)
     # system or project id or both
     path = mapped_column(String(), nullable=False)
+    display_path = mapped_column(String(), nullable=True)
     original_name = mapped_column(String(), nullable=True)
     original_path = mapped_column(String(), nullable=True, index=True)
-    display_path = mapped_column(String(), nullable=True)
+    original_system = mapped_column(String(), nullable=True, index=True)
     asset_type = mapped_column(String(), nullable=False, default="image")

     feature = relationship("Feature", overlaps="assets")

diff --git a/geoapi/routes/projects.py b/geoapi/routes/projects.py
index 6b7ff5ff..bb79f45f 100644
--- a/geoapi/routes/projects.py
+++ b/geoapi/routes/projects.py
@@ -47,6 +47,7 @@ class AssetModel(BaseModel):
     path: str | None = None
     uuid: UUID | None = None
     asset_type: str | None = None
+    original_system: str | None = None
     original_path: str | None = None
     original_name: str | None = None
     display_path: str | None = None
@@ -672,7 +673,7 @@ async def upload_feature_file(
     file_obj = io.BytesIO(file_bytes)
     file_obj.filename = file.filename
     logger.info(
-        "Add feature to project:{} for user:{}: {}".format(
+        "UPLOAD Add feature to project:{} for user:{}: {}".format(
             project_id, request.user.username, file.filename
         )
     )

diff --git a/geoapi/services/features.py b/geoapi/services/features.py
index b6e3a891..bf4e04e7 100644
--- a/geoapi/services/features.py
+++ b/geoapi/services/features.py
@@ -103,18 +103,26 @@ def setStyles(database_session, featureId: int, styles: Dict) -> Feature:

     @staticmethod
     def addGeoJSON(
-        database_session, projectId: int, feature: Dict, original_path=None
+        database_session,
+        projectId: int,
+        feature: Dict,
+        original_system=None,
+        original_path=None,
     ) -> List[Feature]:
         """
         Add a GeoJSON feature to a project
         :param projectId: int
         :param feature: dict
+        :param original_system: str system of original file location [IGNORED]
+        :param original_path: str path of original file location [IGNORED]
         :return: Feature
         """
         try:
             data = geojson.loads(json.dumps(feature))
         except ValueError:
             raise InvalidGeoJSON
+
+        # TODO original_path, original_system are ignored but should not be ignored after WG-600
         features = []
         if data["type"] == "Feature":
             feat = Feature.fromGeoJSON(data)
@@ -169,7 +177,8 @@ def fromGPX(
         projectId: int,
         fileObj: IO,
         metadata: Dict,
-        original_path=None,
+        original_system,  # ignored
+        original_path,  # ignored
     ) -> Feature:

         # TODO: Fiona should support reading from the file directly, this MemoryFile business
@@ -182,6 +191,8 @@ def fromGPX(
         feat.project_id = projectId
         feat.the_geom = from_shape(geometries.convert_3D_2D(shp), srid=4326)
         feat.properties = metadata or {}
+        # TODO original_path, original_system are ignored but should not be ignored after WG-600
+
         database_session.add(feat)
         database_session.commit()
         return feat
@@ -192,6 +203,7 @@ def fromGeoJSON(
         projectId: int,
         fileObj: IO,
         metadata: Dict,
+        original_system: str = None,
        original_path: str = None,
     ) -> List[Feature]:
         """
@@ -204,7 +216,9 @@ def fromGeoJSON(
         """
         data = json.loads(fileObj.read())
         fileObj.close()
-        return FeaturesService.addGeoJSON(database_session, projectId, data)
+        return FeaturesService.addGeoJSON(
+            database_session, projectId, data, original_system, original_path
+        )

     @staticmethod
     def fromShapefile(
@@ -213,6 +227,7 @@
         fileObj: IO,
         metadata: Dict,
         additional_files: List[IO],
+        original_system=None,
         original_path=None,
     ) -> Feature:
         """Create features from shapefile
@@ -232,6 +247,7 @@ def fromShapefile(
         feat.project_id = projectId
         feat.the_geom = from_shape(geometries.convert_3D_2D(geom), srid=4326)
         feat.properties = properties
+        # TODO original_path, original_system are ignored but should not be ignored after WG-600
         database_session.add(feat)
         features.append(feat)
@@ -244,7 +260,8 @@ def from_rapp_questionnaire(
         projectId: int,
         fileObj: IO,
         additional_files: List[IO],
-        original_path: str = None,
+        original_system,
+        original_path,
     ) -> Feature:
         """
         Import RAPP questionnaire
@@ -256,10 +273,11 @@ def from_rapp_questionnaire(
         :param projectId: int
         :param fileObj: questionnaire rq file
         :param additional_files: list of file objs
+        :param original_system: str system of original file location
         :param original_path: str path of original file location
         :return: Feature
         """
-        logger.info(f"Processing f{original_path}")
+        logger.info(f"Processing {original_system}/{original_path}")

         data = json.loads(fileObj.read())
         location = parse_rapid_geolocation(data.get("geolocation"))
@@ -283,7 +301,7 @@ def from_rapp_questionnaire(
         # write all asset files (i.e jpgs)
         if additional_files is not None:
             logger.info(
-                f"Processing {len(additional_files)} assets for {original_path}"
+                f"Processing {len(additional_files)} assets for {original_system}/{original_path}"
             )
             for asset_file_obj in additional_files:
                 base_filename = os.path.basename(asset_file_obj.filename)
@@ -328,6 +346,7 @@ def from_rapp_questionnaire(
         fa = FeatureAsset(
             uuid=asset_uuid,
             asset_type="questionnaire",
+            original_system=original_system,
             original_path=original_path,
             display_path=original_path,
             path=get_asset_relative_path(questionnaire_path),
@@ -345,7 +364,6 @@ def fromINI(
         projectId: int,
         fileObj: IO,
         metadata: Dict,
-        original_path: str = None,
     ) -> TileServer:
         """
@@ -410,6 +428,7 @@ def fromFileObj(
         projectId: int,
         fileObj: IO,
         metadata: Dict,
+        original_system: str = None,
         original_path: str = None,
         additional_files=None,
         location: GeoLocation = None,
@@ -422,6 +441,7 @@ def fromFileObj(
                 projectId,
                 fileObj,
                 metadata,
+                original_system,
                 original_path,
                 location,
             )
@@ -429,12 +449,17 @@ def fromFileObj(
         elif ext in features_util.GPX_FILE_EXTENSIONS:
             return [
                 FeaturesService.fromGPX(
-                    database_session, projectId, fileObj, metadata, original_path
+                    database_session,
+                    projectId,
+                    fileObj,
+                    metadata,
+                    original_system,
+                    original_path,
                 )
             ]
         elif ext in features_util.GEOJSON_FILE_EXTENSIONS:
             return FeaturesService.fromGeoJSON(
-                database_session, projectId, fileObj, {}, original_path
+                database_session, projectId, fileObj, {}, original_system, original_path
             )
         elif ext in features_util.SHAPEFILE_FILE_EXTENSIONS:
             return FeaturesService.fromShapefile(
@@ -443,15 +468,19 @@ def fromFileObj(
                 fileObj,
                 {},
                 additional_files,
+                original_system,
                 original_path,
             )
         elif ext in features_util.INI_FILE_EXTENSIONS:
-            return FeaturesService.fromINI(
-                database_session, projectId, fileObj, {}, original_path
-            )
+            return FeaturesService.fromINI(database_session, projectId, fileObj, {})
         elif ext in features_util.RAPP_QUESTIONNAIRE_FILE_EXTENSIONS:
             return FeaturesService.from_rapp_questionnaire(
-                database_session, projectId, fileObj, additional_files, original_path
+                database_session,
+                projectId,
+                fileObj,
+                additional_files,
+                original_system,
+                original_path,
             )
         else:
             raise ApiException(
@@ -464,6 +493,7 @@ def fromImage(
         projectId: int,
         fileObj: IO,
         metadata: Dict,
+        original_system: str = None,
         original_path: str = None,
         location: GeoLocation = None,
     ) -> Feature:
@@ -479,7 +509,7 @@ def fromImage(
         """
         try:
             logger.debug(
-                f"processing image {original_path} known_geolocation:{location} using_exif_geolocation:{location is None}"
+                f"processing image {original_system}/{original_path} known_geolocation:{location} using_exif_geolocation:{location is None}"
             )
             imdata = ImageService.processImage(
                 fileObj, exif_geolocation=location is None
@@ -500,6 +530,7 @@ def fromImage(
         fa = FeatureAsset(
             uuid=asset_uuid,
             asset_type="image",
+            original_system=original_system,
             original_path=original_path,
             display_path=original_path,
             path=get_asset_relative_path(asset_path),
@@ -529,6 +560,7 @@ def createFeatureAsset(
         projectId: int,
         featureId: int,
         fileObj: IO,
+        original_system: str = None,
         original_path: str = None,
     ) -> Feature:
         """
@@ -542,11 +574,17 @@ def createFeatureAsset(
         ext = fpath.suffix.lstrip(".").lower()
         if ext in features_util.IMAGE_FILE_EXTENSIONS:
             fa = FeaturesService.createImageFeatureAsset(
-                projectId, fileObj, original_path=original_path
+                projectId,
+                fileObj,
+                original_system=original_system,
+                original_path=original_path,
             )
         elif ext in features_util.VIDEO_FILE_EXTENSIONS:
             fa = FeaturesService.createVideoFeatureAsset(
-                projectId, fileObj, original_path=original_path
+                projectId,
+                fileObj,
+                original_system=original_system,
+                original_path=original_path,
             )
         else:
             raise ApiException("Invalid format for feature assets")
@@ -596,7 +634,10 @@ def featureAssetFromImData(projectId: int, imdata: ImageData) -> FeatureAsset:

     @staticmethod
     def createImageFeatureAsset(
-        projectId: int, fileObj: IO, original_path: str = None
+        projectId: int,
+        fileObj: IO,
+        original_system: str = None,
+        original_path: str = None,
     ) -> FeatureAsset:
         asset_uuid = uuid.uuid4()
         imdata = ImageService.resizeImage(fileObj)
@@ -607,6 +648,7 @@ def createImageFeatureAsset(
         fa = FeatureAsset(
             uuid=asset_uuid,
             asset_type="image",
+            original_system=original_system,
             original_path=original_path,
             display_path=original_path,
             path=get_asset_relative_path(asset_path),
@@ -615,7 +657,7 @@ def createImageFeatureAsset(

     @staticmethod
     def createVideoFeatureAsset(
-        projectId: int, fileObj: IO, original_path: str = None
+        projectId: int, fileObj: IO, original_system: str, original_path: str = None
     ) -> FeatureAsset:
         """
@@ -638,6 +680,7 @@ def createVideoFeatureAsset(
         os.remove(encoded_path)
         fa = FeatureAsset(
             uuid=asset_uuid,
+            original_system=original_system,
             original_path=original_path,
             display_path=original_path,
             path=get_asset_relative_path(save_path),

diff --git a/geoapi/tasks/external_data.py b/geoapi/tasks/external_data.py
index 745a9092..e8e4bce3 100644
--- a/geoapi/tasks/external_data.py
+++ b/geoapi/tasks/external_data.py
@@ -193,6 +193,7 @@ def import_file_from_tapis(userId: int, systemId: str, path: str, projectId: int
             projectId,
             temp_file,
             {},
+            original_system=systemId,
             original_path=path,
             additional_files=additional_files,
             location=optional_location_from_metadata,
@@ -531,7 +532,8 @@ def import_files_recursively_from_path(
                     projectId,
                     tmp_file,
                     {},
-                    original_path=item_system_path,
+                    original_system=systemId,
+                    original_path=path,
                     additional_files=additional_files,
                     location=optional_location_from_metadata,
                 )

diff --git a/geoapi/tests/api_tests/test_feature_service.py b/geoapi/tests/api_tests/test_feature_service.py
index 52c749f5..70536936 100644
--- a/geoapi/tests/api_tests/test_feature_service.py
+++ b/geoapi/tests/api_tests/test_feature_service.py
@@ -37,6 +37,7 @@ def test_insert_feature_geojson(
     assert feature.project_id == projects_fixture.id
     assert db_session.query(Feature).count() == 1
     assert db_session.query(FeatureAsset).count() == 0
+    # TODO Test original_path, original_system are configured after WG-600


 def test_insert_feature_collection(projects_fixture, geojson_file_fixture, db_session):
@@ -57,7 +58,12 @@ def test_remove_feature(projects_fixture, feature_fixture, db_session):

 def test_create_feature_image(projects_fixture, image_file_fixture, db_session):
     feature = FeaturesService.fromImage(
-        db_session, projects_fixture.id, image_file_fixture, metadata={}
+        db_session,
+        projects_fixture.id,
+        image_file_fixture,
+        metadata={},
+        original_system="system",
+        original_path="path",
     )
     assert feature.project_id == projects_fixture.id
     assert len(feature.assets) == 1
@@ -71,6 +77,8 @@ def test_create_feature_image(projects_fixture, image_file_fixture, db_session):
             str(feature.assets[0].uuid) + ".thumb.jpeg",
         )
     )
+    assert feature.assets[0].original_system == "system"
+    assert feature.assets[0].original_path == "path"


 def test_create_feature_image_small_image(
@@ -112,6 +120,8 @@ def test_create_feature_image_asset(
         projects_fixture.id,
         feature_fixture.id,
         FileStorage(image_file_fixture),
+        original_system="system",
+        original_path="path",
     )
     assert feature.id == feature_fixture.id
     assert len(feature.assets) == 1
@@ -124,6 +134,8 @@ def test_create_feature_image_asset(
             str(feature.assets[0].uuid) + ".thumb.jpeg",
         )
     )
+    assert feature.assets[0].original_system == "system"
+    assert feature.assets[0].original_path == "path"


 def test_remove_feature_image_asset(
@@ -149,6 +161,8 @@ def test_create_feature_video_asset(
         projects_fixture.id,
         feature_fixture.id,
         FileStorage(video_file_fixture),
+        original_system="system",
+        original_path="path",
     )
     assert feature.id == feature_fixture.id
     assert len(feature.assets) == 1
@@ -161,6 +175,8 @@ def test_create_feature_video_asset(
             str(feature.assets[0].uuid) + ".mp4",
         )
     )
+    assert feature.assets[0].original_system == "system"
+    assert feature.assets[0].original_path == "path"


 def test_remove_feature_video_asset(
@@ -192,6 +208,7 @@ def test_create_feature_shpfile(
     assert len(features) == 10
     assert db_session.query(Feature).count() == 10
     assert features[0].project_id == projects_fixture.id
+    # TODO Test original_path, original_system are configured after WG-600


 def test_create_questionnaire_feature(
@@ -201,7 +218,9 @@ def test_create_questionnaire_feature(
         db_session,
         projects_fixture.id,
         questionnaire_file_without_assets_fixture,
-        additional_files=None,
+        additional_files=[],
+        original_system="system",
+        original_path="questionnaire.rq",
     )
     assert feature.project_id == projects_fixture.id
     assert len(feature.assets) == 1
@@ -210,6 +229,9 @@ def test_create_questionnaire_feature(
     assert len(os.listdir(get_project_asset_dir(feature.project_id))) == 1
     assert len(os.listdir(get_asset_path(feature.assets[0].path))) == 1
     assert os.path.isfile(get_asset_path(feature.assets[0].path, "questionnaire.rq"))
+    assert feature.assets[0].original_path == "questionnaire.rq"
+    assert feature.assets[0].original_system == "system"
+    assert len(os.listdir(get_asset_path(feature.assets[0].path))) == 1


 def test_create_questionnaire_feature_with_assets(
@@ -224,6 +246,8 @@ def test_create_questionnaire_feature_with_assets(
         projects_fixture.id,
         questionnaire_file_with_assets_fixture,
         additional_files=assets,
+        original_system="system",
+        original_path="questionnaire.rq",
     )
     assert feature.project_id == projects_fixture.id
     assert len(feature.assets) == 1
@@ -231,6 +255,8 @@ def test_create_questionnaire_feature_with_assets(
     assert db_session.query(FeatureAsset).count() == 1
     assert len(os.listdir(get_project_asset_dir(feature.project_id))) == 1
     assert len(os.listdir(get_asset_path(feature.assets[0].path))) == 3
+    assert feature.assets[0].original_path == "questionnaire.rq"
+    assert feature.assets[0].original_system == "system"
     assert os.path.isfile(get_asset_path(feature.assets[0].path, "questionnaire.rq"))
     assert os.path.isfile(get_asset_path(feature.assets[0].path, "image.preview.jpg"))
     assert os.path.isfile(get_asset_path(feature.assets[0].path, "image.jpg"))

diff --git a/geoapi/tests/api_tests/test_projects_routes.py b/geoapi/tests/api_tests/test_projects_routes.py
index b3b5bae6..5e04efe4 100644
--- a/geoapi/tests/api_tests/test_projects_routes.py
+++ b/geoapi/tests/api_tests/test_projects_routes.py
@@ -388,7 +388,7 @@ def test_get_project_features_analytics_with_headers(


 def test_get_project_features_single_feature(
-    test_client, projects_fixture, feature_fixture, user1
+    test_client, projects_fixture, image_feature_fixture, user1
 ):
     resp = test_client.get(
         f"/projects/{projects_fixture.id}/features/",
@@ -396,7 +396,13 @@ def test_get_project_features_single_feature(
     )
     data = resp.json()
     assert resp.status_code == 200
-    assert len(data["features"]) != 0
+    assert len(data["features"]) == 1
+
+    asset = data["features"][0]["assets"][0]
+    assert asset["asset_type"] == "image"
+
+    assert asset["original_path"] == image_feature_fixture.assets[0].original_path
+    assert asset["original_system"] == image_feature_fixture.assets[0].original_system


 def test_get_project_features_single_feature_public_access(

diff --git a/geoapi/tests/conftest.py b/geoapi/tests/conftest.py
index b8c2fdc1..9fcb1422 100644
--- a/geoapi/tests/conftest.py
+++ b/geoapi/tests/conftest.py
@@ -509,8 +509,15 @@ def feature_fixture(
 @pytest.fixture(scope="function")
 def image_feature_fixture(
     image_file_fixture, db_session: "sqlalchemy_config.Session"
-) -> "Iterator[FeaturesService]":
-    yield FeaturesService.fromImage(db_session, 1, image_file_fixture, metadata={})
+) -> Feature:
+    yield FeaturesService.fromImage(
+        db_session,
+        1,
+        image_file_fixture,
+        metadata={},
+        original_system="system",
+        original_path="original/path.jpg",
+    )


 @pytest.fixture(scope="function")

From 946ed8cc33b86db20d8f949e164c6dc3d8c0bf98 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Thu, 30 Oct 2025 17:09:06 -0500
Subject: [PATCH 03/28] Update log statement

---
 geoapi/routes/projects.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/geoapi/routes/projects.py b/geoapi/routes/projects.py
index bb79f45f..6659a348 100644
--- a/geoapi/routes/projects.py
+++ b/geoapi/routes/projects.py
@@ -673,7 +673,7 @@ async def upload_feature_file(
     file_obj = io.BytesIO(file_bytes)
     file_obj.filename = file.filename
     logger.info(
-        "UPLOAD Add feature to project:{} for user:{}: {}".format(
+        "Add feature (via http upload) to project:{} for user:{}: {}".format(
             project_id, request.user.username, file.filename
         )
     )

From 5d2205a3d136e84424332cd6fb1595f6f433be4a Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Thu, 30 Oct 2025 17:13:58 -0500
Subject: [PATCH 04/28] Add migration for original_system field
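
The migration only adds a nullable, indexed column, so it should be safe to run
against existing rows. A sketch of applying and rolling back this revision via
Alembic's Python API (assuming the usual alembic.ini at the repo root; the
equivalent CLI is `alembic upgrade head` / `alembic downgrade ba8b92fc7ead`):

    from alembic import command
    from alembic.config import Config

    cfg = Config("alembic.ini")             # assumed config location
    command.upgrade(cfg, "f5f3cfac6089")    # apply this revision
    command.downgrade(cfg, "ba8b92fc7ead")  # revert to the previous revision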
---
 geoapi/migrations/versions/...89_add_original_system_to_feature_asset.py | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 geoapi/migrations/versions/20251030_2212-f5f3cfac6089_add_original_system_to_feature_asset.py

diff --git a/geoapi/migrations/versions/20251030_2212-f5f3cfac6089_add_original_system_to_feature_asset.py b/geoapi/migrations/versions/20251030_2212-f5f3cfac6089_add_original_system_to_feature_asset.py
new file mode 100644
index 00000000..98421df0
--- /dev/null
+++ b/geoapi/migrations/versions/20251030_2212-f5f3cfac6089_add_original_system_to_feature_asset.py
@@ -0,0 +1,40 @@
+"""add_original_system_to_feature_asset
+
+Revision ID: f5f3cfac6089
+Revises: ba8b92fc7ead
+Create Date: 2025-10-30 22:12:13.034836
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "f5f3cfac6089"
+down_revision = "ba8b92fc7ead"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "feature_assets", sa.Column("original_system", sa.String(), nullable=True)
+    )
+    op.create_index(
+        op.f("ix_feature_assets_original_system"),
+        "feature_assets",
+        ["original_system"],
+        unique=False,
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index(
+        op.f("ix_feature_assets_original_system"), table_name="feature_assets"
+    )
+    op.drop_column("feature_assets", "original_system")
+    # ### end Alembic commands ###

From 434a3a9aa9ab535e5fe9453c8aa0f4acd366e0b1 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Thu, 30 Oct 2025 17:51:56 -0500
Subject: [PATCH 05/28] Revert "Add shorter WebSocket timeout for local dev"

This reverts commit 275ac4d0a3818d6ec7b380cedfd158b27cdb3457.
---
 geoapi/routes/websockets.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/geoapi/routes/websockets.py b/geoapi/routes/websockets.py
index 8881c946..7eb2238f 100644
--- a/geoapi/routes/websockets.py
+++ b/geoapi/routes/websockets.py
@@ -1,7 +1,5 @@
 from litestar import WebSocket, websocket
 from litestar.channels import ChannelsPlugin
-
-from geoapi.settings import settings
 from geoapi.utils.decorators import not_anonymous_guard
 from websockets.exceptions import ConnectionClosedOK, ConnectionClosedError
 import logging
@@ -14,10 +12,6 @@ async def websocket_handler(socket: WebSocket, channels: ChannelsPlugin) -> None
     await socket.accept()
     user = socket.scope.get("user")

-    if settings.APP_ENV == "local":
-        # Short timeout so that hot reload is faster when code changes
-        socket._send_timeout = 4
-
     try:
         async with channels.start_subscription(
             [f"notifications-{user.id}"]

From 704516bfc805693d24e47efc989244bcb3907472 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Mon, 3 Nov 2025 21:05:16 -0600
Subject: [PATCH 06/28] Add initial public accessible check model/service/routes
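
A rough sketch of the intended request flow (project id and client object are
illustrative; auth uses the same X-Tapis-Token header as the other routes):

    # kick off the background check (returns 202 Accepted)
    resp = client.post(
        "/projects/123/public-system-access/", headers={"X-Tapis-Token": jwt}
    )
    # {"message": "Public status refresh started",
    #  "public_status_id": 1, "task_id": 42}

    # fetch per-file status plus metadata about the last check
    resp = client.get(
        "/projects/123/public-system-access/files", headers={"X-Tapis-Token": jwt}
    )
    # {"project_id": 123, "check": {...},
    #  "files": [{"asset_type": "image", "is_on_public_system": null, ...}, ...]}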
---
 geoapi/models/feature.py                    |  15 +-
 geoapi/models/public_system_access_check.py |  43 ++++++
 geoapi/routes/projects.py                   |  11 +-
 geoapi/routes/public_system_access.py       | 146 ++++++++++++++++++
 geoapi/services/public_system_access.py     |  77 +++++++++
 .../test_public_system_access_routes.py     | 104 +++++++++++++
 .../test_public_system_access_service.py    | 112 ++++++++++++++
 7 files changed, 505 insertions(+), 3 deletions(-)
 create mode 100644 geoapi/models/public_system_access_check.py
 create mode 100644 geoapi/routes/public_system_access.py
 create mode 100644 geoapi/services/public_system_access.py
 create mode 100644 geoapi/tests/api_tests/test_public_system_access_routes.py
 create mode 100644 geoapi/tests/services/test_public_system_access_service.py

diff --git a/geoapi/models/feature.py b/geoapi/models/feature.py
index 82a0621c..e21f2618 100644
--- a/geoapi/models/feature.py
+++ b/geoapi/models/feature.py
@@ -1,5 +1,5 @@
 import uuid
-from sqlalchemy import Integer, String, ForeignKey, Index, DateTime
+from sqlalchemy import Integer, String, ForeignKey, Index, DateTime, Boolean
 import shapely
 from litestar.dto import dto_field
 from sqlalchemy.dialects.postgresql import JSONB
@@ -65,9 +65,22 @@ class FeatureAsset(Base):
     # system or project id or both
     path = mapped_column(String(), nullable=False)
     display_path = mapped_column(String(), nullable=True)
+
+    # Original source file location
     original_name = mapped_column(String(), nullable=True)
     original_path = mapped_column(String(), nullable=True, index=True)
     original_system = mapped_column(String(), nullable=True, index=True)
+
+    # Current location of the original source file (typically updated when a task
+    # sees that it or a copy is in a publicly accessible location)
+    current_path = mapped_column(String(), nullable=True, index=True)
+    current_system = mapped_column(String(), nullable=True, index=True)
+
+    # Is current_system a publicly accessible system
+    is_on_public_system = mapped_column(Boolean(), nullable=True)
+    # Track when this asset was last checked for public availability
+    last_public_system_check = mapped_column(DateTime(timezone=True), nullable=True)
+
     asset_type = mapped_column(String(), nullable=False, default="image")

     feature = relationship("Feature", overlaps="assets")

diff --git a/geoapi/models/public_system_access_check.py b/geoapi/models/public_system_access_check.py
new file mode 100644
index 00000000..c1c1662b
--- /dev/null
+++ b/geoapi/models/public_system_access_check.py
@@ -0,0 +1,43 @@
+from sqlalchemy import Integer, ForeignKey, DateTime
+from sqlalchemy.orm import relationship, mapped_column
+from sqlalchemy.sql import func
+from geoapi.db import Base
+
+
+class PublicSystemAccessCheck(Base):
+    """
+    Tracks when a project's files were checked for public status.
+
+    This model stores metadata about these public-system-access checks. Individual file status
+    is tracked via FeatureAsset.is_on_public_system and FeatureAsset.last_public_system_check.
+
+    Note: just one check per project. It can be re-run but we only keep info about a
+    single (i.e. current or last) check.
+    """
+
+    __tablename__ = "public_system_access_checks"
+
+    id = mapped_column(Integer, primary_key=True)
+    project_id = mapped_column(
+        ForeignKey("projects.id", ondelete="CASCADE", onupdate="CASCADE"),
+        index=True,
+        nullable=False,
+        unique=True,
+    )
+    task_id = mapped_column(
+        ForeignKey("tasks.id", ondelete="SET NULL"), index=True, nullable=True
+    )
+
+    # Timestamps
+    started_at = mapped_column(DateTime(timezone=True), server_default=func.now())
+    completed_at = mapped_column(DateTime(timezone=True), nullable=True)
+
+    # Relationships
+    project = relationship("Project")
+    task = relationship("Task")
+
+    def __repr__(self):
+        return (
+            f""
+        )

diff --git a/geoapi/routes/projects.py b/geoapi/routes/projects.py
index 6659a348..a739a96a 100644
--- a/geoapi/routes/projects.py
+++ b/geoapi/routes/projects.py
@@ -40,7 +40,7 @@ class OkResponse(BaseModel):
     message: str = "accepted"


-class AssetModel(BaseModel):
+class FeatureAssetModel(BaseModel):
     model_config = ConfigDict(from_attributes=True)

     id: int | None = None
@@ -51,6 +51,7 @@ class FeatureAssetModel(BaseModel):
     original_path: str | None = None
     original_name: str | None = None
     display_path: str | None = None
+    is_on_public_system: bool | None = None


 class FeatureModel(BaseModel):
@@ -61,7 +62,7 @@ class FeatureModel(BaseModel):
     geometry: dict
     properties: dict | None = None
     styles: dict | None = None
-    assets: list[AssetModel] | None = None
+    assets: list[FeatureAssetModel] | None = None


 class FeatureReturnDTO(SQLAlchemyDTO[Feature]):
@@ -1285,6 +1286,11 @@ def feature_enc_hook(feature: Feature) -> FeatureModel:
     )


+from geoapi.routes.public_system_access import (
+    ProjectPublicStatusController,
+)  # noqa: E402
+
+
 projects_router = Router(
     path="/projects",
     route_handlers=[
@@ -1313,6 +1319,7 @@ def feature_enc_hook(feature: Feature) -> FeatureModel:
         ProjectTileServersResourceController,
         ProjectTileServersFilesImportResourceController,
         ProjectTileServerResourceController,
+        ProjectPublicStatusController,
     ],
     type_encoders={Feature: feature_enc_hook},
 )

diff --git a/geoapi/routes/public_system_access.py b/geoapi/routes/public_system_access.py
new file mode 100644
index 00000000..c21bdeb9
--- /dev/null
+++ b/geoapi/routes/public_system_access.py
@@ -0,0 +1,146 @@
+from datetime import datetime
+from pydantic import BaseModel, ConfigDict
+from litestar import Controller, get, post, Request
+from celery import uuid as celery_uuid
+from sqlalchemy.orm import Session
+
+from geoapi.log import logger
+from geoapi.models import Feature, FeatureAsset
+from geoapi.routes.projects import FeatureAssetModel, TaskModel
+from geoapi.utils.decorators import project_permissions_guard
+from geoapi.tasks.public_system_access_check import check_and_update_system_paths
+
+
+class PublicStatusModel(BaseModel):
+    model_config = ConfigDict(from_attributes=True)
+
+    id: int
+    project_id: int
+    started_at: datetime
+    completed_at: datetime | None = None
+    task: TaskModel | None = None
+
+
+class StartPublicStatusRefreshResponse(BaseModel):
+    message: str
+    public_status_id: int
+    task_id: int | None = None
+
+
+class ProjectPublicStatusSummary(BaseModel):
+    project_id: int
+    check: PublicStatusModel | None = None
+    files: list[FeatureAssetModel]
+
+
+class ProjectPublicStatusController(Controller):
+    # Mounted under projects_router which provides /projects prefix
+    path = "/{project_id:int}/public-system-access"
+
+    @post(
+        "/",
+        tags=["projects"],
+        operation_id="start_public_system_access_status_refresh",
+        description="Start checking public system access status for project files",
+        guards=[project_permissions_guard],
+        status_code=202,
+    )
+    def start_public_status_refresh(
+        self, request: Request, db_session: Session, project_id: int
+    ) -> StartPublicStatusRefreshResponse:
+        """
+        Start a background task to refresh public-system-access status of files.
+
+        This checks all feature assets in the project and updates their
+        current_system/current_path if they're found in the published location.
+        """
+        from geoapi.services.public_system_access import PublicSystemAccessService
+
+        user = request.user
+        logger.info(
+            f"Starting public status refresh for project:{project_id} by user:{user.username}"
+        )
+
+        # Check if there's already a running refresh
+        if PublicSystemAccessService.has_running_check(db_session, project_id):
+            existing = PublicSystemAccessService.get(db_session, project_id)
+            return StartPublicStatusRefreshResponse(
+                message="A public status refresh is already in progress for this project",
+                public_status_id=existing.id,
+                task_id=existing.task_id,
+            )
+
+        # Generate Celery task UUID
+        celery_task_uuid = celery_uuid()
+
+        # Create the check record and Task in database
+        public_access_check = PublicSystemAccessService.start_check(
+            db_session, project_id, celery_task_uuid=celery_task_uuid
+        )
+
+        # Start Celery task with the UUID we generated
+        check_and_update_system_paths.apply_async(
+            args=[user.id, project_id, public_access_check.id],
+            task_id=celery_task_uuid,
+        )
+
+        return StartPublicStatusRefreshResponse(
+            message="Public status refresh started",
+            public_status_id=public_access_check.id,
+            task_id=public_access_check.task_id,  # Return the database task.id
+        )
+
+    @get(
+        "/files",
+        tags=["projects"],
+        operation_id="get_public_files_status",
+        description="Get public/private status of all files in the project with last refresh info",
+        guards=[project_permissions_guard],
+    )
+    def get_files_status(
+        self, request: Request, db_session: "Session", project_id: int
+    ) -> ProjectPublicStatusSummary:
+        """
+        Get detailed status of which files are public vs private.
+
+        Single endpoint that returns both file-level details and last refresh info.
+        """
+        from geoapi.services.public_system_access import PublicSystemAccessService
+
+        logger.info(
+            f"Getting public files status for project:{project_id} "
+            f"by user:{request.user.username}"
+        )
+
+        # Get all feature assets
+        # NOTE: Only includes Features with FeatureAssets. Features without assets
+        # (geometry-only, like manually drawn shapes) are excluded from this list.
+        # TODO WG-600: Handle "source file" FeatureAssets (shapefiles, geojson) that represent
+        #      the file that created the feature. These show up in results if they
+        #      have original_system/path set, but may need different UI treatment.
+        feature_assets = (
+            db_session.query(FeatureAsset)
+            .join(Feature, Feature.id == FeatureAsset.feature_id)
+            .filter(Feature.project_id == project_id)
+            .all()
+        )
+
+        # TODO (WG-600 or earlier): also include a list of features that aren't
+        # accounted for here, so users are aware of the gap in coverage
+
+        # Get most recent check
+        public_access_check = PublicSystemAccessService.get(db_session, project_id)
+
+        files_status = [
+            FeatureAssetModel.model_validate(asset) for asset in feature_assets
+        ]
+
+        return ProjectPublicStatusSummary(
+            project_id=project_id,
+            check=(
+                PublicStatusModel.model_validate(public_access_check)
+                if public_access_check
+                else None
+            ),
+            files=files_status,
+        )

diff --git a/geoapi/services/public_system_access.py b/geoapi/services/public_system_access.py
new file mode 100644
index 00000000..cc49df0d
--- /dev/null
+++ b/geoapi/services/public_system_access.py
@@ -0,0 +1,77 @@
+from sqlalchemy import and_
+from sqlalchemy.sql import func
+
+from datetime import datetime, timezone
+
+from geoapi.models.public_system_access_check import PublicSystemAccessCheck
+from geoapi.models import Task, TaskStatus
+
+from sqlalchemy.orm import Session
+
+
+class PublicSystemAccessService:
+    """Service for managing checking if files are accessible from public systems."""
+
+    @staticmethod
+    def start_check(
+        db_session: "Session", project_id: int, celery_task_uuid: str
+    ) -> PublicSystemAccessCheck:
+        check = (
+            db_session.query(PublicSystemAccessCheck)
+            .filter(PublicSystemAccessCheck.project_id == project_id)
+            .first()
+        )
+
+        task = Task(
+            process_id=celery_task_uuid,
+            status=TaskStatus.QUEUED,
+            description="Refreshing public status",
+            project_id=project_id,
+        )
+        db_session.add(task)
+        db_session.flush()  # Flush to get the task.id
+
+        if check:
+            check.started_at = datetime.now(timezone.utc)
+            check.completed_at = None
+            check.task_id = task.id
+        else:
+            check = PublicSystemAccessCheck(project_id=project_id, task_id=task.id)
+            db_session.add(check)
+
+        db_session.commit()
+        return check
+
+    @staticmethod
+    def complete_check(db_session: "Session", project_id: int) -> None:
+        """Mark the check as completed."""
+        check = PublicSystemAccessService.get(db_session, project_id)
+        if not check:
+            raise ValueError(f"No check found for project {project_id}")
+        check.completed_at = func.now()
+        db_session.commit()
+
+    @staticmethod
+    def has_running_check(db_session: Session, project_id: int) -> bool:
+        """Check if there's currently a running refresh for this project."""
+        running = (
+            db_session.query(PublicSystemAccessCheck)
+            .filter(
+                and_(
+                    PublicSystemAccessCheck.project_id == project_id,
+                    PublicSystemAccessCheck.started_at.isnot(None),
+                    PublicSystemAccessCheck.completed_at.is_(None),
+                )
+            )
+            .first()
+        )
+        return running is not None
+
+    @staticmethod
+    def get(db_session: "Session", project_id: int) -> PublicSystemAccessCheck | None:
+        """Get the check for this project, if one exists."""
+        return (
+            db_session.query(PublicSystemAccessCheck)
+            .filter(PublicSystemAccessCheck.project_id == project_id)
+            .first()
+        )

diff --git a/geoapi/tests/api_tests/test_public_system_access_routes.py b/geoapi/tests/api_tests/test_public_system_access_routes.py
new file mode 100644
index 00000000..379efe10
--- /dev/null
+++ b/geoapi/tests/api_tests/test_public_system_access_routes.py
@@ -0,0 +1,104 @@
+import pytest
+from unittest.mock import patch, MagicMock
+
+
+@pytest.fixture
+def check_public_paths_mock():
+    with patch(
+        "geoapi.routes.public_system_access.check_and_update_system_paths"  # Patch where it's used
+    ) as mock_task:
+        mock_task.apply_async.return_value = MagicMock(id="mock-task-id")
+        yield mock_task
+
+
+def test_start_public_status_refresh(
+    test_client, projects_fixture, user1, check_public_paths_mock
+):
+    resp = test_client.post(
+        f"/projects/{projects_fixture.id}/public-system-access/",
+        headers={"X-Tapis-Token": user1.jwt},
+    )
+    assert resp.status_code == 202
+    data = resp.json()
+    assert data["message"] == "Public status refresh started"
+    assert data["public_status_id"] is not None
+    assert data["task_id"] is not None
+
+    # Verify the task was called
+    check_public_paths_mock.apply_async.assert_called_once()
+
+
+def test_start_public_status_refresh_already_running(
+    test_client, projects_fixture, user1, check_public_paths_mock
+):
+    # Start first refresh
+    test_client.post(
+        f"/projects/{projects_fixture.id}/public-system-access/",
+        headers={"X-Tapis-Token": user1.jwt},
+    )
+
+    # Try to start second refresh
+    resp = test_client.post(
+        f"/projects/{projects_fixture.id}/public-system-access/",
+        headers={"X-Tapis-Token": user1.jwt},
+    )
+    assert resp.status_code == 202
+    data = resp.json()
+    assert "already in progress" in data["message"]
+
+
+def test_start_public_status_refresh_unauthorized(
+    test_client, projects_fixture, check_public_paths_mock
+):
+    resp = test_client.post(f"/projects/{projects_fixture.id}/public-system-access/")
+    assert resp.status_code == 401
+    check_public_paths_mock.apply_async.assert_not_called()
+
+
+def test_start_public_status_refresh_forbidden(test_client, projects_fixture, user2):
+    resp = test_client.post(
+        f"/projects/{projects_fixture.id}/public-system-access/",
+        headers={"X-Tapis-Token": user2.jwt},
+    )
+    assert resp.status_code == 403
+
+
+def test_get_public_files_status_empty(test_client, projects_fixture, user1):
+    resp = test_client.get(
+        f"/projects/{projects_fixture.id}/public-system-access/files",
+        headers={"X-Tapis-Token": user1.jwt},
+    )
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["project_id"] == projects_fixture.id
+    assert data["check"] is None
+    assert data["files"] == []
+
+
+def test_get_public_files_status_with_features(
+    test_client, projects_fixture, image_feature_fixture, user1
+):
+    resp = test_client.get(
+        f"/projects/{projects_fixture.id}/public-system-access/files",
+        headers={"X-Tapis-Token": user1.jwt},
+    )
+    assert resp.status_code == 200
+    data = resp.json()
+    assert data["project_id"] == projects_fixture.id
+    assert len(data["files"]) == 1
+    assert data["files"][0]["asset_type"] == "image"
+
+
+def test_get_public_files_status_unauthorized(test_client, projects_fixture):
+    resp = test_client.get(
+        f"/projects/{projects_fixture.id}/public-system-access/files"
+    )
+    assert resp.status_code == 401
+
+
+def test_get_public_files_status_forbidden(test_client, projects_fixture, user2):
+    resp = test_client.get(
+        f"/projects/{projects_fixture.id}/public-system-access/files",
+        headers={"X-Tapis-Token": user2.jwt},
+    )
+    assert resp.status_code == 403

diff --git a/geoapi/tests/services/test_public_system_access_service.py b/geoapi/tests/services/test_public_system_access_service.py
new file mode 100644
index 00000000..3395b4b5
--- /dev/null
+++ b/geoapi/tests/services/test_public_system_access_service.py
@@ -0,0 +1,112 @@
+import pytest
+
+from geoapi.models import TaskStatus
+from geoapi.services.public_system_access import PublicSystemAccessService
+
+
+def test_start_check_creates_new_check(db_session, projects_fixture):
+    """Test start_check creates a new check when none exists"""
+    check = PublicSystemAccessService.start_check(
+        db_session, projects_fixture.id, celery_task_uuid="uuid"
+    )
+
+    assert check is not None
+    assert check.project_id == projects_fixture.id
+    assert check.task_id is not None
+    assert check.task.status == TaskStatus.QUEUED
+    assert check.started_at is not None
+    assert check.completed_at is None
+
+
+def test_start_complete_restart(db_session, projects_fixture):
+    """Test that start and complete and re-start work"""
+    # Create initial check with task_id
+    first_check = PublicSystemAccessService.start_check(
+        db_session, projects_fixture.id, celery_task_uuid="uuid"
+    )
+    first_started_at = first_check.started_at
+    assert (
+        PublicSystemAccessService.has_running_check(db_session, projects_fixture.id)
+        is True
+    )
+
+    # Complete it
+    PublicSystemAccessService.complete_check(db_session, projects_fixture.id)
+    db_session.refresh(first_check)
+    assert first_check.completed_at is not None
+    assert (
+        PublicSystemAccessService.has_running_check(db_session, projects_fixture.id)
+        is False
+    )
+
+    # Start again with different task_id
+    second_check = PublicSystemAccessService.start_check(
+        db_session, projects_fixture.id, celery_task_uuid="uuid"
+    )
+
+    assert second_check.id == first_check.id  # same entry
+    assert second_check.task_id == first_check.task_id  # same row; task_id updated in place
+    assert second_check.completed_at is None
+    assert second_check.started_at != first_started_at
+    assert (
+        PublicSystemAccessService.has_running_check(db_session, projects_fixture.id)
+        is True
+    )
+
+
+def test_complete_check_when_no_check_exists(db_session, projects_fixture):
+    """Test complete_check raises ValueError when no check exists"""
+    with pytest.raises(ValueError, match="No check found for project"):
+        PublicSystemAccessService.complete_check(db_session, projects_fixture.id)
+
+
+def test_has_running_check_returns_false_when_no_check(db_session, projects_fixture):
+    """Test has_running_check returns False when no check exists"""
+    result = PublicSystemAccessService.has_running_check(
+        db_session, projects_fixture.id
+    )
+    assert result is False
+
+
+def test_has_running_check_returns_true_for_running_check(db_session, projects_fixture):
+    """Test has_running_check returns True for a running check"""
+    PublicSystemAccessService.start_check(
+        db_session, projects_fixture.id, celery_task_uuid="uuid"
+    )
+
+    result = PublicSystemAccessService.has_running_check(
+        db_session, projects_fixture.id
+    )
+    assert result is True
+
+
+def test_has_running_check_returns_false_for_completed_check(
+    db_session, projects_fixture
+):
+    """Test has_running_check returns False when check is completed"""
+    PublicSystemAccessService.start_check(
+        db_session, projects_fixture.id, celery_task_uuid="uuid"
+    )
+    PublicSystemAccessService.complete_check(db_session, projects_fixture.id)
+
+    result = PublicSystemAccessService.has_running_check(
+        db_session, projects_fixture.id
+    )
+    assert result is False
+
+
+def test_get_returns_check(db_session, projects_fixture):
+    """Test get returns the check for a project"""
+    original_check = PublicSystemAccessService.start_check(
+        db_session, projects_fixture.id, celery_task_uuid="uuid"
+    )
+
+    retrieved_check = PublicSystemAccessService.get(db_session, projects_fixture.id)
+    assert retrieved_check.id == original_check.id
+    assert retrieved_check.project_id == projects_fixture.id
+
+
+def test_get_returns_none_when_no_check(db_session, projects_fixture):
+    """Test get returns None when no check exists"""
+    result = PublicSystemAccessService.get(db_session, projects_fixture.id)
+    assert result is None
""" diff --git a/geoapi/routes/public_system_access.py b/geoapi/routes/file_location_status.py similarity index 75% rename from geoapi/routes/public_system_access.py rename to geoapi/routes/file_location_status.py index c21bdeb9..6d0379be 100644 --- a/geoapi/routes/public_system_access.py +++ b/geoapi/routes/file_location_status.py @@ -8,10 +8,10 @@ from geoapi.models import Feature, FeatureAsset from geoapi.routes.projects import FeatureAssetModel, TaskModel from geoapi.utils.decorators import project_permissions_guard -from geoapi.tasks.public_system_access_check import check_and_update_system_paths +from geoapi.tasks.file_location_check import check_and_update_file_locations -class PublicStatusModel(BaseModel): +class FileLocationStatusModel(BaseModel): model_config = ConfigDict(from_attributes=True) id: int @@ -21,40 +21,40 @@ class PublicStatusModel(BaseModel): task: TaskModel | None = None -class StartPublicStatusRefreshResponse(BaseModel): +class StartFileLocationStatusRefreshResponse(BaseModel): message: str public_status_id: int task_id: int | None = None -class ProjectPublicStatusSummary(BaseModel): +class ProjectFileLocationStatusSummary(BaseModel): project_id: int - check: PublicStatusModel | None = None + check: FileLocationStatusModel | None = None files: list[FeatureAssetModel] -class ProjectPublicStatusController(Controller): +class ProjectFileLocationStatusController(Controller): # Mounted under projects_router which provides /projects prefix - path = "/{project_id:int}/public-system-access" + path = "/{project_id:int}/file-location-status" @post( "/", tags=["projects"], - operation_id="start_public_system_access_status_refresh", - description="Start checking public system access status for project files", + operation_id="start_file_location_refresh", + description="Start checking and updating file location (and public access) for project files", guards=[project_permissions_guard], status_code=202, ) def start_public_status_refresh( self, request: Request, db_session: Session, project_id: int - ) -> StartPublicStatusRefreshResponse: + ) -> StartFileLocationStatusRefreshResponse: """ Start a background task to refresh public-system-access status of files. This checks all feature assets in the project and updates their current_system/current_path if they're found in the published location. 
""" - from geoapi.services.public_system_access import PublicSystemAccessService + from geoapi.services.file_location_status import FileLocationStatusService user = request.user logger.info( @@ -62,9 +62,9 @@ def start_public_status_refresh( ) # Check if there's already a running refresh - if PublicSystemAccessService.has_running_check(db_session, project_id): - existing = PublicSystemAccessService.get(db_session, project_id) - return StartPublicStatusRefreshResponse( + if FileLocationStatusService.has_running_check(db_session, project_id): + existing = FileLocationStatusService.get(db_session, project_id) + return StartFileLocationStatusRefreshResponse( message="A public status refresh is already in progress for this project", public_status_id=existing.id, task_id=existing.task_id, @@ -74,17 +74,17 @@ def start_public_status_refresh( celery_task_uuid = celery_uuid() # Create the check record and Task in database - public_access_check = PublicSystemAccessService.start_check( + public_access_check = FileLocationStatusService.start_check( db_session, project_id, celery_task_uuid=celery_task_uuid ) # Start Celery task with the UUID we generated - check_and_update_system_paths.apply_async( + check_and_update_file_locations.apply_async( args=[user.id, project_id, public_access_check.id], task_id=celery_task_uuid, ) - return StartPublicStatusRefreshResponse( + return StartFileLocationStatusRefreshResponse( message="Public status refresh started", public_status_id=public_access_check.id, task_id=public_access_check.task_id, # Return the database task.id @@ -99,13 +99,13 @@ def start_public_status_refresh( ) def get_files_status( self, request: Request, db_session: "Session", project_id: int - ) -> ProjectPublicStatusSummary: + ) -> ProjectFileLocationStatusSummary: """ Get detailed status of which files are public vs private. Single endpoint that returns both file-level details and last refresh info. 
""" - from geoapi.services.public_system_access import PublicSystemAccessService + from geoapi.services.file_location_status import FileLocationStatusService logger.info( f"Getting public files status for project:{project_id} " @@ -129,16 +129,16 @@ def get_files_status( # users are aware of a gap of knowledge # Get most recent check - public_access_check = PublicSystemAccessService.get(db_session, project_id) + public_access_check = FileLocationStatusService.get(db_session, project_id) files_status = [ FeatureAssetModel.model_validate(asset) for asset in feature_assets ] - return ProjectPublicStatusSummary( + return ProjectFileLocationStatusSummary( project_id=project_id, check=( - PublicStatusModel.model_validate(public_access_check) + FileLocationStatusModel.model_validate(public_access_check) if public_access_check else None ), diff --git a/geoapi/routes/projects.py b/geoapi/routes/projects.py index a739a96a..09a75332 100644 --- a/geoapi/routes/projects.py +++ b/geoapi/routes/projects.py @@ -51,6 +51,8 @@ class FeatureAssetModel(BaseModel): original_path: str | None = None original_name: str | None = None display_path: str | None = None + current_system: str | None = None + current_path: str | None = None is_on_public_system: bool | None = None @@ -1286,9 +1288,9 @@ def feature_enc_hook(feature: Feature) -> FeatureModel: ) -from geoapi.routes.public_system_access import ( - ProjectPublicStatusController, -) # noqa: E402 +from geoapi.routes.file_location_status import ( # noqa: E402 + ProjectFileLocationStatusController, +) projects_router = Router( @@ -1319,7 +1321,7 @@ def feature_enc_hook(feature: Feature) -> FeatureModel: ProjectTileServersResourceController, ProjectTileServersFilesImportResourceController, ProjectTileServerResourceController, - ProjectPublicStatusController, + ProjectFileLocationStatusController, ], type_encoders={Feature: feature_enc_hook}, ) diff --git a/geoapi/services/public_system_access.py b/geoapi/services/file_location_status.py similarity index 64% rename from geoapi/services/public_system_access.py rename to geoapi/services/file_location_status.py index cc49df0d..ecf9b2a1 100644 --- a/geoapi/services/public_system_access.py +++ b/geoapi/services/file_location_status.py @@ -3,24 +3,20 @@ from datetime import datetime, timezone -from geoapi.models.public_system_access_check import PublicSystemAccessCheck +from geoapi.models.file_location_check import FileLocationCheck from geoapi.models import Task, TaskStatus from sqlalchemy.orm import Session -class PublicSystemAccessService: +class FileLocationStatusService: """Service for managing checking if files are accessible from public systems.""" @staticmethod def start_check( db_session: "Session", project_id: int, celery_task_uuid: str - ) -> PublicSystemAccessCheck: - check = ( - db_session.query(PublicSystemAccessCheck) - .filter(PublicSystemAccessCheck.project_id == project_id) - .first() - ) + ) -> FileLocationCheck: + check = FileLocationStatusService.get(db_session, project_id) task = Task( process_id=celery_task_uuid, @@ -36,7 +32,7 @@ def start_check( check.completed_at = None check.task_id = task.id else: - check = PublicSystemAccessCheck(project_id=project_id, task_id=task.id) + check = FileLocationCheck(project_id=project_id, task_id=task.id) db_session.add(check) db_session.commit() @@ -45,7 +41,7 @@ def start_check( @staticmethod def complete_check(db_session: "Session", project_id: int) -> None: """Mark the check as completed.""" - check = PublicSystemAccessService.get(db_session, project_id) + 
check = FileLocationStatusService.get(db_session, project_id) if not check: raise ValueError(f"No check found for project {project_id}") check.completed_at = func.now() @@ -55,23 +51,25 @@ def complete_check(db_session: "Session", project_id: int) -> None: def has_running_check(db_session: Session, project_id: int) -> bool: """Check if there's currently a running refresh for this project.""" running = ( - db_session.query(PublicSystemAccessCheck) + db_session.query(FileLocationCheck) .filter( and_( - PublicSystemAccessCheck.project_id == project_id, - PublicSystemAccessCheck.started_at.isnot(None), - PublicSystemAccessCheck.completed_at.is_(None), + FileLocationCheck.project_id == project_id, + FileLocationCheck.started_at.isnot(None), + FileLocationCheck.completed_at.is_(None), ) ) .first() ) return running is not None + # TODO add error check + @staticmethod - def get(db_session: "Session", project_id: int) -> PublicSystemAccessCheck | None: + def get(db_session: "Session", project_id: int) -> FileLocationCheck | None: """Get the currently running refresh for this project.""" return ( - db_session.query(PublicSystemAccessCheck) - .filter(PublicSystemAccessCheck.project_id == project_id) + db_session.query(FileLocationCheck) + .filter(FileLocationCheck.project_id == project_id) .first() ) diff --git a/geoapi/tests/api_tests/test_public_system_access_routes.py b/geoapi/tests/api_tests/test_file_location_status_routes.py similarity index 79% rename from geoapi/tests/api_tests/test_public_system_access_routes.py rename to geoapi/tests/api_tests/test_file_location_status_routes.py index 379efe10..15862dc7 100644 --- a/geoapi/tests/api_tests/test_public_system_access_routes.py +++ b/geoapi/tests/api_tests/test_file_location_status_routes.py @@ -5,7 +5,7 @@ @pytest.fixture def check_public_paths_mock(): with patch( - "geoapi.routes.public_system_access.check_and_update_system_paths" # Patch where it's used + "geoapi.routes.file_location_status.check_and_update_file_locations" # Patch where it's used ) as mock_task: mock_task.apply_async.return_value = MagicMock(id="mock-task-id") yield mock_task @@ -15,7 +15,7 @@ def test_start_public_status_refresh( test_client, projects_fixture, user1, check_public_paths_mock ): resp = test_client.post( - f"/projects/{projects_fixture.id}/public-system-access/", + f"/projects/{projects_fixture.id}/file-location-status/", headers={"X-Tapis-Token": user1.jwt}, ) assert resp.status_code == 202 @@ -33,13 +33,13 @@ def test_start_public_status_refresh_already_running( ): # Start first refresh test_client.post( - f"/projects/{projects_fixture.id}/public-system-access/", + f"/projects/{projects_fixture.id}/file-location-status/", headers={"X-Tapis-Token": user1.jwt}, ) # Try to start second refresh resp = test_client.post( - f"/projects/{projects_fixture.id}/public-system-access/", + f"/projects/{projects_fixture.id}/file-location-status/", headers={"X-Tapis-Token": user1.jwt}, ) assert resp.status_code == 202 @@ -50,14 +50,14 @@ def test_start_public_status_refresh_already_running( def test_start_public_status_refresh_unauthorized( test_client, projects_fixture, check_public_paths_mock ): - resp = test_client.post(f"/projects/{projects_fixture.id}/public-system-access/") + resp = test_client.post(f"/projects/{projects_fixture.id}/file-location-status/") assert resp.status_code == 401 check_public_paths_mock.apply_async.assert_not_called() def test_start_public_status_refresh_forbidden(test_client, projects_fixture, user2): resp = test_client.post( - 
f"/projects/{projects_fixture.id}/public-system-access/", + f"/projects/{projects_fixture.id}/file-location-status/", headers={"X-Tapis-Token": user2.jwt}, ) assert resp.status_code == 403 @@ -65,7 +65,7 @@ def test_start_public_status_refresh_forbidden(test_client, projects_fixture, us def test_get_public_files_status_empty(test_client, projects_fixture, user1): resp = test_client.get( - f"/projects/{projects_fixture.id}/public-system-access/files", + f"/projects/{projects_fixture.id}/file-location-status/files", headers={"X-Tapis-Token": user1.jwt}, ) assert resp.status_code == 200 @@ -79,7 +79,7 @@ def test_get_public_files_status_with_features( test_client, projects_fixture, image_feature_fixture, user1 ): resp = test_client.get( - f"/projects/{projects_fixture.id}/public-system-access/files", + f"/projects/{projects_fixture.id}/file-location-status/files", headers={"X-Tapis-Token": user1.jwt}, ) assert resp.status_code == 200 @@ -91,14 +91,14 @@ def test_get_public_files_status_with_features( def test_get_public_files_status_unauthorized(test_client, projects_fixture): resp = test_client.get( - f"/projects/{projects_fixture.id}/public-system-access/files" + f"/projects/{projects_fixture.id}/file-location-status/files" ) assert resp.status_code == 401 def test_get_public_files_status_forbidden(test_client, projects_fixture, user2): resp = test_client.get( - f"/projects/{projects_fixture.id}/public-system-access/files", + f"/projects/{projects_fixture.id}/file-location-status/files", headers={"X-Tapis-Token": user2.jwt}, ) assert resp.status_code == 403 diff --git a/geoapi/tests/services/test_public_system_access_service.py b/geoapi/tests/services/test_public_system_access_service.py index 3395b4b5..5e786c90 100644 --- a/geoapi/tests/services/test_public_system_access_service.py +++ b/geoapi/tests/services/test_public_system_access_service.py @@ -1,12 +1,12 @@ import pytest from geoapi.models import TaskStatus -from geoapi.services.public_system_access import PublicSystemAccessService +from geoapi.services.file_location_status import FileLocationStatusService def test_start_check_creates_new_check(db_session, projects_fixture): """Test start_check creates a new check when none exists""" - check = PublicSystemAccessService.start_check( + check = FileLocationStatusService.start_check( db_session, projects_fixture.id, celery_task_uuid="uuid" ) @@ -21,26 +21,26 @@ def test_start_check_creates_new_check(db_session, projects_fixture): def test_start_complete_restart(db_session, projects_fixture): """Test that start and complete and re-start work""" # Create initial check with task_id - first_check = PublicSystemAccessService.start_check( + first_check = FileLocationStatusService.start_check( db_session, projects_fixture.id, celery_task_uuid="uuid" ) first_started_at = first_check.started_at assert ( - PublicSystemAccessService.has_running_check(db_session, projects_fixture.id) + FileLocationStatusService.has_running_check(db_session, projects_fixture.id) is True ) # Complete it - PublicSystemAccessService.complete_check(db_session, projects_fixture.id) + FileLocationStatusService.complete_check(db_session, projects_fixture.id) db_session.refresh(first_check) assert first_check.completed_at is not None assert ( - PublicSystemAccessService.has_running_check(db_session, projects_fixture.id) + FileLocationStatusService.has_running_check(db_session, projects_fixture.id) is False ) # Start again with different task_id - second_check = PublicSystemAccessService.start_check( + second_check = 
FileLocationStatusService.start_check(
         db_session, projects_fixture.id, celery_task_uuid="uuid"
     )
 
@@ -49,7 +49,7 @@ def test_start_complete_restart(db_session, projects_fixture):
     assert second_check.completed_at is None
     assert second_check.started_at != first_started_at
     assert (
-        PublicSystemAccessService.has_running_check(db_session, projects_fixture.id)
+        FileLocationStatusService.has_running_check(db_session, projects_fixture.id)
         is True
     )
 
@@ -57,12 +57,12 @@ def test_complete_check_when_no_check_exists(db_session, projects_fixture):
     """Test complete_check raises ValueError when no check exists"""
     with pytest.raises(ValueError, match="No check found for project"):
-        PublicSystemAccessService.complete_check(db_session, projects_fixture.id)
+        FileLocationStatusService.complete_check(db_session, projects_fixture.id)
 
 
 def test_has_running_check_returns_false_when_no_check(db_session, projects_fixture):
     """Test has_running_check returns False when no check exists"""
-    result = PublicSystemAccessService.has_running_check(
+    result = FileLocationStatusService.has_running_check(
         db_session, projects_fixture.id
     )
     assert result is False
@@ -70,11 +70,11 @@ def test_has_running_check_returns_false_when_no_check(db_session, projects_fixt
 
 def test_has_running_check_returns_true_for_running_check(db_session, projects_fixture):
     """Test has_running_check returns True for a running check"""
-    PublicSystemAccessService.start_check(
+    FileLocationStatusService.start_check(
         db_session, projects_fixture.id, celery_task_uuid="uuid"
     )
 
-    result = PublicSystemAccessService.has_running_check(
+    result = FileLocationStatusService.has_running_check(
         db_session, projects_fixture.id
     )
     assert result is True
@@ -84,12 +84,12 @@ def test_has_running_check_returns_false_for_completed_check(
     db_session, projects_fixture
 ):
     """Test has_running_check returns False when check is completed"""
-    PublicSystemAccessService.start_check(
+    FileLocationStatusService.start_check(
         db_session, projects_fixture.id, celery_task_uuid="uuid"
     )
-    PublicSystemAccessService.complete_check(db_session, projects_fixture.id)
+    FileLocationStatusService.complete_check(db_session, projects_fixture.id)
 
-    result = PublicSystemAccessService.has_running_check(
+    result = FileLocationStatusService.has_running_check(
         db_session, projects_fixture.id
     )
     assert result is False
@@ -97,16 +97,16 @@ def test_get_returns_check(db_session, projects_fixture):
     """Test get returns the check for a project"""
-    original_check = PublicSystemAccessService.start_check(
+    original_check = FileLocationStatusService.start_check(
        db_session, projects_fixture.id, celery_task_uuid="uuid"
     )
 
-    retrieved_check = PublicSystemAccessService.get(db_session, projects_fixture.id)
+    retrieved_check = FileLocationStatusService.get(db_session, projects_fixture.id)
     assert retrieved_check.id == original_check.id
     assert retrieved_check.project_id == projects_fixture.id
 
 
 def test_get_returns_none_when_no_check(db_session, projects_fixture):
     """Test get returns None when no check exists"""
-    result = PublicSystemAccessService.get(db_session, projects_fixture.id)
+    result = FileLocationStatusService.get(db_session, projects_fixture.id)
     assert result is None

From 3728e6a1bfb62c174b1601099930fedcf06c7be2 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Tue, 4 Nov 2025 15:33:42 -0600
Subject: [PATCH 08/28] Put pydantic models in separate file

---
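Notes: with this refactor, route modules import their pydantic models and DTOs
from geoapi.schema.projects instead of defining them in geoapi.routes.projects.
A minimal usage sketch (hypothetical values; field names follow the TaskModel
definition moved in this patch):

    from geoapi.schema.projects import TaskModel

    # all TaskModel fields are optional and default to None
    task = TaskModel(id=1, status="RUNNING", description="demo task")
    print(task.model_dump())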
geoapi/routes/file_location_status.py | 2 +- geoapi/routes/projects.py | 256 +++----------------------- geoapi/schema/projects.py | 239 ++++++++++++++++++++++++ 3 files changed, 263 insertions(+), 234 deletions(-) create mode 100644 geoapi/schema/projects.py diff --git a/geoapi/routes/file_location_status.py b/geoapi/routes/file_location_status.py index 6d0379be..c865a1ba 100644 --- a/geoapi/routes/file_location_status.py +++ b/geoapi/routes/file_location_status.py @@ -6,7 +6,7 @@ from geoapi.log import logger from geoapi.models import Feature, FeatureAsset -from geoapi.routes.projects import FeatureAssetModel, TaskModel +from geoapi.schema.projects import FeatureAssetModel, TaskModel from geoapi.utils.decorators import project_permissions_guard from geoapi.tasks.file_location_check import check_and_update_file_locations diff --git a/geoapi/routes/projects.py b/geoapi/routes/projects.py index 09a75332..a9ac3d14 100644 --- a/geoapi/routes/projects.py +++ b/geoapi/routes/projects.py @@ -1,15 +1,12 @@ import io from datetime import datetime from typing import TYPE_CHECKING, Annotated -from pydantic import BaseModel, Field, ConfigDict +from pydantic import BaseModel, Field from litestar import Controller, get, Request, post, delete, put, Router from litestar.datastructures import UploadFile from litestar.params import Body from litestar.enums import RequestEncodingType from litestar.exceptions import NotAuthorizedException -from litestar.plugins.sqlalchemy import SQLAlchemyDTO -from litestar.dto import DTOConfig -from uuid import UUID from geojson_pydantic import Feature as GeoJSONFeature from geoapi.log import logger from geoapi.services.features import FeaturesService @@ -30,240 +27,33 @@ check_access_and_get_project, is_anonymous, ) -from geoapi.schema.tapis import TapisFilePath +from geoapi.schema.projects import ( + OkResponse, + FeatureModel, + FeatureReturnDTO, + FeatureCollectionModel, + ProjectPayloadModel, + ProjectUpdatePayloadModel, + ProjectDTO, + UserDTO, + UserPayloadModel, + TaskDTO, + PointCloudDTO, + PointCloudModel, + OverlayDTO, + TileServerDTO, + TileServerModel, + TapisFileUploadModel, + TapisFileImportModel, + AddOverlayBody, + TapisOverlayImportBody, +) + if TYPE_CHECKING: from sqlalchemy.orm import Session -class OkResponse(BaseModel): - message: str = "accepted" - - -class FeatureAssetModel(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: int | None = None - path: str | None = None - uuid: UUID | None = None - asset_type: str | None = None - original_system: str | None = None - original_path: str | None = None - original_name: str | None = None - display_path: str | None = None - current_system: str | None = None - current_path: str | None = None - is_on_public_system: bool | None = None - - -class FeatureModel(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: int - project_id: int - geometry: dict - properties: dict | None = None - styles: dict | None = None - assets: list[FeatureAssetModel] | None = None - - -class FeatureReturnDTO(SQLAlchemyDTO[Feature]): - config = DTOConfig( - include={ - "id", - "project_id", - "geometry", - "properties", - "styles", - "assets", - } - ) - - -class FeatureCollectionModel(BaseModel): - type: str = "FeatureCollection" - features: list[FeatureModel] | None = None - - -class ProjectPayloadModel(BaseModel): - name: str - description: str - public: bool | None = None - system_file: str | None = None - system_id: str | None = None - system_path: str | None = None - watch_content: bool | None = 
None - watch_users: bool | None = None - - -class ProjectUpdatePayloadModel(BaseModel): - name: str | None = None - description: str | None = None - public: bool | None = None - - -class ProjectDTO(SQLAlchemyDTO[Project]): - config = DTOConfig( - include={ - "id", - "uuid", - "name", - "description", - "public", - "system_file", - "system_id", - "system_path", - "deletable", - }, - ) - - -class UserDTO(SQLAlchemyDTO[User]): - config = DTOConfig( - include={ - "id", - "username", - } - ) - - -class UserModel(BaseModel): - model_config = ConfigDict(from_attributes=True) - - username: str - id: int | None = None - - -class UserPayloadModel(UserModel): - admin: bool = False - - -class TaskDTO(SQLAlchemyDTO[Task]): - model_config = ConfigDict(from_attributes=True) - config = DTOConfig( - # skipping process_id - include={ - "id", - "status", - "description", - "project_id", - "latest_message", - "created", - "updated", - }, - ) - - -class TaskModel(BaseModel): - id: int | None = None - status: str | None = None - description: str | None = None - project_id: int | None = None - latest_message: str | None = None - created: datetime | None = None - updated: datetime | None = None - - -class PointCloudDTO(SQLAlchemyDTO[PointCloud]): - config = DTOConfig( - include={ - "id", - "description", - "conversion_parameters", - "files_info", - "feature_id", - "task", - "project_id", - }, - ) - - -class PointCloudModel(BaseModel): - model_config = ConfigDict(from_attributes=True) - - files_info: dict | None = None - id: int | None = None - description: str | None = None - conversion_parameters: str | None = None - feature_id: int | None = None - task: TaskModel = None - project_id: int | None = None - - -class OverlayDTO(SQLAlchemyDTO[Overlay]): - config = DTOConfig( - exclude={"project"}, - ) - - -class TileServerDTO(SQLAlchemyDTO[TileServer]): - config = DTOConfig( - include={ - "id", - "name", - "type", - "kind", - "internal", - "uuid", - "url", - "attribution", - "tileOptions", - "uiOptions", - } - ) - - -class TileServerModel(BaseModel): - model_config = ConfigDict(from_attributes=True) - - id: int | None = None - name: str | None = None - type: str | None = None - kind: str | None = None - internal: bool | None = None - uuid: UUID | None = None - url: str | None = None - attribution: str | None = None - tileOptions: dict | None = None - uiOptions: dict | None = None - - -# TODO: replace with TapisFilePath (and update client software) -class TapisFileUploadModel(BaseModel): - system_id: str | None = None - path: str | None = None - - -class TapisSaveFileModel(BaseModel): - system_id: str - path: str - file_name: str - observable: bool | None = None - watch_content: bool | None = None - - -class TapisFileImportModel(BaseModel): - files: list[TapisFilePath] - - -class OverlayPostBody(BaseModel): - label: str - minLon: float - minLat: float - maxLon: float - maxLat: float - - -class AddOverlayBody(OverlayPostBody): - model_config = ConfigDict(arbitrary_types_allowed=True) - file: UploadFile - - -class TapisOverlayImportBody(OverlayPostBody): - system_id: str - path: str - - class ProjectsListingController(Controller): path = "/" return_dto = ProjectDTO diff --git a/geoapi/schema/projects.py b/geoapi/schema/projects.py new file mode 100644 index 00000000..70b13d31 --- /dev/null +++ b/geoapi/schema/projects.py @@ -0,0 +1,239 @@ +from datetime import datetime +from pydantic import BaseModel, ConfigDict +from litestar.datastructures import UploadFile +from litestar.plugins.sqlalchemy import SQLAlchemyDTO +from 
litestar.dto import DTOConfig +from uuid import UUID + +from geoapi.models import Task, Project, Feature, TileServer, Overlay, PointCloud, User +from geoapi.schema.tapis import TapisFilePath + + +class OkResponse(BaseModel): + message: str = "accepted" + + +class FeatureAssetModel(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + feature_id: int + path: str | None = None + uuid: UUID + asset_type: str | None = None + original_system: str | None = None + original_path: str | None = None + original_name: str | None = None + display_path: str | None = None + current_system: str | None = None + current_path: str | None = None + last_public_check: datetime | None = None + is_on_public_system: bool | None = None + + +class FeatureModel(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int + project_id: int + geometry: dict + properties: dict | None = None + styles: dict | None = None + assets: list[FeatureAssetModel] | None = None + + +class FeatureReturnDTO(SQLAlchemyDTO[Feature]): + config = DTOConfig( + include={ + "id", + "project_id", + "geometry", + "properties", + "styles", + "assets", + } + ) + + +class FeatureCollectionModel(BaseModel): + type: str = "FeatureCollection" + features: list[FeatureModel] | None = None + + +class ProjectPayloadModel(BaseModel): + name: str + description: str + public: bool | None = None + system_file: str | None = None + system_id: str | None = None + system_path: str | None = None + watch_content: bool | None = None + watch_users: bool | None = None + + +class ProjectUpdatePayloadModel(BaseModel): + name: str | None = None + description: str | None = None + public: bool | None = None + + +class ProjectDTO(SQLAlchemyDTO[Project]): + config = DTOConfig( + include={ + "id", + "uuid", + "name", + "description", + "public", + "system_file", + "system_id", + "system_path", + "deletable", + }, + ) + + +class UserDTO(SQLAlchemyDTO[User]): + config = DTOConfig( + include={ + "id", + "username", + } + ) + + +class UserModel(BaseModel): + model_config = ConfigDict(from_attributes=True) + + username: str + id: int | None = None + + +class UserPayloadModel(UserModel): + admin: bool = False + + +class TaskDTO(SQLAlchemyDTO[Task]): + model_config = ConfigDict(from_attributes=True) + config = DTOConfig( + # skipping process_id + include={ + "id", + "status", + "description", + "project_id", + "latest_message", + "created", + "updated", + }, + ) + + +class TaskModel(BaseModel): + id: int | None = None + status: str | None = None + description: str | None = None + project_id: int | None = None + latest_message: str | None = None + created: datetime | None = None + updated: datetime | None = None + + +class PointCloudDTO(SQLAlchemyDTO[PointCloud]): + config = DTOConfig( + include={ + "id", + "description", + "conversion_parameters", + "files_info", + "feature_id", + "task", + "project_id", + }, + ) + + +class PointCloudModel(BaseModel): + model_config = ConfigDict(from_attributes=True) + + files_info: dict | None = None + id: int | None = None + description: str | None = None + conversion_parameters: str | None = None + feature_id: int | None = None + task: TaskModel = None + project_id: int | None = None + + +class OverlayDTO(SQLAlchemyDTO[Overlay]): + config = DTOConfig( + exclude={"project"}, + ) + + +class TileServerDTO(SQLAlchemyDTO[TileServer]): + config = DTOConfig( + include={ + "id", + "name", + "type", + "kind", + "internal", + "uuid", + "url", + "attribution", + "tileOptions", + "uiOptions", + } + ) + + +class 
TileServerModel(BaseModel): + model_config = ConfigDict(from_attributes=True) + + id: int | None = None + name: str | None = None + type: str | None = None + kind: str | None = None + internal: bool | None = None + uuid: UUID | None = None + url: str | None = None + attribution: str | None = None + tileOptions: dict | None = None + uiOptions: dict | None = None + + +# TODO: replace with TapisFilePath (and update client software) +class TapisFileUploadModel(BaseModel): + system_id: str | None = None + path: str | None = None + + +class TapisSaveFileModel(BaseModel): + system_id: str + path: str + file_name: str + observable: bool | None = None + watch_content: bool | None = None + + +class TapisFileImportModel(BaseModel): + files: list[TapisFilePath] + + +class OverlayPostBody(BaseModel): + label: str + minLon: float + minLat: float + maxLon: float + maxLat: float + + +class AddOverlayBody(OverlayPostBody): + model_config = ConfigDict(arbitrary_types_allowed=True) + file: UploadFile + + +class TapisOverlayImportBody(OverlayPostBody): + system_id: str + path: str From 24782a350b2048ec6836d85a79f9b7d8bd2880b2 Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Thu, 6 Nov 2025 07:42:13 -0600 Subject: [PATCH 09/28] Fix restart workers to include heavy worker --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7cffc43f..af77f688 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ stop: .PHONY: restart-workers restart-workers: ## Restart workers - docker compose -f devops/docker-compose.local.yml --env-file .env restart workers + docker compose -f devops/docker-compose.local.yml --env-file .env restart workers workers-heavy .PHONY: restart-nginx restart-nginx: ## Restart nginx From 0812ea71ac51279bd52c70ccd3e04a858388846d Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Thu, 6 Nov 2025 07:42:55 -0600 Subject: [PATCH 10/28] Fix initdb.py --- geoapi/initdb.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/geoapi/initdb.py b/geoapi/initdb.py index 3543fbd3..d55f798a 100644 --- a/geoapi/initdb.py +++ b/geoapi/initdb.py @@ -3,7 +3,7 @@ from alembic.config import Config from alembic import command from geoapi.settings import settings, UnitTestingConfig -from geoapi.db import Base, engine, get_db_connection_string +from geoapi.db import Base, get_db_connection_string def setup_local_dev_database(): @@ -48,8 +48,6 @@ def setup_unit_test_database(): Base.metadata.create_all(test_engine) print("All tables created in the test database based on current models.") - return engine - if __name__ == "__main__": setup_local_dev_database() From cb96864b7ad1b6250e25dc118259d8b41d08ee8a Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Thu, 6 Nov 2025 08:14:21 -0600 Subject: [PATCH 11/28] Add file_location_check task --- geoapi/celery_app.py | 1 + geoapi/models/__init__.py | 1 + geoapi/models/file_location_check.py | 8 +- geoapi/routes/file_location_status.py | 66 ++- geoapi/schema/projects.py | 4 +- geoapi/tasks/file_location_check.py | 389 ++++++++++++++++++ geoapi/tests/conftest.py | 6 +- .../tasks_tests/test_file_location_check.py | 302 ++++++++++++++ 8 files changed, 753 insertions(+), 24 deletions(-) create mode 100644 geoapi/tasks/file_location_check.py create mode 100644 geoapi/tests/tasks_tests/test_file_location_check.py diff --git a/geoapi/celery_app.py b/geoapi/celery_app.py index c54816d5..15c94f10 100644 --- a/geoapi/celery_app.py +++ b/geoapi/celery_app.py @@ -25,6 +25,7 @@ "geoapi.tasks.streetview", 
"geoapi.tasks.projects", "geoapi.tasks.external_data", + "geoapi.tasks.file_location_check", ) # Define the queues diff --git a/geoapi/models/__init__.py b/geoapi/models/__init__.py index 9307a10c..e61f8c58 100644 --- a/geoapi/models/__init__.py +++ b/geoapi/models/__init__.py @@ -1,5 +1,6 @@ from .project import Project, ProjectUser from .feature import Feature, FeatureAsset +from .file_location_check import FileLocationCheck from .point_cloud import PointCloud from .task import Task, TaskStatus from .users import User diff --git a/geoapi/models/file_location_check.py b/geoapi/models/file_location_check.py index ee2b0e31..b13e77ef 100644 --- a/geoapi/models/file_location_check.py +++ b/geoapi/models/file_location_check.py @@ -1,5 +1,5 @@ from sqlalchemy import Integer, ForeignKey, DateTime -from sqlalchemy.orm import relationship, mapped_column +from sqlalchemy.orm import Mapped, relationship, mapped_column from sqlalchemy.sql import func from geoapi.db import Base @@ -9,7 +9,7 @@ class FileLocationCheck(Base): Tracks when a project's files were checked for location (including if accessible to public) This model stores metadata about this public-system-access checks. Individual file status - is tracked via FeatureAsset.is_on_public_system and FeatureAsset.last_public_check. + is tracked via FeatureAsset.is_on_public_system and FeatureAsset.last_public_system_check. Note just one check per file. It can be re-run, but we only keep info about a single (i.e. current or last) check. @@ -32,6 +32,10 @@ class FileLocationCheck(Base): started_at = mapped_column(DateTime(timezone=True), server_default=func.now()) completed_at = mapped_column(DateTime(timezone=True), nullable=True) + total_files: Mapped[int | None] = mapped_column(Integer, default=0) + files_checked: Mapped[int | None] = mapped_column(Integer, default=0) + files_failed: Mapped[int | None] = mapped_column(Integer, default=0) + # Relationships project = relationship("Project") task = relationship("Task") diff --git a/geoapi/routes/file_location_status.py b/geoapi/routes/file_location_status.py index c865a1ba..8870dc0e 100644 --- a/geoapi/routes/file_location_status.py +++ b/geoapi/routes/file_location_status.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone from pydantic import BaseModel, ConfigDict from litestar import Controller, get, post, Request from celery import uuid as celery_uuid @@ -19,6 +19,9 @@ class FileLocationStatusModel(BaseModel): started_at: datetime completed_at: datetime | None = None task: TaskModel | None = None + total_files: int | None = None + files_checked: int | None = None + files_failed: int | None = None class StartFileLocationStatusRefreshResponse(BaseModel): @@ -55,6 +58,7 @@ def start_public_status_refresh( current_system/current_path if they're found in the published location. 
""" from geoapi.services.file_location_status import FileLocationStatusService + from geoapi.models import TaskStatus user = request.user logger.info( @@ -71,23 +75,38 @@ def start_public_status_refresh( ) # Generate Celery task UUID - celery_task_uuid = celery_uuid() + celery_task_uuid = str(celery_uuid()) # Create the check record and Task in database - public_access_check = FileLocationStatusService.start_check( + file_location_check = FileLocationStatusService.start_check( db_session, project_id, celery_task_uuid=celery_task_uuid ) - # Start Celery task with the UUID we generated - check_and_update_file_locations.apply_async( - args=[user.id, project_id, public_access_check.id], - task_id=celery_task_uuid, - ) + # Try to start Celery task + try: + check_and_update_file_locations.apply_async( + args=[user.id, project_id], + task_id=celery_task_uuid, + ) + except Exception as e: + # Mark task as failed if we couldn't queue it + logger.error( + f"Failed to queue file location check task for project:{project_id}: {e}" + ) + if file_location_check.task: + file_location_check.task.status = TaskStatus.FAILED + file_location_check.task.latest_message = ( + f"Failed to queue task: {str(e)}" + ) + file_location_check.completed_at = datetime.now(timezone.utc) + db_session.commit() + + raise # Re-raise to return 500 to client return StartFileLocationStatusRefreshResponse( message="Public status refresh started", - public_status_id=public_access_check.id, - task_id=public_access_check.task_id, # Return the database task.id + public_status_id=file_location_check.id, + task_id=file_location_check.task_id, ) @get( @@ -98,7 +117,7 @@ def start_public_status_refresh( guards=[project_permissions_guard], ) def get_files_status( - self, request: Request, db_session: "Session", project_id: int + self, request: Request, db_session: Session, project_id: int ) -> ProjectFileLocationStatusSummary: """ Get detailed status of which files are public vs private. @@ -115,9 +134,7 @@ def get_files_status( # Get all feature assets # NOTE: Only includes Features with FeatureAssets. Features without assets # (geometry-only, like manually drawn shapes) are excluded from this list. - # TODO WG-600: Handle "source file" FeatureAssets (shapefiles, geojson) that represent - # the file that created the feature. These show up in results if they - # have original_system/path set, but may need different UI treatment. + # See WG-600. 
Also consider tile layers (internal cogs) feature_assets = ( db_session.query(FeatureAsset) .join(Feature, Feature.id == FeatureAsset.feature_id) @@ -135,12 +152,23 @@ def get_files_status( FeatureAssetModel.model_validate(asset) for asset in feature_assets ] + # Convert check with nested task validation + check_data = None + if public_access_check: + check_data = FileLocationStatusModel( + id=public_access_check.id, + project_id=public_access_check.project_id, + started_at=public_access_check.started_at, + completed_at=public_access_check.completed_at, + task=( + TaskModel.model_validate(public_access_check.task) + if public_access_check.task + else None + ), + ) + return ProjectFileLocationStatusSummary( project_id=project_id, - check=( - FileLocationStatusModel.model_validate(public_access_check) - if public_access_check - else None - ), + check=check_data, files=files_status, ) diff --git a/geoapi/schema/projects.py b/geoapi/schema/projects.py index 70b13d31..ada0c20a 100644 --- a/geoapi/schema/projects.py +++ b/geoapi/schema/projects.py @@ -27,7 +27,7 @@ class FeatureAssetModel(BaseModel): display_path: str | None = None current_system: str | None = None current_path: str | None = None - last_public_check: datetime | None = None + last_public_system_check: datetime | None = None is_on_public_system: bool | None = None @@ -130,6 +130,8 @@ class TaskDTO(SQLAlchemyDTO[Task]): class TaskModel(BaseModel): + model_config = ConfigDict(from_attributes=True) + id: int | None = None status: str | None = None description: str | None = None diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py new file mode 100644 index 00000000..bf2edd24 --- /dev/null +++ b/geoapi/tasks/file_location_check.py @@ -0,0 +1,389 @@ +""" +Task to check if files from private DesignSafe projects have been published +and update their current_system/current_path accordingly. +""" + +from datetime import datetime, timezone +from typing import Dict +import os + +from sqlalchemy import and_ + +from geoapi.celery_app import app +from geoapi.db import create_task_session +from geoapi.models import Project, Feature, User, TaskStatus +from geoapi.models.feature import FeatureAsset +from geoapi.services.file_location_status import FileLocationStatusService +from geoapi.utils.external_apis import TapisUtils, get_session, TapisListingError +from geoapi.log import logger +from geoapi.tasks.utils import update_task_and_send_progress_update +from geoapi.settings import settings + + +DESIGNSAFE_PUBLISHED_SYSTEM = "designsafe.storage.published" +PUBLIC_SYSTEMS = [ + DESIGNSAFE_PUBLISHED_SYSTEM, + "designsafe.storage.community", +] + +BATCH_SIZE = 100 # Commit every 100 files + + +def extract_project_uuid(system_id: str) -> str | None: + if system_id.startswith("project-"): + return system_id.removeprefix("project-") + return None + + +def is_designsafe_project(system_id: str) -> bool: + return system_id.startswith("project-") + + +def build_file_index_from_tapis( + client, system_id: str, path: str = "/" +) -> dict[str, list[str]]: + """ + Recursively walk Tapis file system and build index. 
+ Returns dict where: + - key = filename only + - value = list of paths (without system prefix) + """ + file_index = {} + + try: + listing = client.listing(system_id, path) + except TapisListingError: + logger.exception(f"Unable to list {system_id}/{path}") + return file_index + + for item in listing: + if item.type == "dir" and not str(item.path).endswith(".Trash"): + # Recursively get files from subdirectory + sub_index = build_file_index_from_tapis(client, system_id, item.path) + # Merge subdirectory results + for filename, paths in sub_index.items(): + if filename not in file_index: + file_index[filename] = [] + file_index[filename].extend(paths) + else: + # It's a file - add to index (path only, no system) + filename = os.path.basename(str(item.path)) + file_path = str(item.path).lstrip("/") # Just the path + + if filename not in file_index: + file_index[filename] = [] + file_index[filename].append(file_path) + + return file_index + + +def get_project_id(user, system_id): + """Get project id (PRJ-123) for system attached to system_id""" + designsafe_uuid = extract_project_uuid(system_id) + + client = get_session(user) + response = client.get( + settings.DESIGNSAFE_URL + f"/api/projects/v2/{designsafe_uuid}/" + ) + response.raise_for_status() + return response.json()["baseProject"]["value"]["projectId"] + + +def get_file_tree_for_published_project( + session, user, system_id +) -> dict[str, list[str]] | None: + """Get file tree for published ds project given a geoapi map project + + If `system_id` is not a DS project's system (i.e. My Data) then return None + + """ + + if is_designsafe_project(system_id) is False: + # it is not a DS project + logger.info(f"Issue with system:{system_id}. Not a typical project") + return None + + designsafe_prj = get_project_id(user, system_id) + + tapis_client = TapisUtils(session, user) + + # Build file index starting from root of project + file_index = build_file_index_from_tapis( + tapis_client, DESIGNSAFE_PUBLISHED_SYSTEM, f"/published-data/{designsafe_prj}/" + ) + + logger.info( + f"Built file tree for {system_id}: " + f"{len(file_index)} unique filenames" + ) + + return file_index + + +def determine_if_published( + file_tree: Dict | None, asset: FeatureAsset +) -> tuple[bool, str | None, str | None]: + """ + Check if asset's file exists in the published file tree. + + Returns: + tuple of (is_published, system, path) + """ + if file_tree is None or not asset.current_path: + return (False, None, None) + + filename = os.path.basename(asset.current_path) + + if filename in file_tree: + published_paths = file_tree[filename] + + if len(published_paths) > 1: + logger.info( + f"Multiple matches found for asset {asset.id} file '{filename}': " + f"{published_paths}" + ) + + # Use first match - path only (no system) + path = "/" + published_paths[0].lstrip("/") + + logger.debug( + f"Asset {asset.id} found in published system: " + f"{DESIGNSAFE_PUBLISHED_SYSTEM}{path}" + ) + + return (True, DESIGNSAFE_PUBLISHED_SYSTEM, path) + + return (False, None, None) + + +@app.task(queue="default") +def check_and_update_file_locations(user_id: int, project_id: int): + """ + Check all feature assets in a project and update where they are located (i.e. 
new published
+    location) and then check if they are located on a Tapis system that is publicly accessible
+
+    Updates:
+    - FeatureAsset.current_system and current_path if file found in public system
+    - FeatureAsset.is_on_public_system based on current_system
+    - FeatureAsset.last_public_system_check timestamp
+    - FileLocationCheck record with new info
+    """
+    logger.info(f"Starting file location check for project:{project_id} user:{user_id}")
+
+    with create_task_session() as session:
+        user = None
+        file_location_check = None
+        failed_assets = []
+
+        try:
+            user = session.get(User, user_id)
+            project = session.get(Project, project_id)
+            file_location_check = FileLocationStatusService.get(
+                session, project_id=project_id
+            )
+
+            if not user or not project or not file_location_check:
+                logger.error(
+                    f"Missing required entities: user={user_id}, project={project_id}"
+                )
+                return
+
+            # Get all feature assets for this project
+            # NOTE: This only includes Features that have FeatureAssets (photos, videos, etc.)
+            # Features without assets (geometry-only) are excluded - they have no files to check.
+            # TODO WG-600: Consider how to handle "source" FeatureAssets (shapefiles, etc.)
+            #              that represent the file that created the feature rather than
+            #              assets belonging to the feature. Currently these are checked if they
+            #              have original_system/original_path, but may need special handling.
+            feature_assets = (
+                session.query(FeatureAsset)
+                .join(Feature, Feature.id == FeatureAsset.feature_id)
+                .filter(
+                    and_(
+                        Feature.project_id == project_id,
+                        FeatureAsset.original_path.isnot(None),
+                    )
+                )
+                .all()
+            )
+
+            total_checked = len(feature_assets)
+
+            # Set total files count
+            file_location_check.total_files = total_checked
+            session.commit()
+
+            logger.info(
+                f"Starting check for project {project_id}: {total_checked} assets to check"
+            )
+
+            update_task_and_send_progress_update(
+                session,
+                user,
+                task_id=file_location_check.task.id,
+                latest_message=f"Checking {total_checked} files for public availability",
+            )
+
+            # Calculate the file tree of the DS project associated with this map project and place it in a cache
+            # that will hold it and, possibly, file trees from other systems/projects
+            # (we assume most files will be from here, but they could be from other DS projects as well)
+            file_tree_cache = {}
+            if project.system_id:
+                file_tree_cache[project.system_id] = (
+                    get_file_tree_for_published_project(
+                        session, user, project.system_id
+                    )
+                )
+
+            # Process each asset
+            for i, asset in enumerate(feature_assets):
+                try:
+                    # Update timestamp
+                    asset.last_public_system_check = datetime.now(timezone.utc)
+
+                    # Backfill current_system/current_path for legacy data
+                    if not asset.current_system:
+                        asset.current_system = asset.original_system
+                    if not asset.current_path:
+                        asset.current_path = asset.original_path
+
+                    # Skip if already on a public system
+                    if asset.current_system in PUBLIC_SYSTEMS:
+                        asset.is_on_public_system = True
+                        session.add(asset)
+                    else:
+                        if not asset.current_system:
+                            # some legacy data is missing original_system
+                            # so let's assume that if this map project is connected to a DS project
+                            # then that data comes from there. We don't update original_system,
+                            # though, so it still reflects what we knew at that time.
+                            asset.current_system = project.system_id
+
+                        # TODO We should use the https://www.designsafe-ci.org/api/publications/v2/PRJ-1234 endpoint to look
+                        # at files (e.g. fileObjs) but it does not appear complete and is missing a previous-path attribute
+
+                        # Find the published project from original_system
+
+                        # If original_path is missing, we can derive it from sources like the point cloud
+
+                        if asset.current_system not in file_tree_cache:
+                            # First time seeing this system - fetch and cache it
+                            logger.info(f"Discovering new system {asset.current_system}, building file tree")
+                            file_tree_cache[asset.current_system] = (
+                                get_file_tree_for_published_project(
+                                    session, user, asset.current_system
+                                )
+                            )
+
+                        file_tree = file_tree_cache.get(asset.current_system)
+
+                        # Check if file exists in published tree
+                        is_published, new_system, new_path = determine_if_published(
+                            file_tree, asset
+                        )
+
+                        asset.is_on_public_system = is_published
+
+                        if is_published and new_system and new_path:
+                            asset.current_system = new_system
+                            asset.current_path = new_path
+
+                    session.add(asset)
+
+                    # Commit and clear cache in batches
+                    if (i + 1) % BATCH_SIZE == 0:
+                        session.commit()
+                        session.expire_all()
+
+                        # Update counts
+                        file_location_check.files_checked = i + 1 - len(failed_assets)
+                        file_location_check.files_failed = len(failed_assets)
+                        session.commit()
+
+                        logger.info(
+                            f"Batch: {i + 1}/{total_checked} processed, {len(failed_assets)} errors"
+                        )
+
+                        update_task_and_send_progress_update(
+                            session,
+                            user,
+                            task_id=file_location_check.task.id,
+                            latest_message=f"Processed {i + 1}/{total_checked} files ({len(failed_assets)} errors)",
+                        )
+
+                except Exception as e:
+                    error_msg = str(e)[:100]
+                    logger.exception(
+                        f"Error checking asset {asset.id} ({asset.original_path}): {e}"
+                    )
+
+                    failed_assets.append(
+                        {
+                            "asset_id": asset.id,
+                            "path": asset.original_path or "unknown",
+                            "error": error_msg,
+                        }
+                    )
+
+                    session.rollback()
+                    continue
+
+            # Final commit for remaining items
+            session.commit()
+
+            # Update final counts
+            file_location_check.completed_at = datetime.now(timezone.utc)
+            file_location_check.files_checked = total_checked - len(failed_assets)
+            file_location_check.files_failed = len(failed_assets)
+            session.add(file_location_check)
+            session.commit()
+
+            if failed_assets:
+                logger.warning(
+                    f"File location check completed with {len(failed_assets)} failures for project {project_id}. 
" + f"Failed assets: {failed_assets}" + ) + + if failed_assets: + final_message = (f"Checked {total_checked} files: {file_location_check.files_checked} successful," + f" {len(failed_assets)} failed") + else: + final_message = f"Successfully checked all {total_checked} files" + + logger.info(f"Check completed for project {project_id}: {final_message}") + + update_task_and_send_progress_update( + session, + user, + task_id=file_location_check.task.id, + status=TaskStatus.COMPLETED, + latest_message=final_message, + ) + + except Exception as e: + logger.exception(f"Check failed for project {project_id}: {e}") + + # Mark task and check as FAILED + try: + if file_location_check: + file_location_check.completed_at = datetime.now(timezone.utc) + # Keep existing counts if we got that far + session.add(file_location_check) + + if file_location_check and file_location_check.task and user: + update_task_and_send_progress_update( + session, + user, + task_id=file_location_check.task.id, + status=TaskStatus.FAILED, + latest_message=f"Check failed: {str(e)[:200]}", + ) + + session.commit() + except Exception as cleanup_error: + logger.exception(f"Failed to mark task as failed: {cleanup_error}") + session.rollback() + + # Re-raise to mark Celery task as failed as we can't even mark our internal + # task as failed + raise diff --git a/geoapi/tests/conftest.py b/geoapi/tests/conftest.py index bf5fa3ba..307a22f1 100644 --- a/geoapi/tests/conftest.py +++ b/geoapi/tests/conftest.py @@ -508,11 +508,13 @@ def feature_fixture( @pytest.fixture(scope="function") def image_feature_fixture( - image_file_fixture, db_session: "sqlalchemy_config.Session" + image_file_fixture, + db_session: "sqlalchemy_config.Session", + projects_fixture, ) -> Feature: yield FeaturesService.fromImage( db_session, - 1, + projects_fixture.id, image_file_fixture, metadata={}, original_system="system", diff --git a/geoapi/tests/tasks_tests/test_file_location_check.py b/geoapi/tests/tasks_tests/test_file_location_check.py new file mode 100644 index 00000000..d6fb3971 --- /dev/null +++ b/geoapi/tests/tasks_tests/test_file_location_check.py @@ -0,0 +1,302 @@ +import pytest +from unittest.mock import MagicMock, patch + +from geoapi.tasks.file_location_check import ( + check_and_update_file_locations, + BATCH_SIZE, +) +from geoapi.models import TaskStatus +from geoapi.models.feature import FeatureAsset + + +@pytest.fixture +def mock_update_task(): + """Mock the update_task_and_send_progress_update function""" + with patch( + "geoapi.tasks.file_location_check.update_task_and_send_progress_update" + ) as mock: + yield mock + + +@pytest.fixture +def mock_tapis(): + """Mock TapisUtils client""" + with patch("geoapi.tasks.file_location_check.TapisUtils") as mock: + mock_client = MagicMock() + mock.return_value = mock_client + yield mock + + +@pytest.fixture +def file_location_check(db_session, projects_fixture): + """Create a FileLocationCheck for testing""" + from geoapi.services.file_location_status import FileLocationStatusService + + check = FileLocationStatusService.start_check( + db_session, projects_fixture.id, "test-uuid" + ) + db_session.commit() + return check + + +@pytest.fixture +def asset_always_on_public_system(db_session, image_feature_fixture): + """Create an asset already on a public system""" + asset = image_feature_fixture.assets[0] + asset.original_system = "designsafe.storage.published" + asset.original_path = ( + "/published-data/PRJ-1234/Project--foo-bar-baz/data/FooBarBaz/test.jpg" + ) + asset.current_system = 
"designsafe.storage.published" + asset.current_path = ( + "/published-data/PRJ-1234/Project--foo-bar-baz/data/FooBarBaz/test.jpg" + ) + asset.last_public_system_check = None + db_session.commit() + return asset + + +@pytest.fixture +def asset_now_on_public_system(db_session, image_feature_fixture): + """Create an asset now on a public system""" + asset = image_feature_fixture.assets[0] + asset.original_system = "project-8932311246253724141-242ac11a-0001-012" + asset.original_path = "/foo/test.jpg" + asset.current_system = "designsafe.storage.published" + asset.current_path = "/projects/PRJ-2344/test.jpg" + asset.last_public_system_check = None + db_session.commit() + return asset + + +@pytest.fixture +def multiple_assets(db_session, feature_fixture, projects_fixture, num_assets=3): + """Create multiple test assets""" + assets = [] + for i in range(num_assets): + asset = FeatureAsset( + feature_id=feature_fixture.id, + path=f"/test/asset_{i}.jpg", + asset_type="image", + original_path=f"/test/asset_{i}.jpg", + original_system="designsafe.storage.published", + ) + db_session.add(asset) + assets.append(asset) + + db_session.commit() + return assets + + +def test_check_already_on_public_system( + db_session, + projects_fixture, + user1, + file_location_check, + asset_always_on_public_system, + mock_tapis, + mock_update_task, +): + """Test that files already on public systems are marked correctly""" + # Run task + check_and_update_file_locations(user1.id, projects_fixture.id) + + # Verify: Asset was updated + db_session.refresh(asset_always_on_public_system) + assert asset_always_on_public_system.is_on_public_system is True + assert asset_always_on_public_system.last_public_system_check is not None + + # Verify: Check was marked complete with counts + db_session.refresh(file_location_check) + assert file_location_check.completed_at is not None + assert file_location_check.total_files == 1 + assert file_location_check.files_checked == 1 + assert file_location_check.files_failed == 0 + + +def test_missing_entities_returns_early(mock_tapis, mock_update_task): + """Test that task returns early if user/project/check not found""" + # Run with non-existent IDs + check_and_update_file_locations(99999, 99999) + + # Verify: TapisUtils was never instantiated + mock_tapis.assert_not_called() + + +def test_batch_commits( + db_session, + projects_fixture, + feature_fixture, + user1, + file_location_check, + mock_tapis, + mock_update_task, +): + """Test that batches commit correctly""" + # Setup: Create more assets than BATCH_SIZE + num_assets = BATCH_SIZE + 50 + for i in range(num_assets): + asset = FeatureAsset( + feature_id=feature_fixture.id, + path=f"/test/asset_{i}.jpg", + asset_type="image", + original_path=f"/test/asset_{i}.jpg", + original_system="designsafe.storage.published", + ) + db_session.add(asset) + db_session.commit() + + # Run task + check_and_update_file_locations(user1.id, projects_fixture.id) + + # Verify: All assets were checked + assets = ( + db_session.query(FeatureAsset) + .filter(FeatureAsset.feature_id == feature_fixture.id) + .all() + ) + assert len(assets) == num_assets + + for asset in assets: + assert asset.last_public_system_check is not None + + # Verify: Counts are correct + db_session.refresh(file_location_check) + assert file_location_check.total_files == num_assets + assert file_location_check.files_checked == num_assets + assert file_location_check.files_failed == 0 + + # Verify: Progress updates were called for batches + progress_calls = [ + call + for call in 
mock_update_task.call_args_list + if "Processed" in call.kwargs.get("latest_message", "") + ] + assert len(progress_calls) >= 1 + + +def test_individual_asset_error_continues( + db_session, + projects_fixture, + user1, + file_location_check, + multiple_assets, + mock_tapis, + mock_update_task, +): + """Test that errors on individual assets don't stop entire task""" + # Run task + check_and_update_file_locations(user1.id, projects_fixture.id) + + # Verify: Task completed successfully + db_session.refresh(file_location_check) + assert file_location_check.completed_at is not None + + # Verify: Counts show all files processed (even if some failed) + assert file_location_check.total_files == 3 + assert file_location_check.files_checked == 3 + assert file_location_check.files_failed == 0 + + # Verify: Final status is COMPLETED + completed_calls = [ + call + for call in mock_update_task.call_args_list + if call.kwargs.get("status") == TaskStatus.COMPLETED + ] + assert len(completed_calls) >= 1 + + +def test_updates_last_public_system_check_timestamp( + db_session, + projects_fixture, + user1, + file_location_check, + asset_always_on_public_system, + mock_tapis, + mock_update_task, +): + """Test that last_public_system_check timestamp is always updated""" + # Record initial timestamp + initial_check = asset_always_on_public_system.last_public_system_check + + # Run task + check_and_update_file_locations(user1.id, projects_fixture.id) + + # Verify: Timestamp was updated + db_session.refresh(asset_always_on_public_system) + assert asset_always_on_public_system.last_public_system_check is not None + assert asset_always_on_public_system.last_public_system_check != initial_check + + # Verify: Counts are correct + db_session.refresh(file_location_check) + assert file_location_check.total_files == 1 + assert file_location_check.files_checked == 1 + assert file_location_check.files_failed == 0 + + +@pytest.mark.parametrize( + "system,expected_public", + [ + ("designsafe.storage.published", True), + ("designsafe.storage.community", True), + ("designsafe.storage.default", False), + ("some.other.system", False), + ], +) +def test_public_system_detection( + db_session, + projects_fixture, + image_feature_fixture, + user1, + file_location_check, + mock_tapis, + mock_update_task, + system, + expected_public, +): + """Test that public systems are correctly identified""" + # Setup: Asset with specific system + asset = image_feature_fixture.assets[0] + asset.original_system = system + asset.original_path = "/test/path.jpg" + asset.current_system = system + asset.current_path = "/test/path.jpg" + asset.last_public_system_check = None + db_session.commit() + + # Run task + check_and_update_file_locations(user1.id, projects_fixture.id) + + # Verify: Public status matches expectation + db_session.refresh(asset) + assert asset.is_on_public_system == expected_public + + # Verify: Counts are correct + db_session.refresh(file_location_check) + assert file_location_check.total_files == 1 + assert file_location_check.files_checked == 1 + assert file_location_check.files_failed == 0 + + +def test_counts_initialized( + db_session, + projects_fixture, + user1, + file_location_check, + asset_always_on_public_system, + mock_tapis, + mock_update_task, +): + """Test that counts are initialized at the start of task""" + # Verify: Initial state has no counts + assert ( + file_location_check.total_files is None or file_location_check.total_files == 0 + ) + + # Run task + check_and_update_file_locations(user1.id, projects_fixture.id) 
+ + # Verify: Counts were set + db_session.refresh(file_location_check) + assert file_location_check.total_files is not None + assert file_location_check.total_files > 0 From 61d9ccacd02c149a9e6e3a1a0b520258d0a7ec9c Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Thu, 6 Nov 2025 08:44:04 -0600 Subject: [PATCH 12/28] Fix linting --- geoapi/tasks/file_location_check.py | 13 +- .../tasks_tests/test_file_location_check.py | 237 +++++++++++++++++- 2 files changed, 239 insertions(+), 11 deletions(-) diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py index bf2edd24..3470e660 100644 --- a/geoapi/tasks/file_location_check.py +++ b/geoapi/tasks/file_location_check.py @@ -113,8 +113,7 @@ def get_file_tree_for_published_project( ) logger.info( - f"Built file tree for {system_id}: " - f"{len(file_index)} unique filenames" + f"Built file tree for {system_id}: " f"{len(file_index)} unique filenames" ) return file_index @@ -268,7 +267,9 @@ def check_and_update_file_locations(user_id: int, project_id: int): if asset.current_system not in file_tree_cache: # First time seeing this system - fetch and cache it - logger.info(f"Discovering new system {asset.current_system}, building file tree") + logger.info( + f"Discovering new system {asset.current_system}, building file tree" + ) file_tree_cache[asset.current_system] = ( get_file_tree_for_published_project( session, user, asset.current_system @@ -345,8 +346,10 @@ def check_and_update_file_locations(user_id: int, project_id: int): ) if failed_assets: - final_message = (f"Checked {total_checked} files: {file_location_check.files_checked} successful," - f" {len(failed_assets)} failed") + final_message = ( + f"Checked {total_checked} files: {file_location_check.files_checked} successful," + f" {len(failed_assets)} failed" + ) else: final_message = f"Successfully checked all {total_checked} files" diff --git a/geoapi/tests/tasks_tests/test_file_location_check.py b/geoapi/tests/tasks_tests/test_file_location_check.py index d6fb3971..9c576ea4 100644 --- a/geoapi/tests/tasks_tests/test_file_location_check.py +++ b/geoapi/tests/tasks_tests/test_file_location_check.py @@ -3,7 +3,12 @@ from geoapi.tasks.file_location_check import ( check_and_update_file_locations, + extract_project_uuid, + is_designsafe_project, + determine_if_published, + build_file_index_from_tapis, BATCH_SIZE, + DESIGNSAFE_PUBLISHED_SYSTEM, ) from geoapi.models import TaskStatus from geoapi.models.feature import FeatureAsset @@ -88,6 +93,147 @@ def multiple_assets(db_session, feature_fixture, projects_fixture, num_assets=3) return assets +@pytest.fixture +def asset_found_in_published(db_session, image_feature_fixture): + """Asset that will be found in published system""" + asset = image_feature_fixture.assets[0] + asset.original_system = "project-123456789" + asset.original_path = "/project/data/found.jpg" + asset.current_system = "project-123456789" + asset.current_path = "/project/data/found.jpg" + asset.is_on_public_system = False + asset.last_public_system_check = None + db_session.commit() + return asset + + +@pytest.fixture +def asset_not_found_in_published(db_session, image_feature_fixture): + """Asset that won't be found in published system""" + asset = image_feature_fixture.assets[0] + asset.original_system = "project-123456789" + asset.original_path = "/project/data/missing.jpg" + asset.current_system = "project-123456789" + asset.current_path = "/project/data/missing.jpg" + asset.is_on_public_system = False + asset.last_public_system_check = None + 
db_session.commit() + return asset + + +@pytest.mark.parametrize( + "system_id,expected_uuid", + [ + ("project-123456789", "123456789"), + ("designsafe.storage.default", None), + ("", None), + ], +) +def test_extract_project_uuid(system_id, expected_uuid): + """Test extracting UUID from project system IDs""" + assert extract_project_uuid(system_id) == expected_uuid + + +@pytest.mark.parametrize( + "system_id,is_project", + [ + ("project-123456789", True), + ("designsafe.storage.default", False), + ("", False), + ], +) +def test_is_designsafe_project(system_id, is_project): + """Test identifying DesignSafe project systems""" + assert is_designsafe_project(system_id) == is_project + + +def test_build_file_index_from_tapis(): + """Test building file index from Tapis listing""" + # Mock Tapis client and listing + mock_client = MagicMock() + + # Create mock file items + file1 = MagicMock(type="file", path="/dir1/file1.jpg") + file2 = MagicMock(type="file", path="/dir1/file2.jpg") + file3 = MagicMock(type="file", path="/dir2/file1.jpg") # Duplicate filename + dir1 = MagicMock(type="dir", path="/dir1") + + mock_client.listing.side_effect = [ + [file1, file2, dir1], # Root listing + [file3], # /dir1 listing + ] + + # Build index + index = build_file_index_from_tapis(mock_client, "test.system", "/") + + # Verify structure + assert "file1.jpg" in index + assert "file2.jpg" in index + assert len(index["file1.jpg"]) == 2 # Found in two places + assert len(index["file2.jpg"]) == 1 + assert "dir1/file1.jpg" in index["file1.jpg"] + assert "dir1/file2.jpg" in index["file2.jpg"] + + +def test_determine_if_published_found(): + """Test determining if asset is published when file exists""" + # Setup file tree + file_tree = { + "test.jpg": ["/published-data/PRJ-1234/test.jpg"], + "data.csv": ["/published-data/PRJ-1234/data/data.csv"], + } + + # Create mock asset + asset = MagicMock(spec=FeatureAsset) + asset.id = 123 + asset.current_path = "/private/project/test.jpg" + + # Check if published + is_published, system, path = determine_if_published(file_tree, asset) + + # Verify + assert is_published is True + assert system == DESIGNSAFE_PUBLISHED_SYSTEM + assert path == "/published-data/PRJ-1234/test.jpg" + + +def test_determine_if_published_not_found(): + """Test determining if asset is published when file doesn't exist""" + file_tree = { + "other.jpg": ["/published-data/PRJ-1234/other.jpg"], + } + + asset = MagicMock(spec=FeatureAsset) + asset.id = 123 + asset.current_path = "/private/project/missing.jpg" + + is_published, system, path = determine_if_published(file_tree, asset) + + assert is_published is False + assert system is None + assert path is None + + +def test_determine_if_published_multiple_matches(): + """Test that multiple matches are handled (uses first, logs warning)""" + file_tree = { + "test.jpg": [ + "/published-data/PRJ-1234/test.jpg", + "/published-data/PRJ-5678/test.jpg", + ], + } + + asset = MagicMock(spec=FeatureAsset) + asset.id = 123 + asset.current_path = "/private/project/test.jpg" + + is_published, system, path = determine_if_published(file_tree, asset) + + # Should use first match + assert is_published is True + assert path == "/published-data/PRJ-1234/test.jpg" + + def test_check_already_on_public_system( db_session, projects_fixture, @@ -114,13 +260,92 @@ def test_check_already_on_public_system( assert file_location_check.files_failed == 0 -def test_missing_entities_returns_early(mock_tapis, mock_update_task): - """Test that task returns early if user/project/check not found""" - # 
Run with non-existent IDs - check_and_update_file_locations(99999, 99999) +@patch("geoapi.tasks.file_location_check.get_session") +@patch("geoapi.tasks.file_location_check.TapisUtils") +def test_file_found_in_published_system( + mock_tapis_utils, + mock_get_session, + db_session, + projects_fixture, + user1, + file_location_check, + asset_found_in_published, + mock_update_task, +): + """Test that file found in published system updates correctly""" + # Setup: Mock project ID response + mock_session = MagicMock() + mock_response = MagicMock() + mock_response.json.return_value = { + "baseProject": {"value": {"projectId": "PRJ-1234"}} + } + mock_session.get.return_value = mock_response + mock_get_session.return_value = mock_session + + # Setup: Mock Tapis listing with our file + mock_client = MagicMock() + found_file = MagicMock(type="file", path="/published-data/PRJ-1234/data/found.jpg") + mock_client.listing.return_value = [found_file] + mock_tapis_utils.return_value = mock_client + + # Setup: Project has a system_id + projects_fixture.system_id = "project-123456789" + db_session.commit() + + # Run task + from geoapi.tasks.file_location_check import check_and_update_file_locations + + check_and_update_file_locations(user1.id, projects_fixture.id) + + # Verify: Asset was updated to published location + db_session.refresh(asset_found_in_published) + assert asset_found_in_published.is_on_public_system is True + assert asset_found_in_published.current_system == DESIGNSAFE_PUBLISHED_SYSTEM + assert "/published-data/PRJ-1234" in asset_found_in_published.current_path + + +@patch("geoapi.tasks.file_location_check.get_session") +@patch("geoapi.tasks.file_location_check.TapisUtils") +def test_file_not_found_in_published_system( + mock_tapis_utils, + mock_get_session, + db_session, + projects_fixture, + user1, + file_location_check, + asset_not_found_in_published, + mock_update_task, +): + """Test that file not found in published system stays unpublished""" + # Setup: Mock project ID response + mock_session = MagicMock() + mock_response = MagicMock() + mock_response.json.return_value = { + "baseProject": {"value": {"projectId": "PRJ-1234"}} + } + mock_session.get.return_value = mock_response + mock_get_session.return_value = mock_session + + # Setup: Mock Tapis listing without our file + mock_client = MagicMock() + other_file = MagicMock(type="file", path="/published-data/PRJ-1234/data/other.jpg") + mock_client.listing.return_value = [other_file] + mock_tapis_utils.return_value = mock_client + + # Setup: Project has a system_id + projects_fixture.system_id = "project-123456789" + db_session.commit() + + # Run task + from geoapi.tasks.file_location_check import check_and_update_file_locations + + check_and_update_file_locations(user1.id, projects_fixture.id) - # Verify: TapisUtils was never instantiated - mock_tapis.assert_not_called() + # Verify: Asset stayed on original system + db_session.refresh(asset_not_found_in_published) + assert asset_not_found_in_published.is_on_public_system is False + assert asset_not_found_in_published.current_system == "project-123456789" + assert asset_not_found_in_published.current_path == "/project/data/missing.jpg" def test_batch_commits( From 25aa9fb19a7459e4efc9bd7c323d1de2719c7e29 Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Thu, 6 Nov 2025 09:19:12 -0600 Subject: [PATCH 13/28] Add migration --- ...1443-3eba8dc7e4a9_add_filelocationcheck.py | 111 ++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 
geoapi/migrations/versions/20251106_1443-3eba8dc7e4a9_add_filelocationcheck.py diff --git a/geoapi/migrations/versions/20251106_1443-3eba8dc7e4a9_add_filelocationcheck.py b/geoapi/migrations/versions/20251106_1443-3eba8dc7e4a9_add_filelocationcheck.py new file mode 100644 index 00000000..c5f6fa58 --- /dev/null +++ b/geoapi/migrations/versions/20251106_1443-3eba8dc7e4a9_add_filelocationcheck.py @@ -0,0 +1,111 @@ +"""add_FileLocationCheck + +Revision ID: 3eba8dc7e4a9 +Revises: f5f3cfac6089 +Create Date: 2025-11-06 14:43:16.639237 + +""" + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "3eba8dc7e4a9" +down_revision = "f5f3cfac6089" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "public_system_access_checks", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("project_id", sa.Integer(), nullable=False), + sa.Column("task_id", sa.Integer(), nullable=True), + sa.Column( + "started_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=True, + ), + sa.Column("completed_at", sa.DateTime(timezone=True), nullable=True), + sa.Column("total_files", sa.Integer(), nullable=True), + sa.Column("files_checked", sa.Integer(), nullable=True), + sa.Column("files_failed", sa.Integer(), nullable=True), + sa.ForeignKeyConstraint( + ["project_id"], + ["projects.id"], + name=op.f("fk_public_system_access_checks_project_id_projects"), + onupdate="CASCADE", + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["task_id"], + ["tasks.id"], + name=op.f("fk_public_system_access_checks_task_id_tasks"), + ondelete="SET NULL", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_public_system_access_checks")), + ) + op.create_index( + op.f("ix_public_system_access_checks_project_id"), + "public_system_access_checks", + ["project_id"], + unique=True, + ) + op.create_index( + op.f("ix_public_system_access_checks_task_id"), + "public_system_access_checks", + ["task_id"], + unique=False, + ) + op.add_column( + "feature_assets", sa.Column("current_path", sa.String(), nullable=True) + ) + op.add_column( + "feature_assets", sa.Column("current_system", sa.String(), nullable=True) + ) + op.add_column( + "feature_assets", sa.Column("is_on_public_system", sa.Boolean(), nullable=True) + ) + op.add_column( + "feature_assets", + sa.Column( + "last_public_system_check", sa.DateTime(timezone=True), nullable=True + ), + ) + op.create_index( + op.f("ix_feature_assets_current_path"), + "feature_assets", + ["current_path"], + unique=False, + ) + op.create_index( + op.f("ix_feature_assets_current_system"), + "feature_assets", + ["current_system"], + unique=False, + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f("ix_feature_assets_current_system"), table_name="feature_assets") + op.drop_index(op.f("ix_feature_assets_current_path"), table_name="feature_assets") + op.drop_column("feature_assets", "last_public_system_check") + op.drop_column("feature_assets", "is_on_public_system") + op.drop_column("feature_assets", "current_system") + op.drop_column("feature_assets", "current_path") + op.drop_index( + op.f("ix_public_system_access_checks_task_id"), + table_name="public_system_access_checks", + ) + op.drop_index( + op.f("ix_public_system_access_checks_project_id"), + table_name="public_system_access_checks", + ) + op.drop_table("public_system_access_checks") + # ### end Alembic commands ### From ab0a948585936768fafe279a926a4c986d0488c3 Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Thu, 6 Nov 2025 12:58:14 -0600 Subject: [PATCH 14/28] Fix when we update current_system --- geoapi/tasks/file_location_check.py | 39 +++++++++++++++++------------ 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py index 3470e660..282b745f 100644 --- a/geoapi/tasks/file_location_check.py +++ b/geoapi/tasks/file_location_check.py @@ -240,6 +240,9 @@ def check_and_update_file_locations(user_id: int, project_id: int): # Update timestamp asset.last_public_system_check = datetime.now(timezone.utc) + # TODO If missing, original_path we can derive from some like point cloud (by + # looking at point cloud matched with this feature and then looking at files_info. + # Backfill current_system/current_path for legacy data if not asset.current_system: asset.current_system = asset.original_system @@ -251,32 +254,36 @@ def check_and_update_file_locations(user_id: int, project_id: int): asset.is_on_public_system = True session.add(asset) else: - if not asset.current_system: - # some legacy data is missing original_system - # so let's assume that if this map project is connected to a project - # then that data comes from there. We don't update original_system - # though to reflect what we knew at that time. - asset.current_system = project.system_id + current_system = asset.current_system + if not current_system: + # some legacy data is missing original_system (and then current_system) + # We don't update original_system as to be accurate about what was + # recorded at the time of feature creation. + # + # But we can make assume that if the map project is connected to a + # DS project, then it is possible this is the system where the data + # resides and so we can search there for a match. 
+ # + # Note: we wait until we find the file in the published project before + # setting `asset.current_system` so this is why we are using a variable + # (`current_system`) instead immediately setting `asset.current_system`` + current_system = project.system_id # TODO We should use https://www.designsafe-ci.org/api/publications/v2/PRJ-1234 endpoint to look - # at files (e.g fileObjs) but does not appear complete and and missing a previous-path attribute - - # FIND published project from original_system - - # If missing, original_path we can derive from some like point cloud + # at files (e.g fileObjs) but currently does not appear complete and and missing a previous-path attribute - if asset.current_system not in file_tree_cache: + if current_system not in file_tree_cache: # First time seeing this system - fetch and cache it logger.info( - f"Discovering new system {asset.current_system}, building file tree" + f"Discovering new system {current_system}, building file tree" ) - file_tree_cache[asset.current_system] = ( + file_tree_cache[current_system] = ( get_file_tree_for_published_project( - session, user, asset.current_system + session, user, current_system ) ) - file_tree = file_tree_cache.get(asset.current_system) + file_tree = file_tree_cache.get(current_system) # Check if file exists in published tree is_published, new_system, new_path = determine_if_published( From 2d02d38c349eecb397fe7ae167ff524e6141fccd Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Thu, 6 Nov 2025 17:24:57 -0600 Subject: [PATCH 15/28] Change how we find system if missing --- geoapi/tasks/file_location_check.py | 38 +++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py index 282b745f..b1902bcd 100644 --- a/geoapi/tasks/file_location_check.py +++ b/geoapi/tasks/file_location_check.py @@ -119,6 +119,16 @@ def get_file_tree_for_published_project( return file_index +def file_exists(client, system_id: str, path: str) -> bool: + """Check if file exists by trying to list it directly""" + try: + listing = client.listing(system_id, path) + # If listing succeeds, file exists + return len(listing) > 0 + except TapisListingError: + return False + + def determine_if_published( file_tree: Dict | None, asset: FeatureAsset ) -> tuple[bool, str | None, str | None]: @@ -223,6 +233,8 @@ def check_and_update_file_locations(user_id: int, project_id: int): latest_message=f"Checking {total_checked} files for public availability", ) + tapis_client = TapisUtils(session, user) + # Calculate the file tree of DS project associated with this map project and place in a cache # that will hold it and then possibly from other systems/projects # (we assume most files will be from here, but they could be from other DS projects as well) @@ -254,36 +266,36 @@ def check_and_update_file_locations(user_id: int, project_id: int): asset.is_on_public_system = True session.add(asset) else: - current_system = asset.current_system - if not current_system: - # some legacy data is missing original_system (and then current_system) + if not asset.current_system: + # some legacy data is missing original_system (as so also current_system) # We don't update original_system as to be accurate about what was # recorded at the time of feature creation. # # But we can make assume that if the map project is connected to a # DS project, then it is possible this is the system where the data - # resides and so we can search there for a match. 
+
+                    # Note: we'll also check the published project again and might update it one more time
+                    if file_exists(
+                        tapis_client, project.system_id, asset.current_path
+                    ):
+                        asset.current_system = project.system_id
 
                 # TODO We should use https://www.designsafe-ci.org/api/publications/v2/PRJ-1234 endpoint to look
                 # at files (e.g fileObjs) but currently does not appear complete and and missing a previous-path attribute
-
-                if current_system not in file_tree_cache:
+                if asset.current_system not in file_tree_cache:
                     # First time seeing this system - fetch and cache it
                     logger.info(
-                        f"Discovering new system {current_system}, building file tree"
+                        f"Discovering new system {asset.current_system}, building file tree"
                     )
-                    file_tree_cache[current_system] = (
+                    file_tree_cache[asset.current_system] = (
                         get_file_tree_for_published_project(
-                            session, user, current_system
+                            session, user, asset.current_system
                         )
                     )
 
-                file_tree = file_tree_cache.get(current_system)
+                file_tree = file_tree_cache.get(asset.current_system)
 
                 # Check if file exists in published tree
                 is_published, new_system, new_path = determine_if_published(

From 057da92d242c77a9ca58916ef0ea8e4cfdc4cad0 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Fri, 7 Nov 2025 10:45:51 -0600
Subject: [PATCH 16/28] Fix catching legacy point cloud assets

---
 geoapi/tasks/file_location_check.py           | 240 ++++++++++++++----
 .../tasks_tests/test_file_location_check.py   |  25 +-
 2 files changed, 197 insertions(+), 68 deletions(-)

diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py
index b1902bcd..decf7b90 100644
--- a/geoapi/tasks/file_location_check.py
+++ b/geoapi/tasks/file_location_check.py
@@ -7,11 +7,11 @@ from typing import Dict
 import os
 
-from sqlalchemy import and_
+from sqlalchemy import and_, or_
 
 from geoapi.celery_app import app
 from geoapi.db import create_task_session
-from geoapi.models import Project, Feature, User, TaskStatus
+from geoapi.models import Project, Feature, User, TaskStatus, PointCloud
 from geoapi.models.feature import FeatureAsset
 from geoapi.services.file_location_status import FileLocationStatusService
 from geoapi.utils.external_apis import TapisUtils, get_session, TapisListingError
@@ -53,7 +53,10 @@ def build_file_index_from_tapis(
     try:
         listing = client.listing(system_id, path)
     except TapisListingError:
-        logger.exception(f"Unable to list {system_id}/{path}")
+        # TODO handle 404 better as we might just be missing published project
+        logger.exception(
+            f"Unable to list {system_id}/{path}. If 404, Project might not be published yet"
+        )
        return file_index
 
     for item in listing:
@@ -129,40 +132,137 @@ def file_exists(client, system_id: str, path: str) -> bool:
         return False
 
 
-def determine_if_published(
-    file_tree: Dict | None, asset: FeatureAsset
-) -> tuple[bool, str | None, str | None]:
+def determine_if_exists_in_tree(
+    file_tree: Dict | None, current_file_path: str
+) -> tuple[bool, str | None]:
     """
-    Check if asset's file exists in the published file tree.
+    Check if asset's file exists in the file tree. 
Returns: tuple of (is_published, system, path) """ - if file_tree is None or not asset.current_path: - return (False, None, None) + if file_tree is None or not current_file_path: + return (False, None) - filename = os.path.basename(asset.current_path) + filename = os.path.basename(current_file_path) if filename in file_tree: published_paths = file_tree[filename] if len(published_paths) > 1: logger.info( - f"Multiple matches found for asset {asset.id} file '{filename}': " + f"Multiple matches found for asset file '{current_file_path}': " f"{published_paths}" ) # Use first match - path only (no system) path = "/" + published_paths[0].lstrip("/") + logger.debug(f"Asset '{current_file_path}' found in system at: {path}") + + return (True, path) + + return (False, None) + + +def get_filename_from_point_cloud_asset(session, asset: FeatureAsset) -> str | None: + """ + Get the point cloud filename from a point cloud asset by querying the associated + PointCloud and extracting the first .laz file from files_info. + + Returns the name of the first .laz file found, or None if no point cloud + or .laz file exists. + + Note: If there are multiple .laz files, only the first one is returned. + """ + # Query for the PointCloud associated with this feature + point_cloud = ( + session.query(PointCloud).filter_by(feature_id=asset.feature_id).first() + ) + + # Return None if no point cloud exists or files_info is empty/None + if not point_cloud or not point_cloud.files_info: + return None + + # Look for the first .laz file in files_info + # Note: Taking the first .laz file found; multiple files are not handled + for file_info in point_cloud.files_info: + name = file_info.get("name", None) + return name + return None + + +def fix_and_backfill( + session, + tapis_client, + project: Project, + asset: FeatureAsset, + project_system: str, + project_system_file_tree: Dict, +): + """This method attempts to fix and backfill any information + + There are missing information that we can try to fix. These things have been fixed for + newly created features, but we are trying to correct them for older features. + They are: + * (A) missing original_system -- only original_path was stored (original_system, + current_system, current_path came later) but we + might find the file in the system associated with + the map + * (B) missing current_system, current_path -- can baac with original_path and original_system + * (C) original_path missing for point clouds -- but can be derived from the point_cloud model + * (E) features derived from geojson or shapefiles don't have file info; TODO in WG-600 + * (D) raster files (tile layers) missing current path and current system (TODO) + + This methods fixes these things except for (D) and (E) + """ + logger.debug(f"Checking asset={asset.id} to see if we can fix anything") + # (C) See if point cloud can be fixed (we do this first as results + # might be used by B or A steps) + if asset.original_path is None and asset.asset_type == "point_cloud": + file_name = get_filename_from_point_cloud_asset(session, asset) + logger.info( + f"Point cloud asset missing original_path. 
Will try to fix by looking for {file_name} in systems files" + ) + exists, found_path = determine_if_exists_in_tree( + project_system_file_tree, file_name + ) + if exists: + logger.info( + f"Found path for point cloud asset={asset.id} and fixing original_path to {found_path}" + ) + asset.original_path = found_path + asset.original_system = project_system + else: + logger.info("Did not find a matching path") + + # (A) some legacy data is missing original_system + if not asset.original_system: + # We can make assume that if the map project is connected to a + # DS project, then it is possible that is the system where the data resides. + # We check that system for the file and update original_system if there is a match. logger.debug( - f"Asset {asset.id} found in published system: " - f"{DESIGNSAFE_PUBLISHED_SYSTEM}{path}" + f"Missing original_system for asset={asset.id} so seeing if we see file on current DS project" ) + if file_exists(tapis_client, project.system_id, asset.original_path): + logger.debug( + f"Found file on current DS project so updating original_system to {project.system_id} " + ) + asset.original_system = project.system_id - return (True, DESIGNSAFE_PUBLISHED_SYSTEM, path) + # (B) Backfill current_system/current_path for legacy data + if not asset.current_system and asset.original_system: + logger.debug( + f"Updated current_system for asset={asset.id} to {asset.original_system}" + ) + asset.current_system = asset.original_system + if not asset.current_path and asset.original_path: + logger.debug( + f"Updated current_path for asset={asset.id} to {asset.original_path} " + ) + asset.current_path = asset.original_path - return (False, None, None) + logger.debug(f"Completed checking asset={asset.id}") @app.task(queue="default") @@ -199,7 +299,10 @@ def check_and_update_file_locations(user_id: int, project_id: int): # Get all feature assets for this project # NOTE: This only includes Features that have FeatureAssets (photos, videos, etc.) - # Features without assets (geometry-only) are excluded - they have no files to check. + # + # * Features without assets (geometry-only) are excluded - they have no files to check. (WG-600) + # * Point cloud assets without original_path are included for backfilling from PointCloud.files_info + # # TODO WG-600: Consider how to handle "source" FeatureAssets (shapefiles, etc.) # that represent the file that created the feature rather than # assets belonging to the feature. 
Currently these are checked if they @@ -210,7 +313,10 @@ def check_and_update_file_locations(user_id: int, project_id: int): .filter( and_( Feature.project_id == project_id, - FeatureAsset.original_path.isnot(None), + or_( + FeatureAsset.original_path.isnot(None), + FeatureAsset.asset_type == "point_cloud", + ), ) ) .all() @@ -235,78 +341,110 @@ def check_and_update_file_locations(user_id: int, project_id: int): tapis_client = TapisUtils(session, user) - # Calculate the file tree of DS project associated with this map project and place in a cache + # Calculate the file tree of published DS project associated with this map project and place in a cache # that will hold it and then possibly from other systems/projects # (we assume most files will be from here, but they could be from other DS projects as well) - file_tree_cache = {} + published_file_tree_cache = {} if project.system_id: - file_tree_cache[project.system_id] = ( + published_file_tree_cache[project.system_id] = ( get_file_tree_for_published_project( session, user, project.system_id ) ) + # Check if any point clouds are missing original path + has_point_cloud_missing_original_path = ( + session.query(FeatureAsset) + .join(Feature, Feature.id == FeatureAsset.feature_id) + .filter( + and_( + Feature.project_id == project_id, + FeatureAsset.asset_type == "point_cloud", + FeatureAsset.original_path.is_(None), + ) + ) + .first() + is not None + ) + + unpublished_project_file_index = {} + if has_point_cloud_missing_original_path and is_designsafe_project( + project.system_id + ): + logger.info( + "Point cloud asset(s) are missing original_path so we gather file listing to attempt to fix that" + ) + # Create a listing for current project in case we need to + unpublished_project_file_index = build_file_index_from_tapis( + tapis_client, system_id=project.system_id, path="//" + ) + logger.info(f"Indexed {len(unpublished_project_file_index)} files") + # Process each asset for i, asset in enumerate(feature_assets): try: # Update timestamp asset.last_public_system_check = datetime.now(timezone.utc) - # TODO If missing, original_path we can derive from some like point cloud (by - # looking at point cloud matched with this feature and then looking at files_info. + logger.debug( + f"Processing asset={asset.id} asset_type={asset.asset_type}" + f" original_path={asset.original_path} original_system={asset.original_system}" + f" current_path={asset.current_path} current_system={asset.current_system}" + ) - # Backfill current_system/current_path for legacy data - if not asset.current_system: - asset.current_system = asset.original_system - if not asset.current_path: - asset.current_path = asset.original_path + # Fix attributes of any assets missing info as they were created in the past + fix_and_backfill( + session=session, + tapis_client=tapis_client, + project=project, + asset=asset, + project_system=project.system_id, + project_system_file_tree=unpublished_project_file_index, + ) # Skip if already on a public system if asset.current_system in PUBLIC_SYSTEMS: asset.is_on_public_system = True session.add(asset) else: - if not asset.current_system: - # some legacy data is missing original_system (as so also current_system) - # We don't update original_system as to be accurate about what was - # recorded at the time of feature creation. - # - # But we can make assume that if the map project is connected to a - # DS project, then it is possible this is the system where the data - # resides, and so we can check there for a match. 
- # - # Note: we'll also check the published project again and might update it one more time - if file_exists( - tapis_client, project.system_id, asset.current_path - ): - asset.current_system = project.system_id + if asset.current_system is None: + logger.warning( + f"We don't know the current system:" + f" asset={asset.id} asset_type={asset.asset_type}" + f" original_path={asset.original_path} original_system={asset.original_system}" + f" current_path={asset.current_path} current_system={asset.current_system} " + ) # TODO We should use https://www.designsafe-ci.org/api/publications/v2/PRJ-1234 endpoint to look # at files (e.g fileObjs) but currently does not appear complete and and missing a previous-path attribute - - if asset.current_system not in file_tree_cache: + if ( + asset.current_system + and asset.current_system not in published_file_tree_cache + ): # First time seeing this system - fetch and cache it logger.info( f"Discovering new system {asset.current_system}, building file tree" ) - file_tree_cache[asset.current_system] = ( + published_file_tree_cache[asset.current_system] = ( get_file_tree_for_published_project( session, user, asset.current_system ) ) - file_tree = file_tree_cache.get(asset.current_system) + published_project_file_tree = published_file_tree_cache.get( + asset.current_system, {} + ) # Check if file exists in published tree - is_published, new_system, new_path = determine_if_published( - file_tree, asset + exists, found_path = determine_if_exists_in_tree( + published_project_file_tree, asset.current_path ) - asset.is_on_public_system = is_published + asset.is_on_public_system = exists - if is_published and new_system and new_path: - asset.current_system = new_system - asset.current_path = new_path + if exists and found_path: + asset.current_system = DESIGNSAFE_PUBLISHED_SYSTEM + asset.current_path = found_path session.add(asset) diff --git a/geoapi/tests/tasks_tests/test_file_location_check.py b/geoapi/tests/tasks_tests/test_file_location_check.py index 9c576ea4..0ab4f370 100644 --- a/geoapi/tests/tasks_tests/test_file_location_check.py +++ b/geoapi/tests/tasks_tests/test_file_location_check.py @@ -5,7 +5,7 @@ check_and_update_file_locations, extract_project_uuid, is_designsafe_project, - determine_if_published, + determine_if_exists_in_tree, build_file_index_from_tapis, BATCH_SIZE, DESIGNSAFE_PUBLISHED_SYSTEM, @@ -183,17 +183,13 @@ def test_determine_if_published_found(): "data.csv": ["/published-data/PRJ-1234/data/data.csv"], } - # Create mock asset - asset = MagicMock(spec=FeatureAsset) - asset.id = 123 - asset.current_path = "/private/project/test.jpg" + current_path = "/private/project/test.jpg" # Check if published - is_published, system, path = determine_if_published(file_tree, asset) + exists, path = determine_if_exists_in_tree(file_tree, current_path) # Verify - assert is_published is True - assert system == DESIGNSAFE_PUBLISHED_SYSTEM + assert exists is True assert path == "/published-data/PRJ-1234/test.jpg" @@ -203,14 +199,11 @@ def test_determine_if_published_not_found(): "other.jpg": ["/published-data/PRJ-1234/other.jpg"], } - asset = MagicMock(spec=FeatureAsset) - asset.id = 123 - asset.current_path = "/private/project/missing.jpg" + current_path = "/private/project/missing.jpg" - is_published, system, path = determine_if_published(file_tree, asset) + is_published, path = determine_if_exists_in_tree(file_tree, current_path) assert is_published is False - assert system is None assert path is None @@ -223,11 +216,9 @@ def 
test_determine_if_published_multiple_matches(): ], } - asset = MagicMock(spec=FeatureAsset) - asset.id = 123 - asset.current_path = "/private/project/test.jpg" + current_path = "/private/project/test.jpg" - is_published, system, path = determine_if_published(file_tree, asset) + is_published, path = determine_if_exists_in_tree(file_tree, current_path) # Should use first match assert is_published is True From 883663afe82749245a2770d005cb4942c91f893a Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Mon, 10 Nov 2025 17:06:56 -0600 Subject: [PATCH 17/28] Refactor response and consider rasters --- ...88d94d3_add_tracking_info_to_tile_layer.py | 210 ++++++++++++ geoapi/models/feature.py | 19 +- geoapi/models/file_location_tracking_mixin.py | 51 +++ geoapi/models/tile_server.py | 18 +- geoapi/routes/file_location_status.py | 97 +++++- geoapi/schema/projects.py | 15 + geoapi/services/tile_server.py | 31 +- geoapi/tasks/file_location_check.py | 311 ++++++++++-------- geoapi/tasks/raster.py | 4 +- .../test_file_location_status_routes.py | 6 +- .../tasks_tests/test_file_location_check.py | 8 - 11 files changed, 573 insertions(+), 197 deletions(-) create mode 100644 geoapi/migrations/versions/20251110_0331-898a088d94d3_add_tracking_info_to_tile_layer.py create mode 100644 geoapi/models/file_location_tracking_mixin.py diff --git a/geoapi/migrations/versions/20251110_0331-898a088d94d3_add_tracking_info_to_tile_layer.py b/geoapi/migrations/versions/20251110_0331-898a088d94d3_add_tracking_info_to_tile_layer.py new file mode 100644 index 00000000..aa61e38c --- /dev/null +++ b/geoapi/migrations/versions/20251110_0331-898a088d94d3_add_tracking_info_to_tile_layer.py @@ -0,0 +1,210 @@ +"""add_tracking_info_to_tile_layer + +Revision ID: 898a088d94d3 +Revises: 3eba8dc7e4a9 +Create Date: 2025-11-10 03:31:14.410321 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "898a088d94d3" +down_revision = "3eba8dc7e4a9" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.alter_column( + "feature_assets", + "original_system", + existing_type=sa.VARCHAR(), + comment="Tapis system where the original file was sourced from", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "original_path", + existing_type=sa.VARCHAR(), + comment="Original file path on the source system", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "current_path", + existing_type=sa.VARCHAR(), + comment="Current file path (updated if file is published or moved)", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "current_system", + existing_type=sa.VARCHAR(), + comment="Current Tapis system (updated if file is published or moved)", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "is_on_public_system", + existing_type=sa.BOOLEAN(), + comment="Whether the current_system is publicly accessible", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "last_public_system_check", + existing_type=postgresql.TIMESTAMP(timezone=True), + comment="Timestamp of last public accessibility check", + existing_nullable=True, + ) + op.add_column( + "tile_servers", + sa.Column( + "current_path", + sa.String(), + nullable=True, + comment="Current file path (updated if file is published or moved)", + ), + ) + op.add_column( + "tile_servers", + sa.Column( + "current_system", + sa.String(), + nullable=True, + comment="Current Tapis system (updated if file is published or moved)", + ), + ) + op.add_column( + "tile_servers", + sa.Column( + "is_on_public_system", + sa.Boolean(), + nullable=True, + comment="Whether the current_system is publicly accessible", + ), + ) + op.add_column( + "tile_servers", + sa.Column( + "last_public_system_check", + sa.DateTime(timezone=True), + nullable=True, + comment="Timestamp of last public accessibility check", + ), + ) + op.alter_column( + "tile_servers", + "original_system", + existing_type=sa.VARCHAR(), + comment="Tapis system where the original file was sourced from", + existing_comment="Tapis system where the original file was sourced from (if applicable)", + existing_nullable=True, + ) + op.alter_column( + "tile_servers", + "original_path", + existing_type=sa.VARCHAR(), + comment="Original file path on the source system", + existing_comment="Original file path on the source system (if applicable)", + existing_nullable=True, + ) + op.create_index( + op.f("ix_tile_servers_current_path"), + "tile_servers", + ["current_path"], + unique=False, + ) + op.create_index( + op.f("ix_tile_servers_current_system"), + "tile_servers", + ["current_system"], + unique=False, + ) + op.create_index( + op.f("ix_tile_servers_original_path"), + "tile_servers", + ["original_path"], + unique=False, + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f("ix_tile_servers_original_path"), table_name="tile_servers") + op.drop_index(op.f("ix_tile_servers_current_system"), table_name="tile_servers") + op.drop_index(op.f("ix_tile_servers_current_path"), table_name="tile_servers") + op.alter_column( + "tile_servers", + "original_path", + existing_type=sa.VARCHAR(), + comment="Original file path on the source system (if applicable)", + existing_comment="Original file path on the source system", + existing_nullable=True, + ) + op.alter_column( + "tile_servers", + "original_system", + existing_type=sa.VARCHAR(), + comment="Tapis system where the original file was sourced from (if applicable)", + existing_comment="Tapis system where the original file was sourced from", + existing_nullable=True, + ) + op.drop_column("tile_servers", "last_public_system_check") + op.drop_column("tile_servers", "is_on_public_system") + op.drop_column("tile_servers", "current_system") + op.drop_column("tile_servers", "current_path") + op.alter_column( + "feature_assets", + "last_public_system_check", + existing_type=postgresql.TIMESTAMP(timezone=True), + comment=None, + existing_comment="Timestamp of last public accessibility check", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "is_on_public_system", + existing_type=sa.BOOLEAN(), + comment=None, + existing_comment="Whether the current_system is publicly accessible", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "current_system", + existing_type=sa.VARCHAR(), + comment=None, + existing_comment="Current Tapis system (updated if file is published or moved)", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "current_path", + existing_type=sa.VARCHAR(), + comment=None, + existing_comment="Current file path (updated if file is published or moved)", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "original_path", + existing_type=sa.VARCHAR(), + comment=None, + existing_comment="Original file path on the source system", + existing_nullable=True, + ) + op.alter_column( + "feature_assets", + "original_system", + existing_type=sa.VARCHAR(), + comment=None, + existing_comment="Tapis system where the original file was sourced from", + existing_nullable=True, + ) + # ### end Alembic commands ### diff --git a/geoapi/models/feature.py b/geoapi/models/feature.py index e21f2618..c128ce80 100644 --- a/geoapi/models/feature.py +++ b/geoapi/models/feature.py @@ -1,5 +1,5 @@ import uuid -from sqlalchemy import Integer, String, ForeignKey, Index, DateTime, Boolean +from sqlalchemy import Integer, String, ForeignKey, Index, DateTime import shapely from litestar.dto import dto_field from sqlalchemy.dialects.postgresql import JSONB @@ -9,6 +9,7 @@ from geoalchemy2 import Geometry from geoalchemy2.shape import from_shape, to_shape from geoapi.db import Base +from geoapi.models.file_location_tracking_mixin import FileLocationTrackingMixin from geoapi.utils import geometries @@ -55,7 +56,7 @@ def geometry(self) -> dict: return shapely.geometry.mapping(to_shape(self.the_geom)) -class FeatureAsset(Base): +class FeatureAsset(Base, FileLocationTrackingMixin): __tablename__ = "feature_assets" id = mapped_column(Integer, primary_key=True) feature_id = mapped_column( @@ -68,18 +69,10 @@ class FeatureAsset(Base): # Original source file location original_name = mapped_column(String(), nullable=True) - original_path = mapped_column(String(), nullable=True, index=True) - original_system = mapped_column(String(), nullable=True, index=True) - 
# Current location of the original source file (typically updated when task - # sees that it or a copy is in a publicly accessible location) - current_path = mapped_column(String(), nullable=True, index=True) - current_system = mapped_column(String(), nullable=True, index=True) - - # Is current_system a public-accessable system - is_on_public_system = mapped_column(Boolean(), nullable=True) - # Track when this asset was last checked for public availability - last_public_system_check = mapped_column(DateTime(timezone=True), nullable=True) + # Note: original_system, original_path, current_system, current_path, + # is_on_public_system, and last_public_system_check are inherited + # from FileLocationTrackingMixin asset_type = mapped_column(String(), nullable=False, default="image") feature = relationship("Feature", overlaps="assets") diff --git a/geoapi/models/file_location_tracking_mixin.py b/geoapi/models/file_location_tracking_mixin.py new file mode 100644 index 00000000..a7f6fc4b --- /dev/null +++ b/geoapi/models/file_location_tracking_mixin.py @@ -0,0 +1,51 @@ +from sqlalchemy import Boolean, String, DateTime +from sqlalchemy.orm import mapped_column + + +class FileLocationTrackingMixin: + """ + Mixin for tracking file locations (original and current) and public accessibility. + + Tracks: + - Original location: where the file was first sourced from + - Current location: where the file is now (may differ if published/moved) + - Public accessibility: whether current location is publicly accessible + - Last check timestamp: when accessibility was last verified + """ + + original_system = mapped_column( + String(), + nullable=True, + index=True, + comment="Tapis system where the original file was sourced from", + ) + original_path = mapped_column( + String(), + nullable=True, + index=True, # Adding index for TileServer too + comment="Original file path on the source system", + ) + + current_path = mapped_column( + String(), + nullable=True, + index=True, + comment="Current file path (updated if file is published or moved)", + ) + current_system = mapped_column( + String(), + nullable=True, + index=True, + comment="Current Tapis system (updated if file is published or moved)", + ) + + is_on_public_system = mapped_column( + Boolean(), + nullable=True, + comment="Whether the current_system is publicly accessible", + ) + last_public_system_check = mapped_column( + DateTime(timezone=True), + nullable=True, + comment="Timestamp of last public accessibility check", + ) diff --git a/geoapi/models/tile_server.py b/geoapi/models/tile_server.py index 3f6be7a4..7e84927c 100644 --- a/geoapi/models/tile_server.py +++ b/geoapi/models/tile_server.py @@ -2,9 +2,10 @@ from sqlalchemy.orm import relationship, mapped_column from sqlalchemy.dialects.postgresql import JSONB, UUID from geoapi.db import Base +from geoapi.models.file_location_tracking_mixin import FileLocationTrackingMixin -class TileServer(Base): +class TileServer(Base, FileLocationTrackingMixin): __tablename__ = "tile_servers" id = mapped_column(Integer, primary_key=True) project_id = mapped_column( @@ -48,18 +49,9 @@ class TileServer(Base): attribution = mapped_column(String(), nullable=False) - original_system = mapped_column( - String(), - nullable=True, - index=True, - comment="Tapis system where the original file was sourced from (if applicable)", - ) - - original_path = mapped_column( - String(), - nullable=True, - comment="Original file path on the source system (if applicable)", - ) + # Note: original_system, original_path, current_system, 
current_path, + # is_on_public_system, and last_public_system_check are inherited + # from FileLocationTrackingMixin tileOptions = mapped_column(JSONB, default={}) uiOptions = mapped_column(JSONB, default={}) diff --git a/geoapi/routes/file_location_status.py b/geoapi/routes/file_location_status.py index 8870dc0e..0cfc76b2 100644 --- a/geoapi/routes/file_location_status.py +++ b/geoapi/routes/file_location_status.py @@ -5,11 +5,11 @@ from sqlalchemy.orm import Session from geoapi.log import logger -from geoapi.models import Feature, FeatureAsset -from geoapi.schema.projects import FeatureAssetModel, TaskModel +from geoapi.models import Feature, FeatureAsset, TileServer +from geoapi.schema.projects import FeatureAssetModel, TileServerModel, TaskModel from geoapi.utils.decorators import project_permissions_guard from geoapi.tasks.file_location_check import check_and_update_file_locations - +from geoapi.services.tile_server import TileService class FileLocationStatusModel(BaseModel): model_config = ConfigDict(from_attributes=True) @@ -30,10 +30,29 @@ class StartFileLocationStatusRefreshResponse(BaseModel): task_id: int | None = None +class FileLocationSummary(BaseModel): + """Summary of what is and isn't being checked""" + + total_features: int + features_with_assets: int # Checked + features_without_assets: int # Not checked ✗ + total_tile_servers: int + internal_tile_servers: int # Checked + external_tile_servers: int # Not checked + + class ProjectFileLocationStatusSummary(BaseModel): + """ + Summary of file location status for a project. + + Includes both feature assets (photos, videos, point clouds) and tile servers (COG layers). + """ + project_id: int check: FileLocationStatusModel | None = None - files: list[FeatureAssetModel] + featureAssets: list[FeatureAssetModel] + tileServers: list[TileServerModel] + summary: FileLocationSummary class ProjectFileLocationStatusController(Controller): @@ -44,7 +63,7 @@ class ProjectFileLocationStatusController(Controller): "/", tags=["projects"], operation_id="start_file_location_refresh", - description="Start checking and updating file location (and public access) for project files", + description="Start checking and updating file location (and public access) for project files and tile servers", guards=[project_permissions_guard], status_code=202, ) @@ -52,9 +71,9 @@ def start_public_status_refresh( self, request: Request, db_session: Session, project_id: int ) -> StartFileLocationStatusRefreshResponse: """ - Start a background task to refresh public-system-access status of files. + Start a background task to refresh public-system-access status of files and tile servers. - This checks all feature assets in the project and updates their + This checks all feature assets and internal tile servers in the project and updates their current_system/current_path if they're found in the published location. """ from geoapi.services.file_location_status import FileLocationStatusService @@ -113,16 +132,20 @@ def start_public_status_refresh( "/files", tags=["projects"], operation_id="get_public_files_status", - description="Get public/private status of all files in the project with last refresh info", + description="Get public/private status of all files and tile servers in the project with last refresh info", guards=[project_permissions_guard], ) def get_files_status( self, request: Request, db_session: Session, project_id: int ) -> ProjectFileLocationStatusSummary: """ - Get detailed status of which files are public vs private. 
+ Get detailed status of which files and tile servers are public vs private. - Single endpoint that returns both file-level details and last refresh info. + Single endpoint that returns: + - Feature assets (photos, videos, point clouds, etc.) + - Internal tile servers (COG layers) + - Summary of what's not being checked + - Last refresh check info """ from geoapi.services.file_location_status import FileLocationStatusService @@ -134,7 +157,7 @@ def get_files_status( # Get all feature assets # NOTE: Only includes Features with FeatureAssets. Features without assets # (geometry-only, like manually drawn shapes) are excluded from this list. - # See WG-600. Also consider tile layers (internal cogs) + # See WG-600. feature_assets = ( db_session.query(FeatureAsset) .join(Feature, Feature.id == FeatureAsset.feature_id) @@ -142,16 +165,58 @@ def get_files_status( .all() ) - # TODO with WG-600 or before then we should include a list of features that aren't accounted for here just so - # users are aware of a gap of knowledge + + # Calculate summary counts for context + total_features = ( + db_session.query(Feature).filter(Feature.project_id == project_id).count() + ) + + count_features_with_assets = ( + db_session.query(Feature.id) + .join(FeatureAsset, Feature.id == FeatureAsset.feature_id) + .filter(Feature.project_id == project_id) + .distinct() + .count() + ) + + # These features without feature assets are not considered (see WG-600) + count_features_without_assets = total_features - count_features_with_assets + + # Get all tile servers + all_tile_servers = TileService.getTileServers(db_session, projectId=project_id) + + # Get all internal tile servers + # Only internal tile servers are checked (external ones use external URLs + internal_tile_servers = [ts for ts in all_tile_servers if ts.internal is True] + + # Calculate counts + total_tile_servers = len(all_tile_servers) + count_internal_tile_servers = len(internal_tile_servers) + count_external_tile_servers = total_tile_servers - count_internal_tile_servers # Get most recent check public_access_check = FileLocationStatusService.get(db_session, project_id) - files_status = [ + # Convert to response models + feature_assets_status = [ FeatureAssetModel.model_validate(asset) for asset in feature_assets ] + internal_tile_servers_status = [ + TileServerModel.model_validate(tile_server) + for tile_server in internal_tile_servers + ] + + # Build summary + summary = FileLocationSummary( + total_features=total_features, + features_with_assets=count_features_with_assets, # Considered + features_without_assets=count_features_without_assets, # Not considered WG-600 + total_tile_servers=total_tile_servers, + internal_tile_servers=count_internal_tile_servers, # Considered + external_tile_servers=count_external_tile_servers, # Not considered + ) + # Convert check with nested task validation check_data = None if public_access_check: @@ -170,5 +235,7 @@ def get_files_status( return ProjectFileLocationStatusSummary( project_id=project_id, check=check_data, - files=files_status, + featureAssets=feature_assets_status, + tileServers=internal_tile_servers_status, + summary=summary, ) diff --git a/geoapi/schema/projects.py b/geoapi/schema/projects.py index ada0c20a..1b944863 100644 --- a/geoapi/schema/projects.py +++ b/geoapi/schema/projects.py @@ -186,6 +186,13 @@ class TileServerDTO(SQLAlchemyDTO[TileServer]): "attribution", "tileOptions", "uiOptions", + # File location tracking fields (from FileLocationTrackingMixin) + "original_system", + "original_path", + 
"current_system", + "current_path", + "is_on_public_system", + "last_public_system_check", } ) @@ -204,6 +211,14 @@ class TileServerModel(BaseModel): tileOptions: dict | None = None uiOptions: dict | None = None + # File location tracking fields (from FileLocationTrackingMixin) + original_system: str | None = None + original_path: str | None = None + current_system: str | None = None + current_path: str | None = None + is_on_public_system: bool | None = None + last_public_system_check: datetime | None = None + # TODO: replace with TapisFilePath (and update client software) class TapisFileUploadModel(BaseModel): diff --git a/geoapi/services/tile_server.py b/geoapi/services/tile_server.py index 22baf6ff..61638927 100644 --- a/geoapi/services/tile_server.py +++ b/geoapi/services/tile_server.py @@ -1,4 +1,4 @@ -from typing import List, Dict, Tuple +from typing import List, Dict, Tuple, Optional from celery import uuid as celery_uuid from geoapi.models import TileServer, Task, User @@ -52,11 +52,30 @@ def addTileServer(database_session, projectId: int, data: Dict): return ts @staticmethod - def getTileServers(database_session, projectId: int) -> List[TileServer]: - tile_servers = ( - database_session.query(TileServer).filter_by(project_id=projectId).all() - ) - return tile_servers + def getTileServers( + database_session, + projectId: int, + internal: Optional[ + bool + ] = None, + ) -> List[TileServer]: + """ + Get tile servers for a project. + + Args: + database_session: Database session + projectId: Project ID + internal: Optional filter - None=all, True=internal only, False=external only + + Returns: + List of TileServer objects + """ + query = database_session.query(TileServer).filter_by(project_id=projectId) + + if internal is not None: + query = query.filter(TileServer.internal.is_(internal)) + + return query.all() @staticmethod def deleteTileServer(database_session, tile_server_id: int) -> None: diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py index decf7b90..fa43508d 100644 --- a/geoapi/tasks/file_location_check.py +++ b/geoapi/tasks/file_location_check.py @@ -4,14 +4,15 @@ """ from datetime import datetime, timezone -from typing import Dict +from typing import Dict, Union import os from sqlalchemy import and_, or_ from geoapi.celery_app import app from geoapi.db import create_task_session -from geoapi.models import Project, Feature, User, TaskStatus, PointCloud +from geoapi.services.tile_server import TileService +from geoapi.models import Project, Feature, User, TaskStatus, PointCloud, TileServer from geoapi.models.feature import FeatureAsset from geoapi.services.file_location_status import FileLocationStatusService from geoapi.utils.external_apis import TapisUtils, get_session, TapisListingError @@ -26,7 +27,7 @@ "designsafe.storage.community", ] -BATCH_SIZE = 100 # Commit every 100 files +BATCH_SIZE = 500 # Commit every 500 items def extract_project_uuid(system_id: str) -> str | None: @@ -110,7 +111,9 @@ def get_file_tree_for_published_project( tapis_client = TapisUtils(session, user) - # Build file index starting from root of project + # Calculate the file tree of published DS project associated with this map project and place in a cache + # that will hold it and then possibly from other systems/projects + # (we assume most files will be from here, but they could be from other DS projects as well) file_index = build_file_index_from_tapis( tapis_client, DESIGNSAFE_PUBLISHED_SYSTEM, f"/published-data/{designsafe_prj}/" ) @@ -139,7 +142,7 @@ def 
determine_if_exists_in_tree( Check if asset's file exists in the file tree. Returns: - tuple of (is_published, system, path) + tuple of (is_published, path) """ if file_tree is None or not current_file_path: return (False, None) @@ -192,7 +195,25 @@ def get_filename_from_point_cloud_asset(session, asset: FeatureAsset) -> str | N return None -def fix_and_backfill( +def backfill_current_location(item: Union[FeatureAsset, TileServer]) -> None: + """ + Backfill current_system/current_path from original_* if missing. + Works for both FeatureAsset and TileServer. + """ + if not item.current_system and item.original_system: + logger.debug( + f"Updated current_system for {type(item).__name__}={item.id} to {item.original_system}" + ) + item.current_system = item.original_system + + if not item.current_path and item.original_path: + logger.debug( + f"Updated current_path for {type(item).__name__}={item.id} to {item.original_path}" + ) + item.current_path = item.original_path + + +def fix_and_backfill_feature_asset( session, tapis_client, project: Project, @@ -200,25 +221,17 @@ def fix_and_backfill( project_system: str, project_system_file_tree: Dict, ): - """This method attempts to fix and backfill any information - - There are missing information that we can try to fix. These things have been fixed for - newly created features, but we are trying to correct them for older features. - They are: - * (A) missing original_system -- only original_path was stored (original_system, - current_system, current_path came later) but we - might find the file in the system associated with - the map - * (B) missing current_system, current_path -- can baac with original_path and original_system - * (C) original_path missing for point clouds -- but can be derived from the point_cloud model - * (E) features derived from geojson or shapefiles don't have file info; TODO in WG-600 - * (D) raster files (tile layers) missing current path and current system (TODO) - - This methods fixes these things except for (D) and (E) + """ + Fix and backfill FeatureAsset-specific information. + + Handles: + - (A) missing original_system + - (B) missing current_system, current_path + - (C) original_path missing for point clouds """ logger.debug(f"Checking asset={asset.id} to see if we can fix anything") - # (C) See if point cloud can be fixed (we do this first as results - # might be used by B or A steps) + + # (C) See if point cloud can be fixed (we do this first as results might be used by B or A steps) if asset.original_path is None and asset.asset_type == "point_cloud": file_name = get_filename_from_point_cloud_asset(session, asset) logger.info( @@ -238,51 +251,84 @@ def fix_and_backfill( # (A) some legacy data is missing original_system if not asset.original_system: - # We can make assume that if the map project is connected to a - # DS project, then it is possible that is the system where the data resides. - # We check that system for the file and update original_system if there is a match. 
logger.debug( f"Missing original_system for asset={asset.id} so seeing if we see file on current DS project" ) if file_exists(tapis_client, project.system_id, asset.original_path): logger.debug( - f"Found file on current DS project so updating original_system to {project.system_id} " + f"Found file on current DS project so updating original_system to {project.system_id}" ) asset.original_system = project.system_id # (B) Backfill current_system/current_path for legacy data - if not asset.current_system and asset.original_system: - logger.debug( - f"Updated current_system for asset={asset.id} to {asset.original_system}" + backfill_current_location(asset) + + logger.debug(f"Completed checking asset={asset.id}") + + +def check_and_update_public_system( + item: Union[FeatureAsset, TileServer], + published_file_tree_cache: Dict, + session, + user, +) -> None: + """ + Check if item is on a public system and update location if found in published tree. + Works for both FeatureAsset and TileServer. + """ + item_type = type(item).__name__ + item_id = item.id + + # Skip if already on a public system + if item.current_system in PUBLIC_SYSTEMS: + item.is_on_public_system = True + return + + if item.current_system is None: + logger.warning( + f"We don't know the current system: {item_type}={item_id}" + f" original_path={item.original_path} original_system={item.original_system}" + f" current_path={item.current_path} current_system={item.current_system}" ) - asset.current_system = asset.original_system - if not asset.current_path and asset.original_path: - logger.debug( - f"Updated current_path for asset={asset.id} to {asset.original_path} " + return + + # Cache published file tree for this system if not already cached + if item.current_system not in published_file_tree_cache: + logger.info(f"Discovering new system {item.current_system}, building file tree") + published_file_tree_cache[item.current_system] = ( + get_file_tree_for_published_project(session, user, item.current_system) ) - asset.current_path = asset.original_path - logger.debug(f"Completed checking asset={asset.id}") + published_project_file_tree = published_file_tree_cache.get(item.current_system, {}) + + # Check if file exists in published tree + exists, found_path = determine_if_exists_in_tree( + published_project_file_tree, item.current_path + ) + + item.is_on_public_system = exists + + if exists and found_path: + item.current_system = DESIGNSAFE_PUBLISHED_SYSTEM + item.current_path = found_path @app.task(queue="default") def check_and_update_file_locations(user_id: int, project_id: int): """ - Check all feature assets in a project and update where they are located (i.e. new published - location) and then check if they are located tapis system that is publicly accessible + Check all feature assets and tile servers in a project and update where they are located + (i.e. new published location) and then check if they are on a publicly accessible system. 
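+    Only internal tile servers are checked; external tile servers use external URLs
+    and don't need checking.
+
+    Example (standard Celery dispatch; the route may also use .delay):
+        check_and_update_file_locations.apply_async(args=[user_id, project_id])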
Updates: - - FeatureAsset.current_system and current_path if file found in public system - - FeatureAsset.is_on_public_system based on current_system - - FeatureAsset.last_public_system_check timestamp - - FileLocationCheck record with new info + - FeatureAsset/TileServer: current_system, current_path, is_on_public_system, last_public_system_check + - FileLocationCheck record with check metadata """ logger.info(f"Starting file location check for project:{project_id} user:{user_id}") with create_task_session() as session: user = None file_location_check = None - failed_assets = [] + failed_items = [] try: user = session.get(User, user_id) @@ -322,14 +368,20 @@ def check_and_update_file_locations(user_id: int, project_id: int): .all() ) - total_checked = len(feature_assets) + # Get all internal tile servers for this project + # Only check internal tile servers (served by geoapi) + # External tile servers are using external URLs and don't need checking + tile_servers = TileService.getTileServers(session, projectId=project_id, internal=True) + + total_checked = len(feature_assets) + len(tile_servers) # Set total files count file_location_check.total_files = total_checked session.commit() logger.info( - f"Starting check for project {project_id}: {total_checked} assets to check" + f"Starting check for project {project_id}: " + f"{len(feature_assets)} feature assets + {len(tile_servers)} tile servers = {total_checked} items to check" ) update_task_and_send_progress_update( @@ -341,9 +393,7 @@ def check_and_update_file_locations(user_id: int, project_id: int): tapis_client = TapisUtils(session, user) - # Calculate the file tree of published DS project associated with this map project and place in a cache - # that will hold it and then possibly from other systems/projects - # (we assume most files will be from here, but they could be from other DS projects as well) + # Build published file tree cache published_file_tree_cache = {} if project.system_id: published_file_tree_cache[project.system_id] = ( @@ -352,21 +402,13 @@ def check_and_update_file_locations(user_id: int, project_id: int): ) ) - # Check if any point clouds are missing original path - has_point_cloud_missing_original_path = ( - session.query(FeatureAsset) - .join(Feature, Feature.id == FeatureAsset.feature_id) - .filter( - and_( - Feature.project_id == project_id, - FeatureAsset.asset_type == "point_cloud", - FeatureAsset.original_path.is_(None), - ) - ) - .first() - is not None + # Check if any point clouds need unpublished project file index + has_point_cloud_missing_original_path = any( + asset.original_path is None and asset.asset_type == "point_cloud" + for asset in feature_assets ) + # Create a listing for current project in case we need to use it unpublished_project_file_index = {} if has_point_cloud_missing_original_path and is_designsafe_project( project.system_id @@ -374,13 +416,12 @@ def check_and_update_file_locations(user_id: int, project_id: int): logger.info( "Point cloud asset(s) are missing original_path so we gather file listing to attempt to fix that" ) - # Create a listing for current project in case we need to unpublished_project_file_index = build_file_index_from_tapis( tapis_client, system_id=project.system_id, path="//" ) logger.info(f"Indexed {len(unpublished_project_file_index)} files") - # Process each asset + # Process each feature asset for i, asset in enumerate(feature_assets): try: # Update timestamp @@ -393,7 +434,7 @@ def check_and_update_file_locations(user_id: int, project_id: int): ) # Fix 
attributes of any assets missing info as they were created in the past - fix_and_backfill( + fix_and_backfill_feature_asset( session=session, tapis_client=tapis_client, project=project, @@ -402,83 +443,81 @@ def check_and_update_file_locations(user_id: int, project_id: int): project_system_file_tree=unpublished_project_file_index, ) - # Skip if already on a public system - if asset.current_system in PUBLIC_SYSTEMS: - asset.is_on_public_system = True - session.add(asset) - else: - if asset.current_system is None: - logger.warning( - f"We don't know the current system:" - f" asset={asset.id} asset_type={asset.asset_type}" - f" original_path={asset.original_path} original_system={asset.original_system}" - f" current_path={asset.current_path} current_system={asset.current_system} " - ) - - # TODO We should use https://www.designsafe-ci.org/api/publications/v2/PRJ-1234 endpoint to look - # at files (e.g fileObjs) but currently does not appear complete and and missing a previous-path attribute - if ( - asset.current_system - and asset.current_system not in published_file_tree_cache - ): - # First time seeing this system - fetch and cache it - logger.info( - f"Discovering new system {asset.current_system}, building file tree" - ) - published_file_tree_cache[asset.current_system] = ( - get_file_tree_for_published_project( - session, user, asset.current_system - ) - ) - - published_project_file_tree = published_file_tree_cache.get( - asset.current_system, {} - ) + # Check and update public system status + check_and_update_public_system( + asset, published_file_tree_cache, session, user + ) - # Check if file exists in published tree - exists, found_path = determine_if_exists_in_tree( - published_project_file_tree, asset.current_path + session.add(asset) + + # Commit in large batches for memory management (rare 5000+ item cases) + if (i + 1) % BATCH_SIZE == 0: + session.commit() + session.expire_all() + logger.info( + f"Batch: {i + 1}/{total_checked} processed, {len(failed_items)} errors" ) - asset.is_on_public_system = exists + except Exception as e: + error_msg = str(e)[:100] + logger.exception( + f"Error checking asset {asset.id} ({asset.original_path}): {e}" + ) - if exists and found_path: - asset.current_system = DESIGNSAFE_PUBLISHED_SYSTEM - asset.current_path = found_path + failed_items.append( + { + "type": "feature_asset", + "id": asset.id, + "path": asset.original_path or "unknown", + "error": error_msg, + } + ) - session.add(asset) + session.rollback() + continue - # Commit and clear cache in batches + # Process each tile server + for i, tile_server in enumerate(tile_servers, start=len(feature_assets)): + try: + # Update timestamp + tile_server.last_public_system_check = datetime.now(timezone.utc) + + logger.debug( + f"Processing tile_server={tile_server.id} name={tile_server.name}" + f" original_path={tile_server.original_path} original_system={tile_server.original_system}" + f" current_path={tile_server.current_path} current_system={tile_server.current_system}" + ) + + # Backfill current_system/current_path if missing + backfill_current_location(tile_server) + + # Check and update public system status + check_and_update_public_system( + tile_server, published_file_tree_cache, session, user + ) + + session.add(tile_server) + + # Commit in large batches if (i + 1) % BATCH_SIZE == 0: session.commit() session.expire_all() - - # Update counts - file_location_check.files_checked = i + 1 - len(failed_assets) - file_location_check.files_failed = len(failed_assets) - session.commit() - 
logger.info( - f"Batch: {i + 1}/{total_checked} processed, {len(failed_assets)} errors" - ) - - update_task_and_send_progress_update( - session, - user, - task_id=file_location_check.task.id, - latest_message=f"Processed {i + 1}/{total_checked} files ({len(failed_assets)} errors)", + f"Batch: {i + 1}/{total_checked} processed, {len(failed_items)} errors" ) except Exception as e: error_msg = str(e)[:100] logger.exception( - f"Error checking asset {asset.id} ({asset.original_path}): {e}" + f"Error checking tile_server {tile_server.id} ({tile_server.name}): {e}" ) - failed_assets.append( + failed_items.append( { - "asset_id": asset.id, - "path": asset.original_path or "unknown", + "type": "tile_server", + "id": tile_server.id, + "name": tile_server.name, + "path": tile_server.original_path or "unknown", "error": error_msg, } ) @@ -491,24 +530,22 @@ def check_and_update_file_locations(user_id: int, project_id: int): # Update final counts file_location_check.completed_at = datetime.now(timezone.utc) - file_location_check.files_checked = total_checked - len(failed_assets) - file_location_check.files_failed = len(failed_assets) + file_location_check.files_checked = total_checked - len(failed_items) + file_location_check.files_failed = len(failed_items) session.add(file_location_check) session.commit() - if failed_assets: + if failed_items: logger.warning( - f"File location check completed with {len(failed_assets)} failures for project {project_id}. " - f"Failed assets: {failed_assets}" + f"File location check completed with {len(failed_items)} failures for project {project_id}. " + f"Failed items: {failed_items}" ) - if failed_assets: - final_message = ( - f"Checked {total_checked} files: {file_location_check.files_checked} successful," - f" {len(failed_assets)} failed" - ) - else: - final_message = f"Successfully checked all {total_checked} files" + final_message = ( + f"Checked {total_checked} files: {file_location_check.files_checked} successful, {len(failed_items)} failed" + if failed_items + else f"Successfully checked all {total_checked} files" + ) logger.info(f"Check completed for project {project_id}: {final_message}") @@ -527,7 +564,6 @@ def check_and_update_file_locations(user_id: int, project_id: int): try: if file_location_check: file_location_check.completed_at = datetime.now(timezone.utc) - # Keep existing counts if we got that far session.add(file_location_check) if file_location_check and file_location_check.task and user: @@ -543,7 +579,6 @@ def check_and_update_file_locations(user_id: int, project_id: int): except Exception as cleanup_error: logger.exception(f"Failed to mark task as failed: {cleanup_error}") session.rollback() - # Re-raise to mark Celery task as failed as we can't even mark our internal - # task as failed + # task as faile raise diff --git a/geoapi/tasks/raster.py b/geoapi/tasks/raster.py index 81fa0508..638c9aa4 100644 --- a/geoapi/tasks/raster.py +++ b/geoapi/tasks/raster.py @@ -208,7 +208,7 @@ def import_tile_servers_from_tapis( tile_options = get_cog_metadata(cog_path) # Extract renderOptions from tile_options (currently - # only prepoulated for single banded images) + # only prepopulated for single banded images) render_options = tile_options.pop("renderOptions", {}) # Create TileServer pointing to TiTiler @@ -223,6 +223,8 @@ def import_tile_servers_from_tapis( attribution="", original_system=tapis_file.system, original_path=tapis_file.path, + current_system=tapis_file.system, + current_path=tapis_file.path, tileOptions=tile_options, uiOptions={ "zIndex": 
0, # frontend will readjust as needed diff --git a/geoapi/tests/api_tests/test_file_location_status_routes.py b/geoapi/tests/api_tests/test_file_location_status_routes.py index 15862dc7..273e3faf 100644 --- a/geoapi/tests/api_tests/test_file_location_status_routes.py +++ b/geoapi/tests/api_tests/test_file_location_status_routes.py @@ -72,7 +72,7 @@ def test_get_public_files_status_empty(test_client, projects_fixture, user1): data = resp.json() assert data["project_id"] == projects_fixture.id assert data["check"] is None - assert data["files"] == [] + assert data["featureAssets"] == [] def test_get_public_files_status_with_features( @@ -85,8 +85,8 @@ def test_get_public_files_status_with_features( assert resp.status_code == 200 data = resp.json() assert data["project_id"] == projects_fixture.id - assert len(data["files"]) == 1 - assert data["files"][0]["asset_type"] == "image" + assert len(data["featureAssets"]) == 1 + assert data["featureAssets"][0]["asset_type"] == "image" def test_get_public_files_status_unauthorized(test_client, projects_fixture): diff --git a/geoapi/tests/tasks_tests/test_file_location_check.py b/geoapi/tests/tasks_tests/test_file_location_check.py index 0ab4f370..1fd60c1f 100644 --- a/geoapi/tests/tasks_tests/test_file_location_check.py +++ b/geoapi/tests/tasks_tests/test_file_location_check.py @@ -382,14 +382,6 @@ def test_batch_commits( assert file_location_check.files_checked == num_assets assert file_location_check.files_failed == 0 - # Verify: Progress updates were called for batches - progress_calls = [ - call - for call in mock_update_task.call_args_list - if "Processed" in call.kwargs.get("latest_message", "") - ] - assert len(progress_calls) >= 1 - def test_individual_asset_error_continues( db_session, From 85ede1d9f9b277893da18ff0194770a6c5f86139 Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Wed, 12 Nov 2025 21:08:50 -0600 Subject: [PATCH 18/28] Fix linting --- geoapi/routes/file_location_status.py | 4 ++-- geoapi/services/tile_server.py | 4 +--- geoapi/tasks/file_location_check.py | 4 +++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/geoapi/routes/file_location_status.py b/geoapi/routes/file_location_status.py index 0cfc76b2..96ee3b5d 100644 --- a/geoapi/routes/file_location_status.py +++ b/geoapi/routes/file_location_status.py @@ -5,12 +5,13 @@ from sqlalchemy.orm import Session from geoapi.log import logger -from geoapi.models import Feature, FeatureAsset, TileServer +from geoapi.models import Feature, FeatureAsset from geoapi.schema.projects import FeatureAssetModel, TileServerModel, TaskModel from geoapi.utils.decorators import project_permissions_guard from geoapi.tasks.file_location_check import check_and_update_file_locations from geoapi.services.tile_server import TileService + class FileLocationStatusModel(BaseModel): model_config = ConfigDict(from_attributes=True) @@ -165,7 +166,6 @@ def get_files_status( .all() ) - # Calculate summary counts for context total_features = ( db_session.query(Feature).filter(Feature.project_id == project_id).count() diff --git a/geoapi/services/tile_server.py b/geoapi/services/tile_server.py index 61638927..65f344c6 100644 --- a/geoapi/services/tile_server.py +++ b/geoapi/services/tile_server.py @@ -55,9 +55,7 @@ def addTileServer(database_session, projectId: int, data: Dict): def getTileServers( database_session, projectId: int, - internal: Optional[ - bool - ] = None, + internal: Optional[bool] = None, ) -> List[TileServer]: """ Get tile servers for a project. 
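
Note: `internal` distinguishes tile servers whose tiles geoapi serves itself (the TiTiler-backed COG layers created in `import_tile_servers_from_tapis`) from external layers that only reference third-party URLs. A minimal sketch of the filter, assuming an `internal` boolean column on `TileServer` (the column name and query body are illustrative, not taken from this patch):

    from typing import List, Optional

    from geoapi.models import TileServer

    def getTileServers(
        database_session,
        projectId: int,
        internal: Optional[bool] = None,
    ) -> List[TileServer]:
        # Every tile server belonging to the project
        query = database_session.query(TileServer).filter(
            TileServer.project_id == projectId
        )
        # internal=None keeps the old return-everything behavior; True/False
        # narrows to geoapi-served or external layers respectively
        if internal is not None:
            query = query.filter(TileServer.internal.is_(internal))
        return query.all()

The file-location check (next hunk) then requests only the subset it can verify: `TileService.getTileServers(session, projectId=project_id, internal=True)`.
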
diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py index fa43508d..82b67738 100644 --- a/geoapi/tasks/file_location_check.py +++ b/geoapi/tasks/file_location_check.py @@ -371,7 +371,9 @@ def check_and_update_file_locations(user_id: int, project_id: int): # Get all internal tile servers for this project # Only check internal tile servers (served by geoapi) # External tile servers are using external URLs and don't need checking - tile_servers = TileService.getTileServers(session, projectId=project_id, internal=True) + tile_servers = TileService.getTileServers( + session, projectId=project_id, internal=True + ) total_checked = len(feature_assets) + len(tile_servers) From b2d7f61a6781b5f816c4e00816d417f14d4e8e6f Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Wed, 12 Nov 2025 21:09:19 -0600 Subject: [PATCH 19/28] Save original_system and path for FeatureAssets associated with point clouds --- geoapi/tasks/external_data.py | 5 ++++- geoapi/tasks/lidar.py | 40 ++++++++++++++++++----------------- geoapi/tests/conftest.py | 12 ----------- 3 files changed, 25 insertions(+), 32 deletions(-) diff --git a/geoapi/tasks/external_data.py b/geoapi/tasks/external_data.py index e8e4bce3..1a137b43 100644 --- a/geoapi/tasks/external_data.py +++ b/geoapi/tasks/external_data.py @@ -332,7 +332,10 @@ def import_point_clouds_from_tapis(userId: int, files, pointCloudId: int): send_progress_update(user, celery_task_id, "error", failed_message) return - point_cloud.files_info = get_point_cloud_info(session, pointCloudId) + # add to our files_info with these new files + point_cloud.files_info = (point_cloud.files_info or []) + get_point_cloud_info( + files + ) session.add(point_cloud) session.commit() diff --git a/geoapi/tasks/lidar.py b/geoapi/tasks/lidar.py index 97a90529..ab3f7411 100644 --- a/geoapi/tasks/lidar.py +++ b/geoapi/tasks/lidar.py @@ -50,21 +50,18 @@ def check_point_cloud(file_path: str) -> None: getProj4(file_path) -def get_point_cloud_info(database_session, pointCloudId: int) -> dict: +def get_point_cloud_info(files: list) -> list: """ - Get info on las files - :param pointCloudId: int - :return: None + Get info from source files """ - from geoapi.services.point_cloud import PointCloudService - - point_cloud = PointCloudService.get(database_session, pointCloudId) - path_to_original_point_clouds = get_asset_path( - point_cloud.path, PointCloudService.ORIGINAL_FILES_DIR - ) - input_files = get_point_cloud_files(path_to_original_point_clouds) - - return [{"name": os.path.basename(f)} for f in input_files] + return [ + { + "original_system": file["system"], + "original_path": file["path"], + "name": os.path.basename(file["path"]), + } + for file in files + ] class PointCloudProcessingTask(celery.Task): @@ -139,12 +136,7 @@ def convert_to_potree(self, pointCloudId: int) -> None: point_cloud.path, PointCloudService.PROCESSED_DIR ) - input_files = [ - get_asset_path(path_to_original_point_clouds, file) - for file in os.listdir(path_to_original_point_clouds) - if pathlib.Path(file).suffix.lstrip(".").lower() - in PointCloudService.LIDAR_FILE_EXTENSIONS - ] + input_files = get_point_cloud_files(path_to_original_point_clouds) outline = get_bounding_box_2d(input_files) @@ -192,12 +184,22 @@ def convert_to_potree(self, pointCloudId: int) -> None: asset_uuid = uuid.uuid4() base_filepath = make_project_asset_dir(point_cloud.project_id) asset_path = os.path.join(base_filepath, str(asset_uuid)) + + # Grab first file as we will associate the FeatureAsset with just one file + 
first_file = point_cloud.files_info[0] if point_cloud.files_info else {} + original_system = first_file.get("original_system") + original_path = first_file.get("original_path") + fa = FeatureAsset( uuid=asset_uuid, asset_type="point_cloud", path=get_asset_relative_path(asset_path), display_path=point_cloud.description, feature=feature, + original_system=original_system, + original_path=original_path, + current_system=original_system, + current_path=original_path, ) feature.assets.append(fa) point_cloud.feature = feature diff --git a/geoapi/tests/conftest.py b/geoapi/tests/conftest.py index 307a22f1..6e63c2ae 100644 --- a/geoapi/tests/conftest.py +++ b/geoapi/tests/conftest.py @@ -567,18 +567,6 @@ def check_point_cloud_mock_missing_crs(): yield mock_check_point_cloud_mock -@pytest.fixture(scope="function") -def get_point_cloud_info_mock(): - with patch( - "geoapi.services.point_cloud.get_point_cloud_info" - ) as mock_get_point_cloud_info: - mock_result = MagicMock() - mock_result.get.return_value = [{"name": "test.las"}] - - mock_get_point_cloud_info.return_value = mock_result - yield mock_get_point_cloud_info - - @pytest.fixture(scope="function") def tapis_file_listings_mock(): filesListing = [ From 0821cd448a95e79a39a1b6fe76670b35efcb78f7 Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Fri, 14 Nov 2025 11:09:15 -0600 Subject: [PATCH 20/28] Refactor point cloud --- geoapi/celery_app.py | 2 +- geoapi/routes/projects.py | 4 +- geoapi/services/point_cloud.py | 9 +- geoapi/tasks/external_data.py | 181 +------- geoapi/tasks/lidar.py | 217 ---------- geoapi/tasks/point_cloud.py | 387 ++++++++++++++++++ geoapi/tasks/utils.py | 40 ++ .../api_tests/test_point_cloud_routes.py | 6 +- .../api_tests/test_point_cloud_service.py | 7 +- geoapi/tests/conftest.py | 2 +- .../external_data_tests/test_external_data.py | 20 +- .../{test_lidar.py => test_point_cloud.py} | 2 +- geoapi/utils/{lidar.py => point_cloud.py} | 0 13 files changed, 457 insertions(+), 420 deletions(-) delete mode 100644 geoapi/tasks/lidar.py create mode 100644 geoapi/tasks/point_cloud.py rename geoapi/tests/utils_tests/{test_lidar.py => test_point_cloud.py} (97%) rename geoapi/utils/{lidar.py => point_cloud.py} (100%) diff --git a/geoapi/celery_app.py b/geoapi/celery_app.py index 15c94f10..ceba6e2f 100644 --- a/geoapi/celery_app.py +++ b/geoapi/celery_app.py @@ -21,7 +21,7 @@ # Import task modules app.conf.imports = ( "geoapi.tasks.raster", - "geoapi.tasks.lidar", + "geoapi.tasks.point_cloud", "geoapi.tasks.streetview", "geoapi.tasks.projects", "geoapi.tasks.external_data", diff --git a/geoapi/routes/projects.py b/geoapi/routes/projects.py index a9ac3d14..085c3ebf 100644 --- a/geoapi/routes/projects.py +++ b/geoapi/routes/projects.py @@ -14,7 +14,7 @@ from geoapi.services.point_cloud import PointCloudService from geoapi.services.projects import ProjectsService from geoapi.services.tile_server import TileService -from geoapi.tasks import external_data, streetview +from geoapi.tasks import external_data, streetview, point_cloud from geoapi.models import Task, Project, Feature, TileServer, Overlay, PointCloud, User from geoapi.utils.decorators import ( project_permissions_allow_public_guard, @@ -865,7 +865,7 @@ def import_point_cloud_file_from_tapis( for file in files: PointCloudService.check_file_extension(file.path) - external_data.import_point_clouds_from_tapis.delay( + point_cloud.import_point_clouds_from_tapis.delay( u.id, [file.model_dump() for file in files], point_cloud_id ) return OkResponse(message="Task created for point 
cloud import") diff --git a/geoapi/services/point_cloud.py b/geoapi/services/point_cloud.py index 228d189e..3cf33fde 100644 --- a/geoapi/services/point_cloud.py +++ b/geoapi/services/point_cloud.py @@ -10,7 +10,6 @@ from geoapi.models import PointCloud, Project, User, Task from geoapi.log import logging -from geoapi.tasks.lidar import convert_to_potree from geoapi.utils.assets import ( make_project_asset_dir, delete_assets, @@ -22,7 +21,7 @@ class PointCloudService: - LIDAR_FILE_EXTENSIONS = ("las", "laz") + POINT_CLOUD_FILE_EXTENSIONS = ("las", "laz") ORIGINAL_FILES_DIR = "original_files" PROCESSED_DIR = "point_cloud" @@ -119,7 +118,7 @@ def check_file_extension(file_name): :raises: ApiException """ file_ext = pathlib.Path(file_name).suffix.lstrip(".").lower() - if file_ext not in PointCloudService.LIDAR_FILE_EXTENSIONS: + if file_ext not in PointCloudService.POINT_CLOUD_FILE_EXTENSIONS: raise ApiException("Invalid file type for point clouds.") @staticmethod @@ -173,7 +172,9 @@ def _process_point_clouds(database_session, pointCloudId: int) -> Task: ) ) - # Process asynchronously lidar file and add a feature asset + from geoapi.tasks.point_cloud import convert_to_potree + + # Process asynchronously point_cloud file and add a feature asset convert_to_potree.apply_async(args=[pointCloudId], task_id=celery_task_id) return task diff --git a/geoapi/tasks/external_data.py b/geoapi/tasks/external_data.py index 1a137b43..fd46a9ea 100644 --- a/geoapi/tasks/external_data.py +++ b/geoapi/tasks/external_data.py @@ -7,15 +7,14 @@ from pathlib import Path from enum import Enum -from celery import uuid as celery_uuid, current_task +from celery import current_task from sqlalchemy import true from geoapi.celery_app import app from geoapi.exceptions import ( - InvalidCoordinateReferenceSystem, GetUsersForProjectNotSupported, ) -from geoapi.models import Project, User, ProjectUser, Task, TaskStatus +from geoapi.models import Project, User, ProjectUser from geoapi.utils.external_apis import ( TapisUtils, SystemUser, @@ -29,13 +28,6 @@ from geoapi.services.features import FeaturesService from geoapi.services.imports import ImportsService from geoapi.services.vectors import SHAPEFILE_FILE_ADDITIONAL_FILES -import geoapi.services.point_cloud as pointcloud -from geoapi.tasks.lidar import ( - convert_to_potree, - check_point_cloud, - get_point_cloud_info, - PointCloudConversionException, -) from geoapi.db import create_task_session from geoapi.services.users import UserService from geoapi.utils.additional_file import AdditionalFile @@ -210,175 +202,6 @@ def import_file_from_tapis(userId: int, systemId: str, path: str, projectId: int raise -def _update_point_cloud_task( - database_session, pointCloudId: int, description: str = None, status: str = None -): - task = pointcloud.PointCloudService.get(database_session, pointCloudId).task - if description is not None: - task.description = description - if status is not None: - task.status = status - database_session.add(task) - database_session.commit() - - -def _handle_point_cloud_conversion_error( - pointCloudId, userId, files, error_description -): - with create_task_session() as session: - user = session.get(User, userId) - logger.exception( - f"point cloud:{pointCloudId} conversion failed for user:{user.username} and files:{files}. 
" - f"error: {error_description}" - ) - _update_point_cloud_task( - session, - pointCloudId, - description=error_description, - status=TaskStatus.FAILED, - ) - send_progress_update( - user, - current_task.request.id, - "error", - f"Processing failed for point cloud ({pointCloudId})!", - ) - - -@app.task(queue="heavy") -def import_point_clouds_from_tapis(userId: int, files, pointCloudId: int): - with create_task_session() as session: - user = session.get(User, userId) - client = TapisUtils(session, user) - - point_cloud = pointcloud.PointCloudService.get(session, pointCloudId) - celery_task_id = celery_uuid() - - logger.info( - f"point cloud:{pointCloudId} conversion started for user:{user.username} and files:{files}" - ) - - # this initial geoapi.model.Task setup should probably be moved out of the celery task and performed - # in the request processing (i.e. in ProjectPointCloudsFileImportResource) so that a task can be returned in the - # request. See https://jira.tacc.utexas.edu/browse/WG-85 - task = Task() - task.process_id = celery_task_id - task.status = TaskStatus.RUNNING - - point_cloud.task = task - session.add(point_cloud) - session.add(task) - session.commit() - - new_asset_files = [] - failed_message = None - for file in files: - _update_point_cloud_task( - session, - pointCloudId, - description="Importing file ({}/{})".format( - len(new_asset_files) + 1, len(files) - ), - ) - - send_progress_update(user, celery_task_id, "success", task.description) - - system_id = file["system"] - path = file["path"] - - try: - tmp_file = client.getFile(system_id, path) - tmp_file.filename = Path(path).name - file_path = ( - pointcloud.PointCloudService.putPointCloudInOriginalsFileDir( - point_cloud.path, tmp_file, tmp_file.filename - ) - ) - tmp_file.close() - - # save file path as we might need to delete it if there is a problem - new_asset_files.append(file_path) - - # check if file is okay - check_point_cloud(file_path) - - except InvalidCoordinateReferenceSystem: - logger.error( - f"Could not import point cloud file ( point cloud: {pointCloudId} , " - f"for user:{user.username} due to missing coordinate reference system: {system_id}:{path}" - ) - failed_message = ( - "Error importing {}: missing coordinate reference system".format( - path - ) - ) - except Exception as e: - logger.exception( - f"Could not import point cloud file for user:{user.username} point cloud: {pointCloudId}" - f"from tapis: {system_id}/{path} : {e}" - ) - failed_message = "Unknown error importing {}:{}".format(system_id, path) - - if failed_message: - for file_path in new_asset_files: - logger.info("removing {}".format(file_path)) - os.remove(file_path) - _update_point_cloud_task( - session, - pointCloudId, - description=failed_message, - status=TaskStatus.FAILED, - ) - send_progress_update(user, celery_task_id, "error", failed_message) - return - - # add to our files_info with these new files - point_cloud.files_info = (point_cloud.files_info or []) + get_point_cloud_info( - files - ) - - session.add(point_cloud) - session.commit() - - _update_point_cloud_task( - session, - pointCloudId, - description="Running potree converter", - status=TaskStatus.RUNNING, - ) - send_progress_update( - user, - celery_task_id, - "success", - "Running potree converter (for point cloud {})".format(pointCloudId), - ) - try: - # use potree converter to convert las to web-friendly format - # this operation is memory-intensive and time-consuming. 
- convert_to_potree(pointCloudId) - with create_task_session() as session: - user = session.get(User, userId) - logger.info( - f"point cloud:{pointCloudId} conversion completed for user:{user.username} and files:{files}" - ) - send_progress_update( - user, - celery_task_id, - "success", - "Completed potree converter (for point cloud {}).".format(pointCloudId), - ) - except PointCloudConversionException as e: - error_description = e.message - _handle_point_cloud_conversion_error( - pointCloudId, userId, files, error_description - ) - except Exception: - error_description = "Unknown error occurred" - _handle_point_cloud_conversion_error( - pointCloudId, userId, files, error_description - ) - - @app.task(rate_limit="5/s") def import_from_tapis( tenant_id: str, userId: int, systemId: str, path: str, projectId: int diff --git a/geoapi/tasks/lidar.py b/geoapi/tasks/lidar.py deleted file mode 100644 index ab3f7411..00000000 --- a/geoapi/tasks/lidar.py +++ /dev/null @@ -1,217 +0,0 @@ -import os -import uuid -import subprocess -import pathlib -import re -import shutil -import celery -from geoalchemy2.shape import from_shape - -from geoapi.log import logging -from geoapi.utils.lidar import getProj4, get_bounding_box_2d -from geoapi.utils import geometries -from geoapi.celery_app import app -from geoapi.db import create_task_session -from geoapi.models import Task, TaskStatus -from geoapi.utils.assets import ( - make_project_asset_dir, - get_asset_path, - get_asset_relative_path, -) - -logger = logging.getLogger(__file__) - - -def get_point_cloud_files(path): - """ - Get all point cloud files in a path - :param path: strings - :return: list of file paths of point cloud files - """ - from geoapi.services.point_cloud import PointCloudService - - input_files = [ - get_asset_path(path, file) - for file in os.listdir(path) - if pathlib.Path(file).suffix.lstrip(".").lower() - in PointCloudService.LIDAR_FILE_EXTENSIONS - ] - return input_files - - -def check_point_cloud(file_path: str) -> None: - """ - Check point cloud file that it has required info - :param file_path: str - :return: None - :raises InvalidCoordinateReferenceSystem: if file missing crs - """ - # TODO make this a check about if we have enough info ect. 
- getProj4(file_path) - - -def get_point_cloud_info(files: list) -> list: - """ - Get info from source files - """ - return [ - { - "original_system": file["system"], - "original_path": file["path"], - "name": os.path.basename(file["path"]), - } - for file in files - ] - - -class PointCloudProcessingTask(celery.Task): - def on_failure(self, exc, task_id, args, kwargs, einfo): - logger.info("Task ({}, point cloud {}) failed: {}".format(task_id, args, exc)) - with create_task_session() as session: - failed_task = session.query(Task).filter(Task.process_id == task_id).first() - failed_task.status = "FAILED" - failed_task.description = "" - session.add(failed_task) - session.commit() - - -class PointCloudConversionException(Exception): - def __init__(self, message="Unknown error"): - self.message = message - super().__init__(self.message) - - -def run_potree_converter( - pointCloudId, - path_to_original_point_clouds, - path_temp_processed_point_cloud_path, - conversion_parameters=None, -): - """Run potree converter as external process""" - command = [ - "/opt/PotreeConverter/build/PotreeConverter", - "--verbose", - "-i", - path_to_original_point_clouds, - "-o", - path_temp_processed_point_cloud_path, - "--overwrite", - "--generate-page", - "index", - ] - if conversion_parameters: - command.extend(conversion_parameters.split()) - logger.info( - "Processing point cloud (#{}). command:{}".format( - pointCloudId, " ".join(command) - ) - ) - subprocess.run(command, check=True, capture_output=True, text=True) - - -@app.task(bind=True, base=PointCloudProcessingTask) -def convert_to_potree(self, pointCloudId: int) -> None: - """ - Use the potree converter to convert a LAS/LAZ file to potree format - - Note: this operation is memory-intensive and time-consuming. Large LAS files (>8 Gb) can use >50gb of memory. - - if process killed (e.g. due to memory constraints), PointCloudTaskException is raised - - :param pointCloudId: int - :param userId: int - :return: None - :raises PointCloudTaskException: if conversion fails - """ - from geoapi.models import Feature, FeatureAsset - from geoapi.services.point_cloud import PointCloudService - - with create_task_session() as session: - point_cloud = PointCloudService.get(session, pointCloudId) - conversion_parameters = point_cloud.conversion_parameters - path_to_original_point_clouds = get_asset_path( - point_cloud.path, PointCloudService.ORIGINAL_FILES_DIR - ) - path_temp_processed_point_cloud_path = get_asset_path( - point_cloud.path, PointCloudService.PROCESSED_DIR - ) - - input_files = get_point_cloud_files(path_to_original_point_clouds) - - outline = get_bounding_box_2d(input_files) - - try: - run_potree_converter( - pointCloudId, - path_to_original_point_clouds, - path_temp_processed_point_cloud_path, - conversion_parameters, - ) - except subprocess.CalledProcessError as e: - error_description = "Point cloud conversion failed" - if e.returncode == -9: # SIGKILL; most likely ran out of memory - error_description += "; process killed due to insufficient memory" - logger.exception( - f"Processing point cloud failed (point_cloud:{pointCloudId} " - f"path_to_original_point_clouds:{path_to_original_point_clouds} )." 
- ) - raise PointCloudConversionException(error_description) - - with create_task_session() as session: - point_cloud = PointCloudService.get(session, pointCloudId) - # Create preview viewer html (with no menu and now nsf logo) - with open( - os.path.join(path_temp_processed_point_cloud_path, "preview.html"), "w+" - ) as preview: - with open( - os.path.join(path_temp_processed_point_cloud_path, "index.html"), "r" - ) as viewer: - content = viewer.read() - content = re.sub( - r"
", "", content, flags=re.DOTALL - ) - content = content.replace( - "viewer.toggleSidebar()", "$('.potree_menu_toggle').hide()" - ) - preview.write(content) - - if point_cloud.feature_id: - feature = point_cloud.feature - else: - feature = Feature() - feature.project_id = point_cloud.project_id - - asset_uuid = uuid.uuid4() - base_filepath = make_project_asset_dir(point_cloud.project_id) - asset_path = os.path.join(base_filepath, str(asset_uuid)) - - # Grab first file as we will associate the FeatureAsset with just one file - first_file = point_cloud.files_info[0] if point_cloud.files_info else {} - original_system = first_file.get("original_system") - original_path = first_file.get("original_path") - - fa = FeatureAsset( - uuid=asset_uuid, - asset_type="point_cloud", - path=get_asset_relative_path(asset_path), - display_path=point_cloud.description, - feature=feature, - original_system=original_system, - original_path=original_path, - current_system=original_system, - current_path=original_path, - ) - feature.assets.append(fa) - point_cloud.feature = feature - - feature.the_geom = from_shape(geometries.convert_3D_2D(outline), srid=4326) - point_cloud.task.status = TaskStatus.COMPLETED - point_cloud.task.description = "" - - point_cloud_asset_path = get_asset_path(feature.assets[0].path) - session.add(point_cloud) - session.add(feature) - session.commit() - - shutil.rmtree(point_cloud_asset_path, ignore_errors=True) - shutil.move(path_temp_processed_point_cloud_path, point_cloud_asset_path) diff --git a/geoapi/tasks/point_cloud.py b/geoapi/tasks/point_cloud.py new file mode 100644 index 00000000..4d5e5bda --- /dev/null +++ b/geoapi/tasks/point_cloud.py @@ -0,0 +1,387 @@ +import os +import uuid +import subprocess +import pathlib +import re +import shutil +from geoalchemy2.shape import from_shape +import celery + + +from geoapi.log import logging + +from geoapi.tasks.utils import GeoAPITask, send_progress_update +from geoapi.celery_app import app +from geoapi.db import create_task_session +from geoapi.models import Task, TaskStatus, User +from geoapi.utils.assets import ( + make_project_asset_dir, + get_asset_path, + get_asset_relative_path, +) +from geoapi.utils.external_apis import TapisUtils +from geoapi.utils.point_cloud import getProj4, get_bounding_box_2d +from geoapi.utils.geometries import convert_3D_2D +from geoapi.exceptions import InvalidCoordinateReferenceSystem +from geoapi.services.point_cloud import PointCloudService + + +logger = logging.getLogger(__file__) + + +def get_point_cloud_files(path): + """ + Get all point cloud files in a path + :param path: strings + :return: list of file paths of point cloud files + """ + from geoapi.services.point_cloud import PointCloudService + + input_files = [ + get_asset_path(path, file) + for file in os.listdir(path) + if pathlib.Path(file).suffix.lstrip(".").lower() + in PointCloudService.POINT_CLOUD_FILE_EXTENSIONS + ] + return input_files + + +def check_point_cloud(file_path: str) -> None: + """ + Check point cloud file that it has required info + :param file_path: str + :return: None + :raises InvalidCoordinateReferenceSystem: if file missing crs + """ + # TODO make this a check about if we have enough info ect. 
+ getProj4(file_path) + + +def get_point_cloud_info(files: list) -> list: + """ + Get info from source files + """ + return [ + { + "original_system": file["system"], + "original_path": file["path"], + "name": os.path.basename(file["path"]), + } + for file in files + ] + + +class PointCloudConversionException(Exception): + def __init__(self, message="Unknown error"): + self.message = message + super().__init__(self.message) + + +def run_potree_converter( + pointCloudId, + path_to_original_point_clouds, + path_temp_processed_point_cloud_path, + conversion_parameters=None, +): + """Run potree converter as external process""" + command = [ + "/opt/PotreeConverter/build/PotreeConverter", + "--verbose", + "-i", + path_to_original_point_clouds, + "-o", + path_temp_processed_point_cloud_path, + "--overwrite", + "--generate-page", + "index", + ] + if conversion_parameters: + command.extend(conversion_parameters.split()) + logger.info( + "Processing point cloud (#{}). command:{}".format( + pointCloudId, " ".join(command) + ) + ) + subprocess.run(command, check=True, capture_output=True, text=True) + + +@app.task(bind=True, base=GeoAPITask) +def convert_to_potree(self, pointCloudId: int) -> None: + """ + Use the potree converter to convert a LAS/LAZ file to potree format + + Note: this operation is memory-intensive and time-consuming. Large LAS files (>8 Gb) can use >50gb of memory. + + if process killed (e.g. due to memory constraints), PointCloudTaskException is raised + + :param pointCloudId: int + :param userId: int + :return: None + :raises PointCloudTaskException: if conversion fails + """ + from geoapi.models import Feature, FeatureAsset + from geoapi.services.point_cloud import PointCloudService + + with create_task_session() as session: + point_cloud = PointCloudService.get(session, pointCloudId) + conversion_parameters = point_cloud.conversion_parameters + path_to_original_point_clouds = get_asset_path( + point_cloud.path, PointCloudService.ORIGINAL_FILES_DIR + ) + path_temp_processed_point_cloud_path = get_asset_path( + point_cloud.path, PointCloudService.PROCESSED_DIR + ) + + input_files = get_point_cloud_files(path_to_original_point_clouds) + + outline = get_bounding_box_2d(input_files) + + try: + run_potree_converter( + pointCloudId, + path_to_original_point_clouds, + path_temp_processed_point_cloud_path, + conversion_parameters, + ) + except subprocess.CalledProcessError as e: + error_description = "Point cloud conversion failed" + if e.returncode == -9: # SIGKILL; most likely ran out of memory + error_description += "; process killed due to insufficient memory" + logger.exception( + f"Processing point cloud failed (point_cloud:{pointCloudId} " + f"path_to_original_point_clouds:{path_to_original_point_clouds} )." + ) + raise PointCloudConversionException(error_description) + + with create_task_session() as session: + point_cloud = PointCloudService.get(session, pointCloudId) + # Create preview viewer html (with no menu and now nsf logo) + with open( + os.path.join(path_temp_processed_point_cloud_path, "preview.html"), "w+" + ) as preview: + with open( + os.path.join(path_temp_processed_point_cloud_path, "index.html"), "r" + ) as viewer: + content = viewer.read() + content = re.sub( + r"
", "", content, flags=re.DOTALL + ) + content = content.replace( + "viewer.toggleSidebar()", "$('.potree_menu_toggle').hide()" + ) + preview.write(content) + + if point_cloud.feature_id: + feature = point_cloud.feature + else: + feature = Feature() + feature.project_id = point_cloud.project_id + + asset_uuid = uuid.uuid4() + base_filepath = make_project_asset_dir(point_cloud.project_id) + asset_path = os.path.join(base_filepath, str(asset_uuid)) + + # Grab first file as we will associate the FeatureAsset with just one file + first_file = point_cloud.files_info[0] if point_cloud.files_info else {} + original_system = first_file.get("original_system") + original_path = first_file.get("original_path") + + fa = FeatureAsset( + uuid=asset_uuid, + asset_type="point_cloud", + path=get_asset_relative_path(asset_path), + display_path=point_cloud.description, + feature=feature, + original_system=original_system, + original_path=original_path, + current_system=original_system, + current_path=original_path, + ) + feature.assets.append(fa) + point_cloud.feature = feature + + feature.the_geom = from_shape(convert_3D_2D(outline), srid=4326) + point_cloud.task.status = TaskStatus.COMPLETED + point_cloud.task.description = "" + + point_cloud_asset_path = get_asset_path(feature.assets[0].path) + session.add(point_cloud) + session.add(feature) + session.commit() + + shutil.rmtree(point_cloud_asset_path, ignore_errors=True) + shutil.move(path_temp_processed_point_cloud_path, point_cloud_asset_path) + + +def _update_point_cloud_task( + database_session, pointCloudId: int, description: str = None, status: str = None +): + task = PointCloudService.get(database_session, pointCloudId).task + if description is not None: + task.description = description + if status is not None: + task.status = status + database_session.add(task) + database_session.commit() + + +def _handle_point_cloud_conversion_error( + pointCloudId, userId, files, error_description +): + with create_task_session() as session: + user = session.get(User, userId) + logger.exception( + f"point cloud:{pointCloudId} conversion failed for user:{user.username} and files:{files}. " + f"error: {error_description}" + ) + _update_point_cloud_task( + session, + pointCloudId, + description=error_description, + status=TaskStatus.FAILED, + ) + send_progress_update( + user, + celery.current_task.request.id, + "error", + f"Processing failed for point cloud ({pointCloudId})!", + ) + + +@app.task(queue="heavy") +def import_point_clouds_from_tapis(userId: int, files, pointCloudId: int): + """ + Imports additional point cloud files from Tapis into an existing + point cloud record. Each file is fetched, validated, and saved before + triggering Potree conversion. Progress and errors are reflected in the + associated Task and sent to the user. Typically used for a single + LAZ file per point cloud, but supports multiple files when needed. + """ + with create_task_session() as session: + user = session.get(User, userId) + client = TapisUtils(session, user) + + point_cloud = PointCloudService.get(session, pointCloudId) + celery_task_id = celery.uuid() + + logger.info( + f"point cloud:{pointCloudId} conversion started for user:{user.username} and files:{files}" + ) + + # this initial geoapi.model.Task setup should probably be moved out of the celery task and performed + # in the request processing (i.e. in ProjectPointCloudsFileImportResource) so that a task can be returned in the + # request. 
See https://jira.tacc.utexas.edu/browse/WG-85 + task = Task() + task.process_id = celery_task_id + task.status = TaskStatus.RUNNING + + point_cloud.task = task + session.add(point_cloud) + session.add(task) + session.commit() + + new_asset_files = [] + failed_message = None + for file in files: + _update_point_cloud_task( + session, + pointCloudId, + description="Importing file ({}/{})".format( + len(new_asset_files) + 1, len(files) + ), + ) + + send_progress_update(user, celery_task_id, "success", task.description) + + system_id = file["system"] + path = file["path"] + + try: + tmp_file = client.getFile(system_id, path) + tmp_file.filename = pathlib.Path(path).name + file_path = PointCloudService.putPointCloudInOriginalsFileDir( + point_cloud.path, tmp_file, tmp_file.filename + ) + tmp_file.close() + + # save file path as we might need to delete it if there is a problem + new_asset_files.append(file_path) + + # check if file is okay + check_point_cloud(file_path) + + except InvalidCoordinateReferenceSystem: + logger.error( + f"Could not import point cloud file ( point cloud: {pointCloudId} , " + f"for user:{user.username} due to missing coordinate reference system: {system_id}:{path}" + ) + failed_message = ( + "Error importing {}: missing coordinate reference system".format( + path + ) + ) + except Exception as e: + logger.exception( + f"Could not import point cloud file for user:{user.username} point cloud: {pointCloudId}" + f"from tapis: {system_id}/{path} : {e}" + ) + failed_message = "Unknown error importing {}:{}".format(system_id, path) + + if failed_message: + for file_path in new_asset_files: + logger.info("removing {}".format(file_path)) + os.remove(file_path) + _update_point_cloud_task( + session, + pointCloudId, + description=failed_message, + status=TaskStatus.FAILED, + ) + send_progress_update(user, celery_task_id, "error", failed_message) + return + + # add to our files_info with these new files + point_cloud.files_info = (point_cloud.files_info or []) + get_point_cloud_info( + files + ) + + session.add(point_cloud) + session.commit() + + _update_point_cloud_task( + session, + pointCloudId, + description="Running potree converter", + status=TaskStatus.RUNNING, + ) + send_progress_update( + user, + celery_task_id, + "success", + "Running potree converter (for point cloud {})".format(pointCloudId), + ) + try: + # use potree converter to convert las to web-friendly format + # this operation is memory-intensive and time-consuming. 
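+        # Called directly (not with .delay()), so the conversion runs inside
+        # this same heavy-queue task; a PointCloudConversionException raised
+        # here is handled by the except clause below.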
+ convert_to_potree(pointCloudId) + with create_task_session() as session: + user = session.get(User, userId) + logger.info( + f"point cloud:{pointCloudId} conversion completed for user:{user.username} and files:{files}" + ) + send_progress_update( + user, + celery_task_id, + "success", + "Completed potree converter (for point cloud {}).".format(pointCloudId), + ) + except PointCloudConversionException as e: + error_description = e.message + _handle_point_cloud_conversion_error( + pointCloudId, userId, files, error_description + ) + except Exception: + error_description = "Unknown error occurred" + _handle_point_cloud_conversion_error( + pointCloudId, userId, files, error_description + ) diff --git a/geoapi/tasks/utils.py b/geoapi/tasks/utils.py index f58610a9..b992cb5f 100644 --- a/geoapi/tasks/utils.py +++ b/geoapi/tasks/utils.py @@ -1,8 +1,48 @@ import requests + +import celery + from geoapi.log import logger from geoapi.models import Task, TaskStatus, User from geoapi.settings import settings from geoapi.utils.client_backend import get_deployed_geoapi_url +from geoapi.db import create_task_session + + +class GeoAPITask(celery.Task): + """ + Base Celery task for GeoAPI workflows. + + This task provides a safety-net failure handler that updates the GeoAPI + Task database record when an unexpected exception escapes the task body. + """ + + abstract = True # don't register this as a concrete Celery task + + def on_failure(self, exc, task_id, args, kwargs, einfo): + logger.error(f"[Task Failure] {self.name} ({task_id}): {exc}") + + # NOTE: If retryable tasks are introduced, handle celery.exceptions.Retry + # here so retries do not trigger FAILED status updates. + + try: + with create_task_session() as session: + task = session.query(Task).filter(Task.process_id == task_id).first() + + if not task: + return + + # Only mark FAILED if the task didn't explicitly set some other terminal state + if task.status != TaskStatus.FAILED: + task.status = TaskStatus.FAILED + task.last_message = str(exc) + session.add(task) + session.commit() + + except Exception as error: + logger.exception( + f"[Task Failure] Error updating Task status for {task_id}: {error}" + ) def send_progress_update(user: User, task_id: str, status: str, message: str) -> None: diff --git a/geoapi/tests/api_tests/test_point_cloud_routes.py b/geoapi/tests/api_tests/test_point_cloud_routes.py index b8f71cec..5a4ac04c 100644 --- a/geoapi/tests/api_tests/test_point_cloud_routes.py +++ b/geoapi/tests/api_tests/test_point_cloud_routes.py @@ -78,8 +78,8 @@ def test_delete_point_cloud( assert point_cloud is None -@patch("geoapi.tasks.external_data.import_point_clouds_from_tapis") -def test_import_lidar_tapis( +@patch("geoapi.tasks.point_cloud.import_point_clouds_from_tapis") +def test_import_point_cloud_tapis( import_point_clouds_from_tapis_mock, test_client, projects_fixture, @@ -96,7 +96,7 @@ def test_import_lidar_tapis( import_point_clouds_from_tapis_mock.delay.assert_called_once() -def test_import_lidar_tapis_wrong_file( +def test_import_point_cloud_tapis_wrong_file( test_client, projects_fixture, point_cloud_fixture, db_session ): u1 = db_session.get(User, 1) diff --git a/geoapi/tests/api_tests/test_point_cloud_service.py b/geoapi/tests/api_tests/test_point_cloud_service.py index b10a9148..4f60151f 100644 --- a/geoapi/tests/api_tests/test_point_cloud_service.py +++ b/geoapi/tests/api_tests/test_point_cloud_service.py @@ -8,13 +8,16 @@ from geoapi.models import User, Feature, FeatureAsset, PointCloud from geoapi.utils.assets import 
get_project_asset_dir, get_asset_path from geoapi.celery_app import app -from geoapi.tasks.external_data import import_point_clouds_from_tapis +from geoapi.tasks.point_cloud import import_point_clouds_from_tapis POINT_CLOUD_DATA = { "description": "description", "conversion_parameters": "--scale 2.0", } +# Mock all web hooks for these tests +pytestmark = pytest.mark.usefixtures("mock_task_update_webhook") + @pytest.fixture(scope="function") def celery_task_always_eager(): @@ -55,7 +58,7 @@ def test_delete_point_cloud(projects_fixture, db_session): @pytest.mark.worker -@patch("geoapi.tasks.external_data.TapisUtils") +@patch("geoapi.tasks.point_cloud.TapisUtils") def test_delete_point_cloud_feature( MockTapisUtils, celery_task_always_eager, diff --git a/geoapi/tests/conftest.py b/geoapi/tests/conftest.py index 6e63c2ae..ca98f3df 100644 --- a/geoapi/tests/conftest.py +++ b/geoapi/tests/conftest.py @@ -539,7 +539,7 @@ def import_from_tapis_mock(): @pytest.fixture(scope="function") def convert_to_potree_mock(): with patch( - "geoapi.services.point_cloud.convert_to_potree" + "geoapi.tasks.point_cloud.convert_to_potree" ) as mock_convert_to_potree: class FakeAsyncResult: diff --git a/geoapi/tests/external_data_tests/test_external_data.py b/geoapi/tests/external_data_tests/test_external_data.py index 14849f77..69f3b7c4 100644 --- a/geoapi/tests/external_data_tests/test_external_data.py +++ b/geoapi/tests/external_data_tests/test_external_data.py @@ -8,11 +8,11 @@ from geoapi.db import create_task_session from geoapi.tasks.external_data import ( import_from_tapis, - import_point_clouds_from_tapis, refresh_projects_watch_content, refresh_projects_watch_users, get_additional_files, ) +from geoapi.tasks.point_cloud import import_point_clouds_from_tapis from geoapi.utils.features import is_member_of_rapp_project_folder from geoapi.utils.external_apis import TapisFileListing, SystemUser from geoapi.utils.assets import get_project_asset_dir, get_asset_path @@ -341,7 +341,7 @@ def test_external_data_rapp_missing_geospatial_metadata( @pytest.mark.worker -@patch("geoapi.tasks.external_data.TapisUtils") +@patch("geoapi.tasks.point_cloud.TapisUtils") def test_import_point_clouds_from_tapis( MockTapisUtils, user1, @@ -397,8 +397,8 @@ def test_import_point_clouds_from_tapis( @pytest.mark.worker -@patch("geoapi.tasks.external_data.check_point_cloud") -@patch("geoapi.tasks.external_data.TapisUtils") +@patch("geoapi.tasks.point_cloud.check_point_cloud") +@patch("geoapi.tasks.point_cloud.TapisUtils") def test_import_point_clouds_from_tapis_check_point_cloud_missing_crs( MockTapisUtils, check_mock, @@ -432,8 +432,8 @@ def test_import_point_clouds_from_tapis_check_point_cloud_missing_crs( @pytest.mark.worker -@patch("geoapi.tasks.external_data.check_point_cloud") -@patch("geoapi.tasks.external_data.TapisUtils") +@patch("geoapi.tasks.point_cloud.check_point_cloud") +@patch("geoapi.tasks.point_cloud.TapisUtils") def test_import_point_clouds_from_tapis_check_point_cloud_unknown( MockTapisUtils, check_mock, @@ -469,8 +469,8 @@ def test_import_point_clouds_from_tapis_check_point_cloud_unknown( @pytest.mark.worker -@patch("geoapi.tasks.external_data.convert_to_potree") -@patch("geoapi.tasks.external_data.TapisUtils") +@patch("geoapi.tasks.point_cloud.convert_to_potree") +@patch("geoapi.tasks.point_cloud.TapisUtils") def test_import_point_clouds_from_tapis_conversion_error( MockTapisUtils, convert_mock, @@ -503,8 +503,8 @@ def test_import_point_clouds_from_tapis_conversion_error( @pytest.mark.worker 
-@patch("geoapi.tasks.external_data.TapisUtils") -@patch("geoapi.tasks.lidar.run_potree_converter") +@patch("geoapi.tasks.point_cloud.TapisUtils") +@patch("geoapi.tasks.point_cloud.run_potree_converter") def test_import_point_clouds_from_tapis_conversion_error_due_to_memory_sigterm( mock_run_potree_converter, MockTapisUtils, diff --git a/geoapi/tests/utils_tests/test_lidar.py b/geoapi/tests/utils_tests/test_point_cloud.py similarity index 97% rename from geoapi/tests/utils_tests/test_lidar.py rename to geoapi/tests/utils_tests/test_point_cloud.py index 208366d6..62995302 100644 --- a/geoapi/tests/utils_tests/test_lidar.py +++ b/geoapi/tests/utils_tests/test_point_cloud.py @@ -1,5 +1,5 @@ import pytest -from geoapi.utils.lidar import getProj4, get_bounding_box_2d +from geoapi.utils.point_cloud import getProj4, get_bounding_box_2d from geoapi.exceptions import InvalidCoordinateReferenceSystem diff --git a/geoapi/utils/lidar.py b/geoapi/utils/point_cloud.py similarity index 100% rename from geoapi/utils/lidar.py rename to geoapi/utils/point_cloud.py From 462ad57f54063cba10328ff56fd5c8f901d8d7e9 Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Mon, 17 Nov 2025 12:36:39 -0600 Subject: [PATCH 21/28] Fix formatting --- geoapi/tests/conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/geoapi/tests/conftest.py b/geoapi/tests/conftest.py index ca98f3df..3d8209c0 100644 --- a/geoapi/tests/conftest.py +++ b/geoapi/tests/conftest.py @@ -538,9 +538,7 @@ def import_from_tapis_mock(): @pytest.fixture(scope="function") def convert_to_potree_mock(): - with patch( - "geoapi.tasks.point_cloud.convert_to_potree" - ) as mock_convert_to_potree: + with patch("geoapi.tasks.point_cloud.convert_to_potree") as mock_convert_to_potree: class FakeAsyncResult: id = "b53fdb0a-de1a-11e9-b641-0242c0a80004" From 13a4b36e0c030d6461e120d6acc3fd45b4d03795 Mon Sep 17 00:00:00 2001 From: Nathan Franklin Date: Wed, 19 Nov 2025 12:55:48 -0600 Subject: [PATCH 22/28] Add designsafe_project_id to what we track for files for frontend use --- geoapi/custom/designsafe/project.py | 47 +++++++++--- geoapi/custom/designsafe/project_users.py | 31 +------- geoapi/custom/designsafe/utils.py | 75 +++++++++++++++++++ ...c0a0b4_add_designsafe_project_id_column.py | 69 +++++++++++++++++ geoapi/models/file_location_tracking_mixin.py | 6 ++ geoapi/models/project.py | 2 + geoapi/schema/projects.py | 3 + geoapi/tasks/file_location_check.py | 75 ++++++++++++++++--- 8 files changed, 260 insertions(+), 48 deletions(-) create mode 100644 geoapi/custom/designsafe/utils.py create mode 100644 geoapi/migrations/versions/20251119_1534-9ff599c0a0b4_add_designsafe_project_id_column.py diff --git a/geoapi/custom/designsafe/project.py b/geoapi/custom/designsafe/project.py index b827fc85..e08ad753 100644 --- a/geoapi/custom/designsafe/project.py +++ b/geoapi/custom/designsafe/project.py @@ -3,6 +3,11 @@ from geoapi.settings import settings from geoapi.custom.designsafe.default_basemap_layers import default_layers from geoapi.models import User, Project +from geoapi.custom.designsafe.utils import ( + get_designsafe_project_data, + is_designsafe_project, + extract_project_uuid, +) def on_project_creation(database_session, user: User, project: Project): @@ -47,21 +52,43 @@ def on_project_creation(database_session, user: User, project: Project): f" project:{project.id} project_uuid:{project.uuid} " ) - try: - # add metadata to DS projects (i.e. 
only when system_id starts with "project-" - if project.system_id.startswith("project-"): + if is_designsafe_project(project.system_id): + try: + logger.debug( + f"Determining related DS project id for user:{user.username}" + f" project:{project.id} project_uuid:{project.uuid} " + ) + + designsafe_project_data = get_designsafe_project_data( + database_session=database_session, + user=user, + system_id=project.system_id, + ) + project.designsafe_project_id = designsafe_project_data["projectId"] + + database_session.add(project) + database_session.commit() + except Exception: + logger.exception( + f"Problem determining related DS project id for user:{user.username}" + f" project:{project.id} project_uuid:{project.uuid} " + ) + + try: logger.debug( f"Adding metadata for user:{user.username}" f" project:{project.id} project_uuid:{project.uuid} " ) + + # add metadata to DS projects (i.e. only when system_id starts with "project-" update_designsafe_project_hazmapper_metadata( user, project, add_project=True ) - except Exception: - logger.exception( - f"Problem adding metadata for user:{user.username}" - f" project:{project.id} project_uuid:{project.uuid} " - ) + except Exception: + logger.exception( + f"Problem adding metadata for user:{user.username}" + f" project:{project.id} project_uuid:{project.uuid} " + ) def on_project_deletion(database_session, user: User, project: Project): @@ -86,7 +113,7 @@ def on_project_deletion(database_session, user: User, project: Project): try: # remove metadata for DS projects (i.e. only when system_id starts with "project-" - if project.system_id.startswith("project-"): + if is_designsafe_project(project.system_id): logger.debug( f"Removing metadata for user:{user.username}" f" during deletion of project:{project.id} project_uuid:{project.uuid} " @@ -104,7 +131,7 @@ def on_project_deletion(database_session, user: User, project: Project): def update_designsafe_project_hazmapper_metadata( user: User, project: Project, add_project: bool ): - designsafe_uuid = project.system_id[len("project-") :] + designsafe_uuid = extract_project_uuid(project.system_id) from geoapi.utils.external_apis import get_session diff --git a/geoapi/custom/designsafe/project_users.py b/geoapi/custom/designsafe/project_users.py index 6db3a934..5f399bb6 100644 --- a/geoapi/custom/designsafe/project_users.py +++ b/geoapi/custom/designsafe/project_users.py @@ -1,30 +1,5 @@ from geoapi.exceptions import GetUsersForProjectNotSupported -from geoapi.log import logger -from geoapi.settings import settings -from geoapi.models import User - - -def get_project_data(database_session, user: User, system_id: str) -> dict: - """ - Get project data for a certain system - - :param database_session: db session - :param user: user to use when querying system from DesignSafe - :param system_id: str - :return: project data - - """ - from geoapi.utils.external_apis import ApiUtils - - logger.debug(f"Getting project metadata for system:{system_id}") - - uuid = system_id[len("project-") :] - client = ApiUtils(database_session, user, settings.DESIGNSAFE_URL) - resp = client.get(f"/api/projects/v2/{uuid}/") - resp.raise_for_status() - - project = resp.json()["baseProject"]["value"] - return project +from geoapi.custom.designsafe.utils import get_designsafe_project_data def _is_designsafe_project_guest_user(user): @@ -52,7 +27,9 @@ def get_system_users(database_session, user, system_id: str): f"System:{system_id} is not a project so unable to get users" ) - project = get_project_data(database_session, user, 
system_id)
+    project = get_designsafe_project_data(
+        database_session=database_session, user=user, system_id=system_id
+    )
     users = {}
     for u in project["users"]:
diff --git a/geoapi/custom/designsafe/utils.py b/geoapi/custom/designsafe/utils.py
new file mode 100644
index 00000000..42d1517a
--- /dev/null
+++ b/geoapi/custom/designsafe/utils.py
@@ -0,0 +1,75 @@
+from geoapi.log import logger
+from geoapi.settings import settings
+from geoapi.models import User
+
+
+DESIGNSAFE_PROJECT_ID_CACHE = {}
+
+
+def extract_project_uuid(system_id: str) -> str | None:
+    if system_id.startswith("project-"):
+        return system_id.removeprefix("project-")
+    return None
+
+
+def is_designsafe_project(system_id: str) -> bool:
+    return system_id.startswith("project-")
+
+
+def get_designsafe_project_data(database_session, user: User, system_id: str) -> dict:
+    """
+    Get project data for a certain system
+
+    :param database_session: db session
+    :param user: user to use when querying system from DesignSafe
+    :param system_id: str
+    :return: project data
+
+    """
+    from geoapi.utils.external_apis import ApiUtils
+
+    logger.debug(f"Getting project metadata for system:{system_id}")
+
+    uuid = system_id[len("project-") :]
+    client = ApiUtils(database_session, user, settings.DESIGNSAFE_URL)
+    resp = client.get(f"/api/projects/v2/{uuid}/")
+    resp.raise_for_status()
+
+    project = resp.json()["baseProject"]["value"]
+    return project
+
+
+def get_designsafe_project_id(
+    database_session, user: User, system_id: str
+) -> str | None:
+    """Get designsafe project id (i.e. PRJ-XXXX)
+
+    If not a DS project, or if an error occurs, we return None
+    """
+    if not is_designsafe_project(system_id):
+        logger.debug(f"System {system_id} is not a DesignSafe project, skipping")
+        return None
+
+    # Check cache first
+    if system_id in DESIGNSAFE_PROJECT_ID_CACHE:
+        return DESIGNSAFE_PROJECT_ID_CACHE[system_id]
+
+    # Not in cache - fetch from DesignSafe API
+    try:
+        designsafe_project_data = get_designsafe_project_data(
+            database_session=database_session, user=user, system_id=system_id
+        )
+        designsafe_project_id = designsafe_project_data["projectId"]
+
+        # Cache it
+        DESIGNSAFE_PROJECT_ID_CACHE[system_id] = designsafe_project_id
+
+        logger.debug(
+            f"Fetched and cached project_id for system {system_id}: {designsafe_project_id}"
+        )
+        return designsafe_project_id
+    except Exception as e:
+        logger.exception(
+            f"Failed to fetch DesignSafe project_id for system {system_id}: {e}"
+        )
+        return None
diff --git a/geoapi/migrations/versions/20251119_1534-9ff599c0a0b4_add_designsafe_project_id_column.py b/geoapi/migrations/versions/20251119_1534-9ff599c0a0b4_add_designsafe_project_id_column.py
new file mode 100644
index 00000000..befcfee5
--- /dev/null
+++ b/geoapi/migrations/versions/20251119_1534-9ff599c0a0b4_add_designsafe_project_id_column.py
@@ -0,0 +1,69 @@
+"""add_designsafe_project_id_column
+
+Revision ID: 9ff599c0a0b4
+Revises: 898a088d94d3
+Create Date: 2025-11-19 15:34:51.471586
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "9ff599c0a0b4"
+down_revision = "898a088d94d3"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.add_column(
+        "feature_assets",
+        sa.Column(
+            "designsafe_project_id",
+            sa.String(),
+            nullable=True,
+            comment="DesignSafe project ID, e.g. 'PRJ-1234', associated with current_system/original_system",
+        ),
+    )
+    op.create_index(
+        op.f("ix_feature_assets_designsafe_project_id"),
+        "feature_assets",
+        ["designsafe_project_id"],
+        unique=False,
+    )
+    op.add_column(
+        "projects", sa.Column("designsafe_project_id", sa.String(), nullable=True)
+    )
+    op.add_column(
+        "tile_servers",
+        sa.Column(
+            "designsafe_project_id",
+            sa.String(),
+            nullable=True,
+            comment="DesignSafe project ID, e.g. 'PRJ-1234', associated with current_system/original_system",
+        ),
+    )
+    op.create_index(
+        op.f("ix_tile_servers_designsafe_project_id"),
+        "tile_servers",
+        ["designsafe_project_id"],
+        unique=False,
+    )
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.drop_index(
+        op.f("ix_tile_servers_designsafe_project_id"), table_name="tile_servers"
+    )
+    op.drop_column("tile_servers", "designsafe_project_id")
+    op.drop_column("projects", "designsafe_project_id")
+    op.drop_index(
+        op.f("ix_feature_assets_designsafe_project_id"), table_name="feature_assets"
+    )
+    op.drop_column("feature_assets", "designsafe_project_id")
+    # ### end Alembic commands ###
diff --git a/geoapi/models/file_location_tracking_mixin.py b/geoapi/models/file_location_tracking_mixin.py
index a7f6fc4b..07c1dd9a 100644
--- a/geoapi/models/file_location_tracking_mixin.py
+++ b/geoapi/models/file_location_tracking_mixin.py
@@ -39,6 +39,12 @@ class FileLocationTrackingMixin:
         comment="Current Tapis system (updated if file is published or moved)",
     )
 
+    designsafe_project_id = mapped_column(
+        String(),
+        nullable=True,
+        index=True,
+        comment="DesignSafe project ID, e.g. 'PRJ-1234', associated with current_system/original_system",
+    )
     is_on_public_system = mapped_column(
         Boolean(),
         nullable=True,
diff --git a/geoapi/models/project.py b/geoapi/models/project.py
index d8f55dd6..ca6000bc 100644
--- a/geoapi/models/project.py
+++ b/geoapi/models/project.py
@@ -33,6 +33,8 @@ class Project(Base):
     id = mapped_column(Integer, primary_key=True)
     uuid = mapped_column(UUID(as_uuid=True), default=uuid.uuid4, nullable=False)
     tenant_id = mapped_column(String, nullable=False)
+    # associated DesignSafe project
+    designsafe_project_id = mapped_column(String, nullable=True)
     # associated tapis system id
     system_id = mapped_column(String, nullable=True)
     # associated tapis system path
diff --git a/geoapi/schema/projects.py b/geoapi/schema/projects.py
index 1b944863..087a0905 100644
--- a/geoapi/schema/projects.py
+++ b/geoapi/schema/projects.py
@@ -27,6 +27,7 @@ class FeatureAssetModel(BaseModel):
     display_path: str | None = None
     current_system: str | None = None
     current_path: str | None = None
+    designsafe_project_id: str | None = None
     last_public_system_check: datetime | None = None
     is_on_public_system: bool | None = None
 
@@ -88,6 +89,7 @@ class ProjectDTO(SQLAlchemyDTO[Project]):
             "system_file",
             "system_id",
             "system_path",
+            "designsafe_project_id",
             "deletable",
         },
     )
@@ -191,6 +193,7 @@ class TileServerDTO(SQLAlchemyDTO[TileServer]):
             "original_path",
             "current_system",
             "current_path",
+            "designsafe_project_id",
             "is_on_public_system",
             "last_public_system_check",
         }
diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py
index 82b67738..58102194 100644
--- a/geoapi/tasks/file_location_check.py
+++ b/geoapi/tasks/file_location_check.py
@@ -19,6 +19,11 @@
 from geoapi.log import logger
 from geoapi.tasks.utils import update_task_and_send_progress_update
 from geoapi.settings import settings
+from geoapi.custom.designsafe.utils import (
+    get_designsafe_project_id,
+    extract_project_uuid,
+    is_designsafe_project,
+)
 
 DESIGNSAFE_PUBLISHED_SYSTEM = "designsafe.storage.published"
 
@@ -26,20 +31,9 @@
     DESIGNSAFE_PUBLISHED_SYSTEM,
     "designsafe.storage.community",
 ]
-
 BATCH_SIZE = 500  # Commit every 500 items
 
 
-def extract_project_uuid(system_id: str) -> str | None:
-    if system_id.startswith("project-"):
-        return system_id.removeprefix("project-")
-    return None
-
-
-def is_designsafe_project(system_id: str) -> bool:
-    return system_id.startswith("project-")
-
-
 def build_file_index_from_tapis(
     client, system_id: str, path: str = "/"
 ) -> dict[str, list[str]]:
@@ -266,6 +260,52 @@ def fix_and_backfill_feature_asset(
     logger.debug(f"Completed checking asset={asset.id}")
 
 
+def check_and_update_designsafe_project_id(
+    item: Union[FeatureAsset, TileServer],
+    session,
+    user,
+) -> None:
+    """
+    Check and update the designsafe_project_id for an item based on its current_system.
+    Uses module-level caching to minimize API calls to DesignSafe.
+
+    Args:
+        item: FeatureAsset or TileServer to update
+        session: Database session
+        user: User for API calls
+    """
+
+    if item.designsafe_project_id:
+        logger.debug(f"Nothing to do as item has designsafe_project_id")
+        return
+
+    # Check if we can derive PRJ from published projects path
+    if (
+        item.original_system == "designsafe.storage.published"
+        and item.original_path
+        and item.original_path.startswith("/published-data/PRJ-")
+    ):
+        parts = item.original_path.split("/")
+        item.designsafe_project_id = parts[2]  # PRJ-XXXX
+        return
+
+    # Determine which system to use
+    system_to_check = item.original_system or item.current_system
+
+    if not system_to_check:
+        logger.debug(f"No system to check for {type(item).__name__}={item.id}")
+        return
+
+    if not is_designsafe_project(system_to_check):
+        logger.debug(f"System {system_to_check} is not a DesignSafe project, skipping")
+        return
+
+    designsafe_project_id = get_designsafe_project_id(session, user, system_to_check)
+    if designsafe_project_id:
+        logger.debug(f"Setting item's designsafe_project_id to {designsafe_project_id}")
+        item.designsafe_project_id = designsafe_project_id
+
+
 def check_and_update_public_system(
     item: Union[FeatureAsset, TileServer],
     published_file_tree_cache: Dict,
@@ -423,6 +463,14 @@ def check_and_update_file_locations(user_id: int, project_id: int):
         )
         logger.info(f"Indexed {len(unpublished_project_file_index)} files")
 
+    if not project.designsafe_project_id:
+        designsafe_project_id = get_designsafe_project_id(
+            database_session=session, user=user, system_id=project.system_id
+        )
+        if designsafe_project_id:
+            project.designsafe_project_id = designsafe_project_id
+            session.add(project)
+
     # Process each feature asset
     for i, asset in enumerate(feature_assets):
         try:
@@ -445,6 +493,11 @@
                 project_system_file_tree=unpublished_project_file_index,
             )
 
+            # Check and update DesignSafe project ID
+            check_and_update_designsafe_project_id(
+                session=session, item=asset, user=user
+            )
+
             # Check and update public system status
             check_and_update_public_system(
                 asset, published_file_tree_cache, session, user
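Aside: the published-path shortcut in check_and_update_designsafe_project_id above relies on
published asset paths having the shape /published-data/PRJ-XXXX/<rest of path>. A minimal
standalone sketch of that derivation (the helper name is hypothetical, not part of the patch):

    def derive_project_id_from_published_path(original_path: str | None) -> str | None:
        # Published DesignSafe paths are assumed to look like
        # /published-data/PRJ-1234/some/file.jpg; element 2 of the split is the PRJ id.
        if original_path and original_path.startswith("/published-data/PRJ-"):
            return original_path.split("/")[2]
        return None

    assert derive_project_id_from_published_path("/published-data/PRJ-1234/a.jpg") == "PRJ-1234"
    assert derive_project_id_from_published_path("/other/path/a.jpg") is None

For published assets this avoids the DesignSafe API round trip entirely.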
From 06823f2c91839473d1c07891eee09832f86ddebd Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Thu, 20 Nov 2025 10:39:48 -0600
Subject: [PATCH 23/28] Update project id on tile layers

---
 geoapi/models/file_location_tracking_mixin.py | 4 +---
 geoapi/schema/projects.py                     | 1 +
 geoapi/tasks/file_location_check.py           | 7 ++++++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/geoapi/models/file_location_tracking_mixin.py b/geoapi/models/file_location_tracking_mixin.py
index 07c1dd9a..ae04db5e 100644
--- a/geoapi/models/file_location_tracking_mixin.py
+++ b/geoapi/models/file_location_tracking_mixin.py
@@ -11,8 +11,8 @@ class FileLocationTrackingMixin:
     - Current location: where the file is now (may differ if published/moved)
     - Public accessibility: whether current location is publicly accessible
     - Last check timestamp: when accessibility was last verified
+    - DesignSafe project id, e.g. 'PRJ-1234', associated with the system
     """
-
     original_system = mapped_column(
         String(),
         nullable=True,
@@ -25,7 +25,6 @@
         index=True,  # Adding index for TileServer too
         comment="Original file path on the source system",
     )
-
     current_path = mapped_column(
         String(),
         nullable=True,
@@ -38,7 +37,6 @@
         index=True,
         comment="Current Tapis system (updated if file is published or moved)",
     )
-
     designsafe_project_id = mapped_column(
         String(),
         nullable=True,
diff --git a/geoapi/schema/projects.py b/geoapi/schema/projects.py
index 087a0905..6be317ac 100644
--- a/geoapi/schema/projects.py
+++ b/geoapi/schema/projects.py
@@ -220,6 +220,7 @@ class TileServerModel(BaseModel):
     current_system: str | None = None
     current_path: str | None = None
     is_on_public_system: bool | None = None
+    designsafe_project_id: str | None = None
     last_public_system_check: datetime | None = None
diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py
index 58102194..bd52844f 100644
--- a/geoapi/tasks/file_location_check.py
+++ b/geoapi/tasks/file_location_check.py
@@ -276,7 +276,7 @@ def check_and_update_designsafe_project_id(
     """
 
     if item.designsafe_project_id:
-        logger.debug(f"Nothing to do as item has designsafe_project_id")
+        logger.debug(f"Nothing to do as item has designsafe_project_id:{item.designsafe_project_id}")
         return
@@ -546,6 +546,11 @@
             # Backfill current_system/current_path if missing
             backfill_current_location(tile_server)
 
+            # Check and update DesignSafe project ID
+            check_and_update_designsafe_project_id(
+                session=session, item=tile_server, user=user
+            )
+
             # Check and update public system status
             check_and_update_public_system(
                 tile_server, published_file_tree_cache, session, user
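Aside: DESIGNSAFE_PROJECT_ID_CACHE (introduced two patches back) is a plain module-level dict,
so it lives for the lifetime of the worker process and is never evicted; PRJ ids are stable, so
that is a reasonable trade-off. If bounding the cache ever matters, a functools.lru_cache over
a function keyed only by system_id would behave similarly. A sketch with a hypothetical fetch
function (the real lookup also needs a DB session and user, which is one reason the patch keys
a dict explicitly):

    from functools import lru_cache

    def fetch_project_id_from_api(system_id: str) -> str | None:
        # Stand-in for the DesignSafe API round trip in get_designsafe_project_id
        return "PRJ-1234" if system_id.startswith("project-") else None

    @lru_cache(maxsize=256)  # bounded, unlike a bare dict
    def cached_project_id(system_id: str) -> str | None:
        return fetch_project_id_from_api(system_id)

    assert cached_project_id("project-abc") == "PRJ-1234"  # miss -> fetch
    assert cached_project_id("project-abc") == "PRJ-1234"  # hit  -> cached

One behavioral difference: unlike the patch's dict (which skips caching on the exception path),
lru_cache would also cache failed (None) lookups.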
From c4da3a6b45f07979c4b1123aa7dcf96961a11943 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Thu, 20 Nov 2025 10:57:28 -0600
Subject: [PATCH 24/28] Fix unit tests

---
 geoapi/custom/designsafe/utils.py              | 4 +++-
 geoapi/models/file_location_tracking_mixin.py  | 1 +
 geoapi/tasks/file_location_check.py            | 4 +++-
 geoapi/tests/custom/designsafe/test_project.py | 7 ++++---
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/geoapi/custom/designsafe/utils.py b/geoapi/custom/designsafe/utils.py
index 42d1517a..1e16ec69 100644
--- a/geoapi/custom/designsafe/utils.py
+++ b/geoapi/custom/designsafe/utils.py
@@ -13,7 +13,9 @@ def extract_project_uuid(system_id: str) -> str | None:
 
 
 def is_designsafe_project(system_id: str) -> bool:
-    return system_id.startswith("project-")
+    if system_id and system_id.startswith("project-"):
+        return True
+    return False
 
 
 def get_designsafe_project_data(database_session, user: User, system_id: str) -> dict:
diff --git a/geoapi/models/file_location_tracking_mixin.py b/geoapi/models/file_location_tracking_mixin.py
index ae04db5e..c9821843 100644
--- a/geoapi/models/file_location_tracking_mixin.py
+++ b/geoapi/models/file_location_tracking_mixin.py
@@ -13,6 +13,7 @@ class FileLocationTrackingMixin:
     - Last check timestamp: when accessibility was last verified
     - DesignSafe project id, e.g. 'PRJ-1234', associated with the system
     """
+
     original_system = mapped_column(
         String(),
         nullable=True,
diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py
index bd52844f..a981b471 100644
--- a/geoapi/tasks/file_location_check.py
+++ b/geoapi/tasks/file_location_check.py
@@ -276,7 +276,9 @@ def check_and_update_designsafe_project_id(
     """
 
     if item.designsafe_project_id:
-        logger.debug(f"Nothing to do as item has designsafe_project_id:{item.designsafe_project_id}")
+        logger.debug(
+            f"Nothing to do as item has designsafe_project_id:{item.designsafe_project_id}"
+        )
         return
 
     # Check if we can derive PRJ from published projects path
diff --git a/geoapi/tests/custom/designsafe/test_project.py b/geoapi/tests/custom/designsafe/test_project.py
index f7f45a38..96498e3c 100644
--- a/geoapi/tests/custom/designsafe/test_project.py
+++ b/geoapi/tests/custom/designsafe/test_project.py
@@ -20,7 +20,8 @@ def test_on_project_creation(
     metadata_url = settings.DESIGNSAFE_URL + f"/api/projects/v2/{designsafe_uuid}/"
 
     requests_mock.get(
-        metadata_url, json={"baseProject": {"value": {"hazmapperMaps": []}}}
+        metadata_url,
+        json={"baseProject": {"value": {"hazmapperMaps": [], "projectId": "PRJ-1234"}}},
    )
     requests_mock.post(metadata_url)
 
@@ -28,8 +29,8 @@
 
     assert len(TileService.getTileServers(db_session, projectId=project.id)) == 2
 
-    assert len(requests_mock.request_history) == 3
-    update_metadata_request = requests_mock.request_history[2]
+    assert len(requests_mock.request_history) == 4
+    update_metadata_request = requests_mock.request_history[3]
     assert json.loads(update_metadata_request.text) == {
         "patchMetadata": {
             "hazmapperMaps": [

From 8883ce24af16f237824c8aa45266bc4f800d796d Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Mon, 24 Nov 2025 09:40:55 -0600
Subject: [PATCH 25/28] Add missing properties to response

---
 geoapi/routes/file_location_status.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/geoapi/routes/file_location_status.py b/geoapi/routes/file_location_status.py
index 96ee3b5d..797890eb 100644
--- a/geoapi/routes/file_location_status.py
+++ b/geoapi/routes/file_location_status.py
@@ -36,7 +36,7 @@ class FileLocationSummary(BaseModel):
     total_features: int
     features_with_assets: int  # Checked
-    features_without_assets: int  # Not checked ✗
+    features_without_assets: int  # Not checked
     total_tile_servers: int
     internal_tile_servers: int  # Checked
     external_tile_servers: int  # Not checked
@@ -225,6 +225,9 @@ def get_files_status(
             project_id=public_access_check.project_id,
             started_at=public_access_check.started_at,
             completed_at=public_access_check.completed_at,
+            total_files=public_access_check.total_files,
+            files_checked=public_access_check.files_checked,
+            files_failed=public_access_check.files_failed,
             task=(
                 TaskModel.model_validate(public_access_check.task)
                 if public_access_check.task

From 262cf2ef23d91c05983cbae89932218acbc69d84 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Tue, 25 Nov 2025 14:58:14 -0600
Subject: [PATCH 26/28] Fix issue with setting path/system for some assets

---
 geoapi/routes/projects.py     | 2 +-
 geoapi/services/features.py   | 7 ++++++-
 geoapi/tasks/external_data.py | 1 +
 3 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/geoapi/routes/projects.py b/geoapi/routes/projects.py
index 085c3ebf..cb66608e 100644
--- a/geoapi/routes/projects.py
+++ b/geoapi/routes/projects.py
@@ -434,7 +434,7 @@ def add_feature_asset(
         )
     )
     return FeaturesService.createFeatureAssetFromTapis(
-        db_session, u, project_id, feature_id, system_id, path
+        db_session, u, project_id, feature_id, systemId=system_id, path=path
     )
diff --git a/geoapi/services/features.py b/geoapi/services/features.py
index bf4e04e7..915887db 100644
--- a/geoapi/services/features.py
+++ b/geoapi/services/features.py
@@ -615,7 +615,12 @@ def createFeatureAssetFromTapis(
         fileObj = client.getFile(systemId, path)
         fileObj.filename = pathlib.Path(path).name
         return FeaturesService.createFeatureAsset(
-            database_session, projectId, featureId, fileObj, original_path=path
+            database_session,
+            projectId,
+            featureId,
+            fileObj,
+            original_path=path,
+            original_system=systemId,
         )
 
     @staticmethod
diff --git a/geoapi/tasks/external_data.py b/geoapi/tasks/external_data.py
index fd46a9ea..70444149 100644
--- a/geoapi/tasks/external_data.py
+++ b/geoapi/tasks/external_data.py
@@ -316,6 +316,7 @@ def import_files_recursively_from_path(
                     projectId,
                     feat.id,
                     tmp_file,
+                    original_system=systemId,
                     original_path=item_system_path,
                 )
             except:  # noqa: E722
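Aside: the call-site changes in the patch above switch several same-typed string arguments to
keywords. A tiny illustration of why (the function and values here are hypothetical): with
positional arguments nothing stops system and path from being swapped silently, while keywords
make the call order-independent:

    def create_asset(project_id: int, feature_id: int, system_id: str, path: str) -> dict:
        return {"project": project_id, "feature": feature_id,
                "system": system_id, "path": path}

    # Positional: an accidental swap of the two strings still "works" -- wrongly.
    oops = create_asset(1, 2, "/data/file.jpg", "project-abc")
    # Keyword: explicit and order-independent.
    ok = create_asset(1, 2, system_id="project-abc", path="/data/file.jpg")
    assert oops["system"] != ok["system"]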
From 539a59b57a6c0aba79f5d67c76d4a482c3cfd389 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Tue, 25 Nov 2025 14:58:30 -0600
Subject: [PATCH 27/28] Improve logging and error handling

---
 geoapi/tasks/file_location_check.py | 36 ++++++++++++++++++++++-------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/geoapi/tasks/file_location_check.py b/geoapi/tasks/file_location_check.py
index a981b471..177ed887 100644
--- a/geoapi/tasks/file_location_check.py
+++ b/geoapi/tasks/file_location_check.py
@@ -47,12 +47,16 @@ def build_file_index_from_tapis(
 
     try:
         listing = client.listing(system_id, path)
-    except TapisListingError:
-        # TODO handle 404 better as we might just be missing published project
-        logger.exception(
-            f"Unable to list {system_id}/{path}. If 404, Project might not be published yet"
-        )
-        return file_index
+    except TapisListingError as e:
+        if system_id == DESIGNSAFE_PUBLISHED_SYSTEM and e.response.status_code == 404:
+            # Not found implies that the project has not been published yet
+            logger.debug(f"Unable to list {system_id}/{path} as not published yet")
+            return file_index
+        else:
+            logger.exception(
+                f"Unable to list {system_id}/{path}"
+            )
+            return file_index
 
     for item in listing:
         if item.type == "dir" and not str(item.path).endswith(".Trash"):
@@ -75,14 +79,27 @@
     return file_index
 
 
-def get_project_id(user, system_id):
-    """Get project id (PRJ-123) for system attached to system_id"""
+def get_project_id(user, system_id) -> str | None:
+    """Get project id (PRJ-123) for system attached to system_id.
+
+    Returns None if project is not found (e.g., deleted in pprd).
+    """
     designsafe_uuid = extract_project_uuid(system_id)
+    if designsafe_uuid is None:
+        return None
+
     client = get_session(user)
     response = client.get(
         settings.DESIGNSAFE_URL + f"/api/projects/v2/{designsafe_uuid}/"
     )
+
+    if response.status_code == 404 or response.status_code == 403:
+        logger.warning(
+            f"Project not found or not accessible for system_id={system_id} (uuid={designsafe_uuid})"
+        )
+        return None
+
     response.raise_for_status()
     return response.json()["baseProject"]["value"]["projectId"]
@@ -103,6 +120,9 @@
 
     designsafe_prj = get_project_id(user, system_id)
 
+    if designsafe_prj is None:
+        return None
+
     tapis_client = TapisUtils(session, user)
 
     # Calculate the file tree of published DS project associated with this map project and place in a cache

From b7b2aeb34c0a0f86b11c0082d6e5939a5966a365 Mon Sep 17 00:00:00 2001
From: Nathan Franklin
Date: Mon, 8 Dec 2025 16:41:47 -0600
Subject: [PATCH 28/28] Fix migration with renaming constraint for auth.user_id

Make the unique constraint rename conditional to support:
- Old deployments, which still have PostgreSQL's default name from that
  time (auth_user_id_key)
- Fresh DBs and deployments that already ran it, which have the
  SQLAlchemy naming_convention name (uq_auth_user_id)

Without this fix, a fresh DB would fail on this migration with:
constraint "auth_user_id_key" of relation "auth" does not exist
---
 ...4287_tileserver__add_kind_internal_uuid.py | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/geoapi/migrations/versions/20251001_0210-60f005164287_tileserver__add_kind_internal_uuid.py b/geoapi/migrations/versions/20251001_0210-60f005164287_tileserver__add_kind_internal_uuid.py
index a3887c29..755096f9 100644
--- a/geoapi/migrations/versions/20251001_0210-60f005164287_tileserver__add_kind_internal_uuid.py
+++ b/geoapi/migrations/versions/20251001_0210-60f005164287_tileserver__add_kind_internal_uuid.py
@@ -11,6 +11,7 @@
 
 from alembic import op
 import sqlalchemy as sa
+from sqlalchemy import inspect
 
 
 # revision identifiers, used by Alembic.
@@ -23,9 +24,16 @@
 
 def upgrade():
     # ### commands auto generated by Alembic - please adjust! ###
-    # Rename UC to match naming_convention; behavior unchanged. This was autogenerated
-    op.drop_constraint(op.f("auth_user_id_key"), "auth", type_="unique")
-    op.create_unique_constraint(op.f("uq_auth_user_id"), "auth", ["user_id"])
+    conn = op.get_bind()
+    inspector = inspect(conn)
+
+    unique_constraints = inspector.get_unique_constraints("auth")
+    constraint_names = {uc["name"] for uc in unique_constraints}
+
+    if "auth_user_id_key" in constraint_names:
+        # Old DB that needs to rename to match newer naming_convention
+        op.drop_constraint(op.f("auth_user_id_key"), "auth", type_="unique")
+        op.create_unique_constraint(op.f("uq_auth_user_id"), "auth", ["user_id"])
 
     # Add TileServer fields
     op.add_column(
@@ -68,11 +76,13 @@ def downgrade():
     op.drop_column("tile_servers", "uuid")
     op.drop_column("tile_servers", "internal")
     op.drop_column("tile_servers", "kind")
-    op.drop_constraint(op.f("uq_auth_user_id"), "auth", type_="unique")
-    op.create_unique_constraint(
-        op.f("auth_user_id_key"),
-        "auth",
-        ["user_id"],
-        postgresql_nulls_not_distinct=False,
-    )
+
+    # No need to drop/rename constraints as we aren't "downgrading" to older DB version.
+    # op.drop_constraint(op.f("uq_auth_user_id"), "auth", type_="unique")
+    # op.create_unique_constraint(
+    #     op.f("auth_user_id_key"),
+    #     "auth",
+    #     ["user_id"],
+    #     postgresql_nulls_not_distinct=False,
+    # )
     # ### end Alembic commands ###
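Aside: the conditional rename in PATCH 28 is a general pattern for making a migration tolerant
of divergent deployment histories: inspect the live schema first, then act only when the legacy
name is actually present. A self-contained sketch of that inspection step, run against an
in-memory SQLite database purely for illustration (the table and constraint names mirror the
migration, but nothing here touches geoapi):

    import sqlalchemy as sa
    from sqlalchemy import inspect

    engine = sa.create_engine("sqlite://")
    with engine.begin() as conn:
        conn.execute(sa.text(
            "CREATE TABLE auth ("
            "  id INTEGER PRIMARY KEY,"
            "  user_id INTEGER,"
            "  CONSTRAINT uq_auth_user_id UNIQUE (user_id)"
            ")"
        ))

    names = {uc["name"] for uc in inspect(engine).get_unique_constraints("auth")}
    if "auth_user_id_key" in names:
        ...  # old deployment: would drop/recreate, as in upgrade() above
    else:
        assert "uq_auth_user_id" in names  # already renamed (or a fresh DB): skip

Because the check runs inside the migration itself, both old deployments and fresh databases
converge on the same constraint name without a separate repair script.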