Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ services:
depends_on:
- mongo
# Run ingest with data dir mounted to /data
command: ["uv", "run", "python", "/app/mongodb/ingest_data.py", "--mongo-uri", "mongodb://admin:root@mongo:27017", "--input", "/data"]
command: ["uv", "run", "python", "/app/mongodb/ingest_data.py", "--mongo-uri", "mongodb://admin:root@mongo:27017", "--input", "/data", "--clean"]

test:
# Use the same container image as the app service for consistency
Expand Down
34 changes: 0 additions & 34 deletions mongodb/gold-example.json

This file was deleted.

25 changes: 23 additions & 2 deletions mongodb/ingest_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,20 @@ def connect(self) -> None:
logger.error(f"Failed to connect to MongoDB: {e}")
sys.exit(1)

def clean_collections(self) -> None:
    """Drop pre-existing collections so ingestion starts from a clean slate.

    Only the 'entities' collection is managed at present. Whether a drop
    actually happened is logged either way. Any PyMongo failure is fatal:
    the error is logged and the process exits with status 1.
    """
    try:
        # Guard clause: nothing to do when the collection does not exist yet.
        if 'entities' not in self.db.list_collection_names():
            logger.info("No existing 'entities' collection found")
            return
        logger.info("Dropping existing 'entities' collection")
        self.db.entities.drop()
        logger.info("Successfully dropped 'entities' collection")
    except PyMongoError as e:
        # Treat a failed cleanup as unrecoverable rather than ingesting
        # on top of stale data.
        logger.error(f"Error dropping collections: {e}")
        sys.exit(1)

def load_schema(self) -> Dict:
"""Load the JSON schema from file."""
try:
Expand Down Expand Up @@ -82,7 +96,7 @@ def insert_entity(self, entity: Dict) -> Optional[str]:
if 'coordinates' in entity:
coordinates = entity['coordinates']
if isinstance(coordinates, dict) and 'latitude' in coordinates and 'longitude' in coordinates:
entity['coordinates'] = {
entity['geojson'] = {
Comment thread
shreddd marked this conversation as resolved.
'type': 'Point',
'coordinates': [coordinates['longitude'], coordinates['latitude']]
}
Expand All @@ -97,7 +111,7 @@ def insert_entity(self, entity: Dict) -> Optional[str]:
self.db.entities.create_index('data_type')

# Create 2dsphere index for geospatial queries on coordinates
self.db.entities.create_index([('coordinates', pymongo.GEOSPHERE)])
self.db.entities.create_index([('geojson', pymongo.GEOSPHERE)])
Copy link
Copy Markdown
Collaborator

@eecavanna eecavanna Jun 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am surprised to see indexes being created each time an entity gets inserted (as opposed to after all entities have been inserted).

Edit: Maybe these are effectively "no op"s when the index already exists.


# Insert with upsert to handle potential duplicates based on URI
result = self.db.entities.update_one(
Expand Down Expand Up @@ -167,6 +181,8 @@ def main():
help='Path or URL to the BERtron schema JSON file')
parser.add_argument('--input', required=True,
help='Path to the input JSON file or directory')
parser.add_argument('--clean', action='store_true',
help='Delete existing collections before ingesting new data')

args = parser.parse_args()

Expand All @@ -180,6 +196,11 @@ def main():
ingestor.connect()
ingestor.load_schema()

# Clean collections if requested
if args.clean:
logger.info("Clean flag enabled - removing existing collections")
ingestor.clean_collections()

total_stats = {
'processed': 0,
'valid': 0,
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ authors = [
]
description = " Cross-BER Data Integration"
readme = "README.md"
requires-python = ">=3.12,<3.14"
requires-python = ">=3.12.9,<3.14"
dependencies = [
"bertron-schema @ git+https://github.com/ber-data/bertron-schema.git",
"dtspy @ https://github.com/kbase/dtspy/archive/730828cff3924fc4b2215fe5c1b67bc04aad377f.tar.gz",
"fastapi[standard]>=0.115.12",
"jsonschema>=4.0.0",
Expand Down
Loading