From 75c77ca4c9208fc0c96c791e1dc32d2cc4205912 Mon Sep 17 00:00:00 2001 From: lesh Date: Tue, 27 May 2025 16:23:47 +0300 Subject: [PATCH 1/2] reformat line length 100 --- dimos/agents/agent.py | 97 +++- dimos/agents/agent_ctransformers_gguf.py | 12 +- dimos/agents/agent_huggingface_local.py | 42 +- dimos/agents/agent_huggingface_remote.py | 8 +- dimos/agents/claude_agent.py | 101 +++- dimos/agents/memory/base.py | 13 +- dimos/agents/memory/chroma_impl.py | 8 +- dimos/agents/memory/image_embedding.py | 4 +- dimos/agents/memory/spatial_vector_db.py | 49 +- dimos/agents/memory/visual_memory.py | 4 +- dimos/agents/planning_agent.py | 12 +- dimos/agents/prompt_builder/impl.py | 17 +- .../agents/tokenizer/huggingface_tokenizer.py | 4 +- dimos/agents/tokenizer/openai_tokenizer.py | 4 +- dimos/data/data_pipeline.py | 6 +- dimos/data/pointcloud.py | 25 +- dimos/data/segment.py | 12 +- dimos/environment/agent_environment.py | 4 +- dimos/environment/environment.py | 4 +- dimos/hardware/interface.py | 5 +- .../models/Detic/configs/BoxSup_ViLD_200e.py | 8 +- dimos/models/Detic/configs/Detic_ViLD_200e.py | 13 +- dimos/models/Detic/demo.py | 4 +- dimos/models/Detic/detic/config.py | 4 +- .../detic/data/custom_dataset_dataloader.py | 33 +- .../Detic/detic/data/custom_dataset_mapper.py | 24 +- .../detic/data/datasets/coco_zeroshot.py | 34 +- .../Detic/detic/data/datasets/imagenet.py | 14 +- .../data/datasets/lvis_22k_categories.py | 466 +++++++++++++++--- .../Detic/detic/data/datasets/lvis_v1.py | 21 +- .../Detic/detic/data/datasets/objects365.py | 20 +- dimos/models/Detic/detic/data/datasets/oid.py | 15 +- .../Detic/detic/data/datasets/register_oid.py | 8 +- dimos/models/Detic/detic/data/tar_dataset.py | 8 +- .../detic/evaluation/custom_coco_eval.py | 8 +- .../models/Detic/detic/evaluation/oideval.py | 35 +- .../modeling/backbone/swintransformer.py | 66 ++- .../Detic/detic/modeling/backbone/timm.py | 15 +- dimos/models/Detic/detic/modeling/debug.py | 71 ++- .../detic/modeling/meta_arch/custom_rcnn.py | 25 +- .../modeling/meta_arch/d2_deformable_detr.py | 32 +- .../modeling/roi_heads/detic_fast_rcnn.py | 80 ++- .../modeling/roi_heads/detic_roi_heads.py | 48 +- .../modeling/roi_heads/res5_roi_heads.py | 52 +- .../roi_heads/zero_shot_classifier.py | 10 +- .../Detic/detic/modeling/text/text_encoder.py | 14 +- dimos/models/Detic/detic/modeling/utils.py | 8 +- dimos/models/Detic/detic/predictor.py | 12 +- dimos/models/Detic/lazy_train_net.py | 8 +- dimos/models/Detic/predict.py | 4 +- .../data/custom_build_augmentation.py | 2 - .../data/custom_dataset_dataloader.py | 24 +- .../centernet/data/datasets/coco.py | 13 +- .../centernet/data/datasets/nuimages.py | 5 +- .../centernet/data/datasets/objects365.py | 5 +- .../centernet/modeling/backbone/bifpn.py | 12 +- .../centernet/modeling/backbone/bifpn_fcos.py | 25 +- .../centernet/modeling/backbone/dla.py | 93 +++- .../centernet/modeling/backbone/dlafpn.py | 87 +++- .../centernet/modeling/backbone/res2net.py | 20 +- .../CenterNet2/centernet/modeling/debug.py | 75 ++- .../modeling/dense_heads/centernet.py | 117 ++++- .../modeling/dense_heads/centernet_head.py | 15 +- .../centernet/modeling/layers/deform_conv.py | 9 +- .../modeling/meta_arch/centernet_detector.py | 4 +- .../modeling/roi_heads/custom_fast_rcnn.py | 16 +- .../modeling/roi_heads/custom_roi_heads.py | 20 +- .../Detic/third_party/CenterNet2/predictor.py | 16 +- .../CenterNet2/tools/analyze_model.py | 18 +- .../third_party/CenterNet2/tools/benchmark.py | 12 +- .../CenterNet2/tools/deploy/export_model.py | 4 +- .../CenterNet2/tools/lazyconfig_train_net.py | 8 +- .../CenterNet2/tools/lightning_train_net.py | 14 +- .../CenterNet2/tools/plain_train_net.py | 38 +- .../third_party/CenterNet2/tools/train_net.py | 12 +- .../Detic/third_party/CenterNet2/train_net.py | 27 +- .../third_party/Deformable-DETR/benchmark.py | 8 +- .../Deformable-DETR/datasets/coco.py | 17 +- .../Deformable-DETR/datasets/coco_eval.py | 12 +- .../Deformable-DETR/datasets/coco_panoptic.py | 20 +- .../datasets/data_prefetcher.py | 4 +- .../Deformable-DETR/datasets/panoptic_eval.py | 7 +- .../Deformable-DETR/datasets/samplers.py | 13 +- .../Deformable-DETR/datasets/transforms.py | 12 +- .../third_party/Deformable-DETR/engine.py | 16 +- .../Detic/third_party/Deformable-DETR/main.py | 110 ++++- .../Deformable-DETR/models/backbone.py | 12 +- .../Deformable-DETR/models/deformable_detr.py | 57 ++- .../models/deformable_transformer.py | 121 ++++- .../Deformable-DETR/models/matcher.py | 19 +- .../ops/functions/ms_deform_attn_func.py | 35 +- .../models/ops/modules/ms_deform_attn.py | 24 +- .../Deformable-DETR/models/ops/test.py | 18 +- .../models/position_encoding.py | 8 +- .../Deformable-DETR/models/segmentation.py | 50 +- .../Deformable-DETR/tools/launch.py | 10 +- .../third_party/Deformable-DETR/util/misc.py | 36 +- .../Deformable-DETR/util/plot_utils.py | 19 +- .../Detic/tools/create_imagenetlvis_json.py | 12 +- dimos/models/Detic/tools/create_lvis_21k.py | 4 +- .../models/Detic/tools/dump_clip_features.py | 10 +- dimos/models/Detic/tools/fix_o365_path.py | 4 +- dimos/models/Detic/tools/get_cc_tags.py | 10 +- .../Detic/tools/get_coco_zeroshot_oriorder.py | 4 +- .../tools/get_imagenet_21k_full_tar_json.py | 4 +- dimos/models/Detic/tools/merge_lvis_coco.py | 12 +- .../Detic/tools/preprocess_imagenet22k.py | 4 +- dimos/models/Detic/tools/remove_lvis_rare.py | 4 +- dimos/models/Detic/train_net.py | 30 +- dimos/models/depth/metric3d.py | 24 +- dimos/models/labels/llava-34b.py | 23 +- dimos/models/pointcloud/pointcloud_utils.py | 8 +- dimos/models/qwen/video_query.py | 17 +- dimos/models/segmentation/clipseg.py | 5 +- dimos/models/segmentation/sam.py | 8 +- dimos/models/segmentation/segment_utils.py | 4 +- dimos/perception/common/cuboid_fit.py | 34 +- .../perception/common/detection2d_tracker.py | 20 +- dimos/perception/common/export_tensorrt.py | 16 +- dimos/perception/common/ibvs.py | 8 +- dimos/perception/detection2d/detic_2d_det.py | 30 +- dimos/perception/detection2d/utils.py | 31 +- dimos/perception/detection2d/yolo_2d_det.py | 10 +- dimos/perception/object_detection_stream.py | 26 +- dimos/perception/object_tracker.py | 36 +- dimos/perception/person_tracker.py | 63 ++- .../perception/segmentation/image_analyzer.py | 13 +- dimos/perception/segmentation/sam_2d_seg.py | 42 +- dimos/perception/segmentation/utils.py | 17 +- dimos/perception/semantic_seg.py | 48 +- dimos/perception/spatial_perception.py | 37 +- dimos/perception/visual_servoing.py | 8 +- dimos/robot/global_planner/algo.py | 22 +- dimos/robot/global_planner/planner.py | 5 +- dimos/robot/local_planner/local_planner.py | 141 ++++-- .../robot/local_planner/vfh_local_planner.py | 28 +- dimos/robot/position_stream.py | 10 +- dimos/robot/recorder.py | 4 +- dimos/robot/robot.py | 4 +- dimos/robot/ros_command_queue.py | 31 +- dimos/robot/ros_control.py | 48 +- dimos/robot/ros_observable_topic.py | 17 +- dimos/robot/ros_transform.py | 57 ++- dimos/robot/test_ros_observable_topic.py | 4 +- dimos/robot/unitree/unitree_go2.py | 8 +- dimos/robot/unitree/unitree_skills.py | 110 ++++- dimos/robot/unitree_webrtc/connection.py | 6 +- dimos/robot/unitree_webrtc/testing/helpers.py | 7 +- .../unitree_webrtc/testing/test_multimock.py | 12 +- dimos/robot/unitree_webrtc/type/costmap.py | 8 +- dimos/robot/unitree_webrtc/type/lidar.py | 4 +- dimos/robot/unitree_webrtc/type/map.py | 8 +- dimos/robot/unitree_webrtc/type/test_lidar.py | 5 +- dimos/robot/unitree_webrtc/type/vector.py | 3 +- dimos/simulation/genesis/stream.py | 4 +- dimos/simulation/isaac/stream.py | 8 +- dimos/skills/navigation.py | 91 +++- dimos/skills/observe_stream.py | 11 +- dimos/skills/rest/rest.py | 8 +- dimos/skills/skills.py | 16 +- dimos/skills/speak.py | 4 +- dimos/skills/visual_navigation_skills.py | 17 +- dimos/stream/audio/node_key_recorder.py | 3 +- dimos/stream/audio/node_normalizer.py | 8 +- dimos/stream/audio/node_simulated.py | 17 +- dimos/stream/audio/node_volume_monitor.py | 4 +- dimos/stream/audio/volume.py | 4 +- dimos/stream/frame_processor.py | 6 +- dimos/stream/ros_video_provider.py | 8 +- dimos/stream/rtsp_video_provider.py | 45 +- dimos/stream/video_operators.py | 62 ++- dimos/stream/video_provider.py | 4 +- dimos/stream/video_providers/unitree.py | 12 +- dimos/types/costmap.py | 12 +- dimos/types/path.py | 4 +- dimos/types/position.py | 4 +- dimos/types/robot_location.py | 12 +- dimos/types/sample.py | 26 +- dimos/types/vector.py | 3 +- dimos/utils/extract_frames.py | 9 +- dimos/utils/logging_config.py | 4 +- dimos/utils/reactive.py | 4 +- dimos/utils/simple_controller.py | 4 +- dimos/utils/test_reactive.py | 41 +- dimos/web/dimos_interface/api/server.py | 28 +- dimos/web/fastapi_server.py | 19 +- dimos/web/flask_server.py | 12 +- dimos/web/robot_web_interface.py | 1 - 188 files changed, 3616 insertions(+), 977 deletions(-) diff --git a/dimos/agents/agent.py b/dimos/agents/agent.py index 6cfebd9950..c5d7195585 100644 --- a/dimos/agents/agent.py +++ b/dimos/agents/agent.py @@ -175,7 +175,9 @@ def __init__( self.image_detail: str = "low" self.max_input_tokens_per_request: int = max_input_tokens_per_request self.max_output_tokens_per_request: int = max_output_tokens_per_request - self.max_tokens_per_request: int = self.max_input_tokens_per_request + self.max_output_tokens_per_request + self.max_tokens_per_request: int = ( + self.max_input_tokens_per_request + self.max_output_tokens_per_request + ) self.rag_query_n: int = 4 self.rag_similarity_threshold: float = 0.45 self.frame_processor: Optional[FrameProcessor] = None @@ -200,10 +202,14 @@ def __init__( RxOps.map( lambda combined: { "query": combined[0], - "objects": combined[1] if len(combined) > 1 else "No object data available", + "objects": combined[1] + if len(combined) > 1 + else "No object data available", } ), - RxOps.map(lambda data: f"{data['query']}\n\nCurrent objects detected:\n{data['objects']}"), + RxOps.map( + lambda data: f"{data['query']}\n\nCurrent objects detected:\n{data['objects']}" + ), RxOps.do_action( lambda x: print(f"\033[34mEnriched query: {x.split(chr(10))[0]}\033[0m") or [print(f"\033[34m{line}\033[0m") for line in x.split(chr(10))[1:]] @@ -222,7 +228,9 @@ def __init__( # Define a query extractor for the merged stream query_extractor = lambda emission: (emission[0], emission[1][0]) self.disposables.add( - self.subscribe_to_image_processing(self.merged_stream, query_extractor=query_extractor) + self.subscribe_to_image_processing( + self.merged_stream, query_extractor=query_extractor + ) ) else: # If no merged stream, fall back to individual streams @@ -250,7 +258,9 @@ def _get_rag_context(self) -> Tuple[str, str]: and condensed results (for use in the prompt). """ results = self.agent_memory.query( - query_texts=self.query, n_results=self.rag_query_n, similarity_threshold=self.rag_similarity_threshold + query_texts=self.query, + n_results=self.rag_query_n, + similarity_threshold=self.rag_similarity_threshold, ) formatted_results = "\n".join( f"Document ID: {doc.id}\nMetadata: {doc.metadata}\nContent: {doc.page_content}\nScore: {score}\n" @@ -334,7 +344,12 @@ def _tooling_callback(message, messages, response_message, skill_library: SkillL result = skill_library.call(name, **args) logger.info(f"Function Call Results: {result}") new_messages.append( - {"role": "tool", "tool_call_id": tool_call.id, "content": str(result), "name": name} + { + "role": "tool", + "tool_call_id": tool_call.id, + "content": str(result), + "name": name, + } ) if has_called_tools: logger.info("Sending Another Query.") @@ -347,7 +362,9 @@ def _tooling_callback(message, messages, response_message, skill_library: SkillL return None if response_message.tool_calls is not None: - return _tooling_callback(response_message, messages, response_message, self.skill_library) + return _tooling_callback( + response_message, messages, response_message, self.skill_library + ) return None def _observable_query( @@ -373,7 +390,9 @@ def _observable_query( try: self._update_query(incoming_query) _, condensed_results = self._get_rag_context() - messages = self._build_prompt(base64_image, dimensions, override_token_limit, condensed_results) + messages = self._build_prompt( + base64_image, dimensions, override_token_limit, condensed_results + ) # logger.debug(f"Sending Query: {messages}") logger.info("Sending Query.") response_message = self._send_query(messages) @@ -391,13 +410,19 @@ def _observable_query( final_msg = ( response_message.parsed if hasattr(response_message, "parsed") and response_message.parsed - else (response_message.content if hasattr(response_message, "content") else response_message) + else ( + response_message.content + if hasattr(response_message, "content") + else response_message + ) ) observer.on_next(final_msg) self.response_subject.on_next(final_msg) else: response_message_2 = self._handle_tooling(response_message, messages) - final_msg = response_message_2 if response_message_2 is not None else response_message + final_msg = ( + response_message_2 if response_message_2 is not None else response_message + ) if isinstance(final_msg, BaseModel): # TODO: Test final_msg = str(final_msg.content) observer.on_next(final_msg) @@ -440,7 +465,9 @@ def _log_response_to_file(self, response, output_dir: str = None): file.write(f"{self.dev_name}: {response}\n") logger.info(f"LLM Response [{self.dev_name}]: {response}") - def subscribe_to_image_processing(self, frame_observable: Observable, query_extractor=None) -> Disposable: + def subscribe_to_image_processing( + self, frame_observable: Observable, query_extractor=None + ) -> Disposable: """Subscribes to a stream of video frames for processing. This method sets up a subscription to process incoming video frames. @@ -480,7 +507,9 @@ def _process_frame(emission) -> Observable: RxOps.subscribe_on(self.pool_scheduler), MyOps.print_emission(id="D", **print_emission_args), MyVidOps.with_jpeg_export( - self.frame_processor, suffix=f"{self.dev_name}_frame_", save_limit=_MAX_SAVED_FRAMES + self.frame_processor, + suffix=f"{self.dev_name}_frame_", + save_limit=_MAX_SAVED_FRAMES, ), MyOps.print_emission(id="E", **print_emission_args), MyVidOps.encode_image(), @@ -562,7 +591,9 @@ def _process_query(query) -> Observable: return just(query).pipe( MyOps.print_emission(id="Pr A", **print_emission_args), RxOps.flat_map( - lambda query: create(lambda observer, _: self._observable_query(observer, incoming_query=query)) + lambda query: create( + lambda observer, _: self._observable_query(observer, incoming_query=query) + ) ), MyOps.print_emission(id="Pr B", **print_emission_args), ) @@ -612,7 +643,9 @@ def get_response_observable(self) -> Observable: Observable: An observable that emits string responses from the agent. """ return self.response_subject.pipe( - RxOps.observe_on(self.pool_scheduler), RxOps.subscribe_on(self.pool_scheduler), RxOps.share() + RxOps.observe_on(self.pool_scheduler), + RxOps.subscribe_on(self.pool_scheduler), + RxOps.share(), ) def run_observable_query(self, query_text: str, **kwargs) -> Observable: @@ -631,7 +664,11 @@ def run_observable_query(self, query_text: str, **kwargs) -> Observable: Returns: Observable: An observable that emits the response as a string. """ - return create(lambda observer, _: self._observable_query(observer, incoming_query=query_text, **kwargs)) + return create( + lambda observer, _: self._observable_query( + observer, incoming_query=query_text, **kwargs + ) + ) def dispose_all(self): """Disposes of all active subscriptions managed by this agent.""" @@ -749,7 +786,9 @@ def __init__( self.response_model = response_model if response_model is not None else NOT_GIVEN self.model_name = model_name self.tokenizer = tokenizer or OpenAITokenizer(model_name=self.model_name) - self.prompt_builder = prompt_builder or PromptBuilder(self.model_name, tokenizer=self.tokenizer) + self.prompt_builder = prompt_builder or PromptBuilder( + self.model_name, tokenizer=self.tokenizer + ) self.rag_query_n = rag_query_n self.rag_similarity_threshold = rag_similarity_threshold self.image_detail = image_detail @@ -767,8 +806,14 @@ def __init__( def _add_context_to_memory(self): """Adds initial context to the agent's memory.""" context_data = [ - ("id0", "Optical Flow is a technique used to track the movement of objects in a video sequence."), - ("id1", "Edge Detection is a technique used to identify the boundaries of objects in an image."), + ( + "id0", + "Optical Flow is a technique used to track the movement of objects in a video sequence.", + ), + ( + "id1", + "Edge Detection is a technique used to identify the boundaries of objects in an image.", + ), ("id2", "Video is a sequence of frames captured at regular intervals."), ( "id3", @@ -805,7 +850,11 @@ def _send_query(self, messages: list) -> Any: model=self.model_name, messages=messages, response_format=self.response_model, - tools=(self.skill_library.get_tools() if self.skill_library is not None else NOT_GIVEN), + tools=( + self.skill_library.get_tools() + if self.skill_library is not None + else NOT_GIVEN + ), max_tokens=self.max_output_tokens_per_request, ) else: @@ -813,7 +862,11 @@ def _send_query(self, messages: list) -> Any: model=self.model_name, messages=messages, max_tokens=self.max_output_tokens_per_request, - tools=(self.skill_library.get_tools() if self.skill_library is not None else NOT_GIVEN), + tools=( + self.skill_library.get_tools() + if self.skill_library is not None + else NOT_GIVEN + ), ) response_message = response.choices[0].message if response_message is None: @@ -843,7 +896,9 @@ def stream_query(self, query_text: str) -> Observable: Returns: Observable: An observable that emits the response as a string. """ - return create(lambda observer, _: self._observable_query(observer, incoming_query=query_text)) + return create( + lambda observer, _: self._observable_query(observer, incoming_query=query_text) + ) # endregion OpenAIAgent Subclass (OpenAI-Specific Implementation) diff --git a/dimos/agents/agent_ctransformers_gguf.py b/dimos/agents/agent_ctransformers_gguf.py index 6b2dd878c2..32d6fc59ca 100644 --- a/dimos/agents/agent_ctransformers_gguf.py +++ b/dimos/agents/agent_ctransformers_gguf.py @@ -141,7 +141,9 @@ def __init__( self.tokenizer = CTransformersTokenizerAdapter(self.model) - self.prompt_builder = prompt_builder or PromptBuilder(self.model_name, tokenizer=self.tokenizer) + self.prompt_builder = prompt_builder or PromptBuilder( + self.model_name, tokenizer=self.tokenizer + ) self.max_output_tokens_per_request = max_output_tokens_per_request @@ -152,7 +154,9 @@ def __init__( # Ensure only one input stream is provided. if self.input_video_stream is not None and self.input_query_stream is not None: - raise ValueError("More than one input stream provided. Please provide only one input stream.") + raise ValueError( + "More than one input stream provided. Please provide only one input stream." + ) if self.input_video_stream is not None: logger.info("Subscribing to input video stream...") @@ -198,7 +202,9 @@ def stream_query(self, query_text: str) -> Subject: """ Creates an observable that processes a text query and emits the response. """ - return create(lambda observer, _: self._observable_query(observer, incoming_query=query_text)) + return create( + lambda observer, _: self._observable_query(observer, incoming_query=query_text) + ) # endregion HuggingFaceLLMAgent Subclass (HuggingFace-Specific Implementation) diff --git a/dimos/agents/agent_huggingface_local.py b/dimos/agents/agent_huggingface_local.py index 0ee9bfa940..14f970c3bc 100644 --- a/dimos/agents/agent_huggingface_local.py +++ b/dimos/agents/agent_huggingface_local.py @@ -98,10 +98,14 @@ def __init__( self.tokenizer = tokenizer or HuggingFaceTokenizer(self.model_name) - self.prompt_builder = prompt_builder or PromptBuilder(self.model_name, tokenizer=self.tokenizer) + self.prompt_builder = prompt_builder or PromptBuilder( + self.model_name, tokenizer=self.tokenizer + ) self.model = AutoModelForCausalLM.from_pretrained( - model_name, torch_dtype=torch.float16 if self.device == "cuda" else torch.float32, device_map=self.device + model_name, + torch_dtype=torch.float16 if self.device == "cuda" else torch.float32, + device_map=self.device, ) self.max_output_tokens_per_request = max_output_tokens_per_request @@ -113,7 +117,9 @@ def __init__( # Ensure only one input stream is provided. if self.input_video_stream is not None and self.input_query_stream is not None: - raise ValueError("More than one input stream provided. Please provide only one input stream.") + raise ValueError( + "More than one input stream provided. Please provide only one input stream." + ) if self.input_video_stream is not None: logger.info("Subscribing to input video stream...") @@ -142,21 +148,28 @@ def _send_query(self, messages: list) -> Any: # Tokenize the prompt print("Preparing model inputs...") - model_inputs = self.tokenizer.tokenizer([prompt_text], return_tensors="pt").to(self.model.device) + model_inputs = self.tokenizer.tokenizer([prompt_text], return_tensors="pt").to( + self.model.device + ) print("Model inputs prepared.") # Generate the response print("Generating response...") - generated_ids = self.model.generate(**model_inputs, max_new_tokens=self.max_output_tokens_per_request) + generated_ids = self.model.generate( + **model_inputs, max_new_tokens=self.max_output_tokens_per_request + ) # Extract the generated tokens (excluding the input prompt tokens) print("Processing generated output...") generated_ids = [ - output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) + output_ids[len(input_ids) :] + for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids) ] # Convert tokens back to text - response = self.tokenizer.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + response = self.tokenizer.tokenizer.batch_decode( + generated_ids, skip_special_tokens=True + )[0] print("Response successfully generated.") return response @@ -168,14 +181,21 @@ def _send_query(self, messages: list) -> Any: except Exception as e: # Log any other errors but continue execution - logger.warning(f"Error in chat template processing: {e}. Falling back to simple format.") + logger.warning( + f"Error in chat template processing: {e}. Falling back to simple format." + ) # Fallback approach for models without chat template support # This code runs if the try block above raises an exception print("Using simple prompt format...") # Convert messages to a simple text format - if isinstance(messages, list) and messages and isinstance(messages[0], dict) and "content" in messages[0]: + if ( + isinstance(messages, list) + and messages + and isinstance(messages[0], dict) + and "content" in messages[0] + ): prompt_text = messages[0]["content"] else: prompt_text = str(messages) @@ -207,7 +227,9 @@ def stream_query(self, query_text: str) -> Subject: """ Creates an observable that processes a text query and emits the response. """ - return create(lambda observer, _: self._observable_query(observer, incoming_query=query_text)) + return create( + lambda observer, _: self._observable_query(observer, incoming_query=query_text) + ) # endregion HuggingFaceLLMAgent Subclass (HuggingFace-Specific Implementation) diff --git a/dimos/agents/agent_huggingface_remote.py b/dimos/agents/agent_huggingface_remote.py index 446e9804ea..d98b277706 100644 --- a/dimos/agents/agent_huggingface_remote.py +++ b/dimos/agents/agent_huggingface_remote.py @@ -110,7 +110,9 @@ def __init__( # Ensure only one input stream is provided. if self.input_video_stream is not None and self.input_query_stream is not None: - raise ValueError("More than one input stream provided. Please provide only one input stream.") + raise ValueError( + "More than one input stream provided. Please provide only one input stream." + ) if self.input_video_stream is not None: logger.info("Subscribing to input video stream...") @@ -136,4 +138,6 @@ def stream_query(self, query_text: str) -> Subject: """ Creates an observable that processes a text query and emits the response. """ - return create(lambda observer, _: self._observable_query(observer, incoming_query=query_text)) + return create( + lambda observer, _: self._observable_query(observer, incoming_query=query_text) + ) diff --git a/dimos/agents/claude_agent.py b/dimos/agents/claude_agent.py index 9d87567eda..53148d17b3 100644 --- a/dimos/agents/claude_agent.py +++ b/dimos/agents/claude_agent.py @@ -159,9 +159,7 @@ def __init__( # Configure skills self.skills = skills - self.skill_library = ( - None # Required for error 'ClaudeAgent' object has no attribute 'skill_library' due to skills refactor - ) + self.skill_library = None # Required for error 'ClaudeAgent' object has no attribute 'skill_library' due to skills refactor if isinstance(self.skills, SkillLibrary): self.skill_library = self.skills elif isinstance(self.skills, list): @@ -188,15 +186,23 @@ def __init__( # Ensure only one input stream is provided. if self.input_video_stream is not None and self.input_query_stream is not None: - raise ValueError("More than one input stream provided. Please provide only one input stream.") + raise ValueError( + "More than one input stream provided. Please provide only one input stream." + ) logger.info("Claude Agent Initialized.") def _add_context_to_memory(self): """Adds initial context to the agent's memory.""" context_data = [ - ("id0", "Optical Flow is a technique used to track the movement of objects in a video sequence."), - ("id1", "Edge Detection is a technique used to identify the boundaries of objects in an image."), + ( + "id0", + "Optical Flow is a technique used to track the movement of objects in a video sequence.", + ), + ( + "id1", + "Edge Detection is a technique used to identify the boundaries of objects in an image.", + ), ("id2", "Video is a sequence of frames captured at regular intervals."), ( "id3", @@ -283,7 +289,9 @@ def _build_prompt( ) else: messages.append({"role": "user", "content": self.query}) - logger.info(f"Added new user message to conversation history (now has {len(messages)} messages)") + logger.info( + f"Added new user message to conversation history (now has {len(messages)} messages)" + ) if base64_image is not None: # Handle both single image (str) and multiple images (List[str]) @@ -291,11 +299,18 @@ def _build_prompt( # Add each image as a separate entry in conversation history for img in images: - img_content = [{"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": img}}] + img_content = [ + { + "type": "image", + "source": {"type": "base64", "media_type": "image/jpeg", "data": img}, + } + ] messages.append({"role": "user", "content": img_content}) if images: - logger.info(f"Added {len(images)} image(s) as separate entries to conversation history") + logger.info( + f"Added {len(images)} image(s) as separate entries to conversation history" + ) # Create Claude parameters with basic settings claude_params = { @@ -377,7 +392,12 @@ def _send_query(self, messages: list, claude_params: dict) -> Any: if event.type == "content_block_start": # Initialize a new content block block_type = event.content_block.type - current_block = {"type": block_type, "id": event.index, "content": "", "signature": None} + current_block = { + "type": block_type, + "id": event.index, + "content": "", + "signature": None, + } logger.debug(f"Starting {block_type} block...") elif event.type == "content_block_delta": @@ -397,7 +417,9 @@ def _send_query(self, messages: list, claude_params: dict) -> Any: elif event.delta.type == "signature_delta": # Store signature for thinking blocks current_block["signature"] = event.delta.signature - memory_file.write(f"\n[Signature received for block {current_block['id']}]\n") + memory_file.write( + f"\n[Signature received for block {current_block['id']}]\n" + ) memory_file.flush() elif event.type == "content_block_stop": @@ -408,7 +430,9 @@ def _send_query(self, messages: list, claude_params: dict) -> Any: if hasattr(event, "content_block"): # Use the exact thinking block as provided by Claude thinking_blocks.append(event.content_block.model_dump()) - memory_file.write(f"\nTHINKING COMPLETE: block {current_block['id']}\n") + memory_file.write( + f"\nTHINKING COMPLETE: block {current_block['id']}\n" + ) else: # Fallback to constructed thinking block if content_block missing thinking_block = { @@ -417,12 +441,19 @@ def _send_query(self, messages: list, claude_params: dict) -> Any: "signature": current_block["signature"], } thinking_blocks.append(thinking_block) - memory_file.write(f"\nTHINKING COMPLETE: block {current_block['id']}\n") + memory_file.write( + f"\nTHINKING COMPLETE: block {current_block['id']}\n" + ) elif current_block["type"] == "redacted_thinking": # Handle redacted thinking blocks - if hasattr(event, "content_block") and hasattr(event.content_block, "data"): - redacted_block = {"type": "redacted_thinking", "data": event.content_block.data} + if hasattr(event, "content_block") and hasattr( + event.content_block, "data" + ): + redacted_block = { + "type": "redacted_thinking", + "data": event.content_block.data, + } thinking_blocks.append(redacted_block) elif current_block["type"] == "tool_use": @@ -440,7 +471,12 @@ def _send_query(self, messages: list, claude_params: dict) -> Any: { "id": tool_id, "function": type( - "Function", (), {"name": tool_name, "arguments": json.dumps(tool_input)} + "Function", + (), + { + "name": tool_name, + "arguments": json.dumps(tool_input), + }, ), }, ) @@ -448,13 +484,22 @@ def _send_query(self, messages: list, claude_params: dict) -> Any: # Write tool call information to memory.txt memory_file.write(f"\n\nTOOL CALL: {tool_name}\n") - memory_file.write(f"ARGUMENTS: {json.dumps(tool_input, indent=2)}\n") + memory_file.write( + f"ARGUMENTS: {json.dumps(tool_input, indent=2)}\n" + ) # Reset current block - current_block = {"type": None, "id": None, "content": "", "signature": None} + current_block = { + "type": None, + "id": None, + "content": "", + "signature": None, + } memory_file.flush() - elif event.type == "message_delta" and event.delta.stop_reason == "tool_use": + elif ( + event.type == "message_delta" and event.delta.stop_reason == "tool_use" + ): # When a tool use is detected logger.info("Tool use stop reason detected in stream") @@ -529,7 +574,11 @@ def _observable_query( _, rag_results = self._get_rag_context() # Build prompt and get Claude parameters - budget = thinking_budget_tokens if thinking_budget_tokens is not None else self.thinking_budget_tokens + budget = ( + thinking_budget_tokens + if thinking_budget_tokens is not None + else self.thinking_budget_tokens + ) messages, claude_params = self._build_prompt( messages, base64_image, dimensions, override_token_limit, rag_results, budget ) @@ -585,7 +634,9 @@ def _handle_tooling(self, response_message, messages): return None if len(response_message.tool_calls) > 1: - logger.warning("Multiple tool calls detected in response message. Not a tested feature.") + logger.warning( + "Multiple tool calls detected in response message. Not a tested feature." + ) # Execute all tools first and collect their results for tool_call in response_message.tool_calls: @@ -607,7 +658,13 @@ def _handle_tooling(self, response_message, messages): messages.append( { "role": "user", - "content": [{"type": "tool_result", "tool_use_id": tool_call.id, "content": f"{tool_result}"}], + "content": [ + { + "type": "tool_result", + "tool_use_id": tool_call.id, + "content": f"{tool_result}", + } + ], } ) diff --git a/dimos/agents/memory/base.py b/dimos/agents/memory/base.py index 4d1724d2ed..af8cbf689f 100644 --- a/dimos/agents/memory/base.py +++ b/dimos/agents/memory/base.py @@ -13,7 +13,10 @@ # limitations under the License. from abc import abstractmethod -from dimos.exceptions.agent_memory_exceptions import UnknownConnectionTypeError, AgentMemoryConnectionError +from dimos.exceptions.agent_memory_exceptions import ( + UnknownConnectionTypeError, + AgentMemoryConnectionError, +) from dimos.utils.logging_config import setup_logger # TODO @@ -36,7 +39,9 @@ def __init__(self, connection_type="local", **kwargs): self.logger = setup_logger(self.__class__.__name__) self.logger.info("Initializing AgentMemory with connection type: %s", connection_type) self.connection_params = kwargs - self.db_connection = None # Holds the conection, whether local or remote, to the database used. + self.db_connection = ( + None # Holds the conection, whether local or remote, to the database used. + ) if connection_type not in ["local", "remote"]: error = UnknownConnectionTypeError( @@ -52,7 +57,9 @@ def __init__(self, connection_type="local", **kwargs): self.create() except Exception as e: self.logger.error("Failed to initialize database connection: %s", str(e), exc_info=True) - raise AgentMemoryConnectionError("Initialization failed due to an unexpected error.", cause=e) from e + raise AgentMemoryConnectionError( + "Initialization failed due to an unexpected error.", cause=e + ) from e @abstractmethod def connect(self): diff --git a/dimos/agents/memory/chroma_impl.py b/dimos/agents/memory/chroma_impl.py index 74fe3f2f57..06f6989355 100644 --- a/dimos/agents/memory/chroma_impl.py +++ b/dimos/agents/memory/chroma_impl.py @@ -83,7 +83,9 @@ def delete_vector(self, vector_id): class OpenAISemanticMemory(ChromaAgentSemanticMemory): """Semantic memory implementation using OpenAI's embedding API.""" - def __init__(self, collection_name="my_collection", model="text-embedding-3-large", dimensions=1024): + def __init__( + self, collection_name="my_collection", model="text-embedding-3-large", dimensions=1024 + ): """Initialize OpenAI-based semantic memory. Args: @@ -120,7 +122,9 @@ def create(self): class LocalSemanticMemory(ChromaAgentSemanticMemory): """Semantic memory implementation using local models.""" - def __init__(self, collection_name="my_collection", model_name="sentence-transformers/all-MiniLM-L6-v2"): + def __init__( + self, collection_name="my_collection", model_name="sentence-transformers/all-MiniLM-L6-v2" + ): """Initialize the local semantic memory using SentenceTransformer. Args: diff --git a/dimos/agents/memory/image_embedding.py b/dimos/agents/memory/image_embedding.py index f2229e0289..8bcd225d85 100644 --- a/dimos/agents/memory/image_embedding.py +++ b/dimos/agents/memory/image_embedding.py @@ -165,7 +165,9 @@ def get_text_embedding(self, text: str) -> np.ndarray: text_embedding = text_features / text_features.norm(dim=1, keepdim=True) embedding = text_embedding.numpy()[0] - logger.debug(f"Generated text embedding with shape {embedding.shape} for text: '{text}'") + logger.debug( + f"Generated text embedding with shape {embedding.shape} for text: '{text}'" + ) return embedding except Exception as e: diff --git a/dimos/agents/memory/spatial_vector_db.py b/dimos/agents/memory/spatial_vector_db.py index b7e8d61962..cf44d0c589 100644 --- a/dimos/agents/memory/spatial_vector_db.py +++ b/dimos/agents/memory/spatial_vector_db.py @@ -37,7 +37,9 @@ class SpatialVectorDB: their absolute locations and querying by location, text, or image cosine semantic similarity. """ - def __init__(self, collection_name: str = "spatial_memory", chroma_client=None, visual_memory=None): + def __init__( + self, collection_name: str = "spatial_memory", chroma_client=None, visual_memory=None + ): """ Initialize the spatial vector database. @@ -80,11 +82,15 @@ def __init__(self, collection_name: str = "spatial_memory", chroma_client=None, try: count = len(self.image_collection.get(include=[])["ids"]) if collection_exists: - logger.info(f"Using EXISTING {client_type} collection '{collection_name}' with {count} entries") + logger.info( + f"Using EXISTING {client_type} collection '{collection_name}' with {count} entries" + ) else: logger.info(f"Created NEW {client_type} collection '{collection_name}'") except Exception as e: - logger.info(f"Initialized {client_type} collection '{collection_name}' (count error: {str(e)})") + logger.info( + f"Initialized {client_type} collection '{collection_name}' (count error: {str(e)})" + ) def add_image_vector( self, vector_id: str, image: np.ndarray, embedding: np.ndarray, metadata: Dict[str, Any] @@ -102,7 +108,9 @@ def add_image_vector( self.visual_memory.add(vector_id, image) # Add the vector to ChromaDB - self.image_collection.add(ids=[vector_id], embeddings=[embedding.tolist()], metadatas=[metadata]) + self.image_collection.add( + ids=[vector_id], embeddings=[embedding.tolist()], metadatas=[metadata] + ) logger.debug(f"Added image vector {vector_id} with metadata: {metadata}") @@ -117,12 +125,16 @@ def query_by_embedding(self, embedding: np.ndarray, limit: int = 5) -> List[Dict Returns: List of results, each containing the image and its metadata """ - results = self.image_collection.query(query_embeddings=[embedding.tolist()], n_results=limit) + results = self.image_collection.query( + query_embeddings=[embedding.tolist()], n_results=limit + ) return self._process_query_results(results) # TODO: implement efficient nearest neighbor search - def query_by_location(self, x: float, y: float, radius: float = 2.0, limit: int = 5) -> List[Dict]: + def query_by_location( + self, x: float, y: float, radius: float = 2.0, limit: int = 5 + ) -> List[Dict]: """ Query the vector database for images near the specified location. @@ -156,8 +168,12 @@ def query_by_location(self, x: float, y: float, radius: float = 2.0, limit: int sorted_indices = np.argsort(filtered_results["distances"]) filtered_results["ids"] = [filtered_results["ids"][i] for i in sorted_indices[:limit]] - filtered_results["metadatas"] = [filtered_results["metadatas"][i] for i in sorted_indices[:limit]] - filtered_results["distances"] = [filtered_results["distances"][i] for i in sorted_indices[:limit]] + filtered_results["metadatas"] = [ + filtered_results["metadatas"][i] for i in sorted_indices[:limit] + ] + filtered_results["distances"] = [ + filtered_results["distances"][i] for i in sorted_indices[:limit] + ] return self._process_query_results(filtered_results) @@ -172,12 +188,17 @@ def _process_query_results(self, results) -> List[Dict]: lookup_id = vector_id[0] if isinstance(vector_id, list) else vector_id # Create the result dictionary with metadata regardless of image availability - result = {"metadata": results["metadatas"][i] if "metadatas" in results else {}, "id": lookup_id} + result = { + "metadata": results["metadatas"][i] if "metadatas" in results else {}, + "id": lookup_id, + } # Add distance if available if "distances" in results: result["distance"] = ( - results["distances"][i][0] if isinstance(results["distances"][i], list) else results["distances"][i] + results["distances"][i][0] + if isinstance(results["distances"][i], list) + else results["distances"][i] ) # Get the image from visual memory @@ -209,10 +230,14 @@ def query_by_text(self, text: str, limit: int = 5) -> List[Dict]: text_embedding = embedding_provider.get_text_embedding(text) results = self.image_collection.query( - query_embeddings=[text_embedding.tolist()], n_results=limit, include=["documents", "metadatas", "distances"] + query_embeddings=[text_embedding.tolist()], + n_results=limit, + include=["documents", "metadatas", "distances"], ) - logger.info(f"Text query: '{text}' returned {len(results['ids'] if 'ids' in results else [])} results") + logger.info( + f"Text query: '{text}' returned {len(results['ids'] if 'ids' in results else [])} results" + ) return self._process_query_results(results) def get_all_locations(self) -> List[Tuple[float, float, float]]: diff --git a/dimos/agents/memory/visual_memory.py b/dimos/agents/memory/visual_memory.py index fed15a4026..0087a4fe9b 100644 --- a/dimos/agents/memory/visual_memory.py +++ b/dimos/agents/memory/visual_memory.py @@ -85,7 +85,9 @@ def get(self, image_id: str) -> Optional[np.ndarray]: The decoded image as a numpy array, or None if not found """ if image_id not in self.images: - logger.warning(f"Image not found in storage for ID {image_id}. Incomplete or corrupted image storage.") + logger.warning( + f"Image not found in storage for ID {image_id}. Incomplete or corrupted image storage." + ) return None try: diff --git a/dimos/agents/planning_agent.py b/dimos/agents/planning_agent.py index 04f7cad291..52971e770a 100644 --- a/dimos/agents/planning_agent.py +++ b/dimos/agents/planning_agent.py @@ -193,7 +193,9 @@ def _send_query(self, messages: list) -> PlanningAgentResponse: return super()._send_query(messages) except Exception as e: logger.error(f"Caught exception in _send_query: {str(e)}") - return PlanningAgentResponse(type="dialogue", content=f"Error: {str(e)}", needs_confirmation=False) + return PlanningAgentResponse( + type="dialogue", content=f"Error: {str(e)}", needs_confirmation=False + ) def process_user_input(self, user_input: str) -> None: """Process user input and generate appropriate response. @@ -210,7 +212,9 @@ def process_user_input(self, user_input: str) -> None: self.plan_confirmed = True # Create a proper PlanningAgentResponse with content as a list confirmation_msg = PlanningAgentResponse( - type="dialogue", content="Plan confirmed! Streaming steps to execution...", needs_confirmation=False + type="dialogue", + content="Plan confirmed! Streaming steps to execution...", + needs_confirmation=False, ) self._handle_response(confirmation_msg) self._stream_plan() @@ -247,7 +251,9 @@ def start_terminal_interface(self): print("=" * 50) print("\nDimOS Action PlanningAgent\n") print("I have access to your Robot() and Robot Skills()") - print("Describe your task and I'll break it down into steps using your skills as a reference.") + print( + "Describe your task and I'll break it down into steps using your skills as a reference." + ) print("Once you're happy with the plan, type 'yes' to execute it.") print("Type 'quit' to exit.\n") diff --git a/dimos/agents/prompt_builder/impl.py b/dimos/agents/prompt_builder/impl.py index 53983c990f..0e66191837 100644 --- a/dimos/agents/prompt_builder/impl.py +++ b/dimos/agents/prompt_builder/impl.py @@ -38,7 +38,9 @@ class PromptBuilder: - Maintain a professional and informative tone in your response. """) - def __init__(self, model_name="gpt-4o", max_tokens=128000, tokenizer: Optional[AbstractTokenizer] = None): + def __init__( + self, model_name="gpt-4o", max_tokens=128000, tokenizer: Optional[AbstractTokenizer] = None + ): """ Initialize the prompt builder. Args: @@ -149,7 +151,9 @@ def build( system_prompt_token_cnt = self.tokenizer.token_count(system_prompt) user_query_token_cnt = self.tokenizer.token_count(user_query) image_token_cnt = ( - self.tokenizer.image_token_count(image_width, image_height, image_detail) if base64_image else 0 + self.tokenizer.image_token_count(image_width, image_height, image_detail) + if base64_image + else 0 ) else: rag_token_cnt = 0 @@ -176,7 +180,9 @@ def build( break if policies[key] != "do_not_truncate": max_allowed = max(0, budgets[key] - excess_tokens) - components[key]["text"] = self.truncate_tokens(component["text"], max_allowed, policies[key]) + components[key]["text"] = self.truncate_tokens( + component["text"], max_allowed, policies[key] + ) tokens_after = self.tokenizer.token_count(components[key]["text"]) excess_tokens -= component["tokens"] - tokens_after component["tokens"] = tokens_after @@ -186,7 +192,10 @@ def build( if components["rag"]["text"]: user_content = [ - {"type": "text", "text": f"{components['rag']['text']}\n\n{components['user_query']['text']}"} + { + "type": "text", + "text": f"{components['rag']['text']}\n\n{components['user_query']['text']}", + } ] else: user_content = [{"type": "text", "text": components["user_query"]["text"]}] diff --git a/dimos/agents/tokenizer/huggingface_tokenizer.py b/dimos/agents/tokenizer/huggingface_tokenizer.py index b13cab8484..2a7b0d2283 100644 --- a/dimos/agents/tokenizer/huggingface_tokenizer.py +++ b/dimos/agents/tokenizer/huggingface_tokenizer.py @@ -26,7 +26,9 @@ def __init__(self, model_name: str = "Qwen/Qwen2.5-0.5B", **kwargs): try: self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) except Exception as e: - raise ValueError(f"Failed to initialize tokenizer for model {self.model_name}. Error: {str(e)}") + raise ValueError( + f"Failed to initialize tokenizer for model {self.model_name}. Error: {str(e)}" + ) def tokenize_text(self, text): """ diff --git a/dimos/agents/tokenizer/openai_tokenizer.py b/dimos/agents/tokenizer/openai_tokenizer.py index ba0126259b..7517ae5e72 100644 --- a/dimos/agents/tokenizer/openai_tokenizer.py +++ b/dimos/agents/tokenizer/openai_tokenizer.py @@ -26,7 +26,9 @@ def __init__(self, model_name: str = "gpt-4o", **kwargs): try: self.tokenizer = tiktoken.encoding_for_model(self.model_name) except Exception as e: - raise ValueError(f"Failed to initialize tokenizer for model {self.model_name}. Error: {str(e)}") + raise ValueError( + f"Failed to initialize tokenizer for model {self.model_name}. Error: {str(e)}" + ) def tokenize_text(self, text): """ diff --git a/dimos/data/data_pipeline.py b/dimos/data/data_pipeline.py index 52ae5c3b18..bebb881f48 100644 --- a/dimos/data/data_pipeline.py +++ b/dimos/data/data_pipeline.py @@ -135,7 +135,11 @@ def _process_frame(self, frame): if self.run_labels: label = self.labels_processor.caption_image_data(frame) - if self.run_pointclouds and isinstance(depth_map, DepthMapType) and self.pointcloud_processor: + if ( + self.run_pointclouds + and isinstance(depth_map, DepthMapType) + and self.pointcloud_processor + ): pointcloud = self.pointcloud_processor.process_frame(frame, depth_map.depth_data) if self.run_segmentations and isinstance(label, LabelType) and self.segmentation_processor: diff --git a/dimos/data/pointcloud.py b/dimos/data/pointcloud.py index b9db6eaa13..8d95635d2e 100644 --- a/dimos/data/pointcloud.py +++ b/dimos/data/pointcloud.py @@ -20,7 +20,10 @@ import logging from dimos.models.segmentation.segment_utils import apply_mask_to_image -from dimos.models.pointcloud.pointcloud_utils import create_point_cloud_from_rgbd, canonicalize_point_cloud +from dimos.models.pointcloud.pointcloud_utils import ( + create_point_cloud_from_rgbd, + canonicalize_point_cloud, +) from dimos.types.pointcloud import PointCloudType # Setup logging @@ -51,7 +54,9 @@ def __init__(self, output_dir, intrinsic_parameters=None): "cx": 320.0, "cy": 240.0, } - self.intrinsic_parameters = intrinsic_parameters if intrinsic_parameters else self.default_intrinsic_parameters + self.intrinsic_parameters = ( + intrinsic_parameters if intrinsic_parameters else self.default_intrinsic_parameters + ) def process_frame(self, image, depth_map, masks): """ @@ -67,7 +72,9 @@ def process_frame(self, image, depth_map, masks): bool: A flag indicating if the point clouds were canonicalized. """ try: - self.logger.info("STARTING POINT CLOUD PROCESSING ---------------------------------------") + self.logger.info( + "STARTING POINT CLOUD PROCESSING ---------------------------------------" + ) # Convert images to OpenCV format if they are PIL Images if isinstance(image, Image.Image): @@ -95,8 +102,12 @@ def process_frame(self, image, depth_map, masks): point_cloud_data = [] # Create original point cloud - original_pcd = create_point_cloud_from_rgbd(original_image_cv, depth_image_cv, intrinsic_parameters) - pcd, canonicalized, transformation = canonicalize_point_cloud(original_pcd, canonicalize_threshold=0.3) + original_pcd = create_point_cloud_from_rgbd( + original_image_cv, depth_image_cv, intrinsic_parameters + ) + pcd, canonicalized, transformation = canonicalize_point_cloud( + original_pcd, canonicalize_threshold=0.3 + ) for idx, mask in enumerate(masks): mask_binary = mask > 0 @@ -111,7 +122,9 @@ def process_frame(self, image, depth_map, masks): if canonicalized: inlier_cloud.transform(transformation) - point_clouds.append(PointCloudType(point_cloud=inlier_cloud, metadata={"mask_index": idx})) + point_clouds.append( + PointCloudType(point_cloud=inlier_cloud, metadata={"mask_index": idx}) + ) # Save point cloud to file pointcloud_filename = f"pointcloud_{idx}.pcd" pointcloud_filepath = os.path.join(self.output_dir, pointcloud_filename) diff --git a/dimos/data/segment.py b/dimos/data/segment.py index 214c6016b9..5279235b4e 100644 --- a/dimos/data/segment.py +++ b/dimos/data/segment.py @@ -73,10 +73,16 @@ def process_frame(self, image, captions): mask = (255 * mask_tensor[0].numpy().squeeze()).astype(np.uint8) sam_masks.append(mask) else: - self.logger.info(f"No mask tensor returned for sampled points at index {idx}") - sam_masks.append(np.zeros((original_size[1], original_size[0]), dtype=np.uint8)) + self.logger.info( + f"No mask tensor returned for sampled points at index {idx}" + ) + sam_masks.append( + np.zeros((original_size[1], original_size[0]), dtype=np.uint8) + ) else: - self.logger.info(f"No sampled points for prediction index {idx}, skipping mask inference") + self.logger.info( + f"No sampled points for prediction index {idx}, skipping mask inference" + ) sam_masks.append(np.zeros((original_size[1], original_size[0]), dtype=np.uint8)) self.logger.info("DONE PROCESSING IMAGE ---------------------------------------") diff --git a/dimos/environment/agent_environment.py b/dimos/environment/agent_environment.py index 491452c121..861a1f429b 100644 --- a/dimos/environment/agent_environment.py +++ b/dimos/environment/agent_environment.py @@ -93,7 +93,9 @@ def label_objects(self) -> List[str]: # TODO: Implement object labeling using a detection model raise NotImplementedError("Object labeling not yet implemented") - def generate_segmentations(self, model: str = None, objects: List[str] = None, *args, **kwargs) -> List[np.ndarray]: + def generate_segmentations( + self, model: str = None, objects: List[str] = None, *args, **kwargs + ) -> List[np.ndarray]: """Generate segmentations for the current frame.""" # TODO: Implement segmentation generation using specified model raise NotImplementedError("Segmentation generation not yet implemented") diff --git a/dimos/environment/environment.py b/dimos/environment/environment.py index b4f0d4fc76..0770b0f2ce 100644 --- a/dimos/environment/environment.py +++ b/dimos/environment/environment.py @@ -37,7 +37,9 @@ def get_visualization(self, format_type): pass @abstractmethod - def generate_segmentations(self, model: str = None, objects: list[str] = None, *args, **kwargs) -> list[np.ndarray]: + def generate_segmentations( + self, model: str = None, objects: list[str] = None, *args, **kwargs + ) -> list[np.ndarray]: """ Generate object segmentations of objects[] using neural methods. diff --git a/dimos/hardware/interface.py b/dimos/hardware/interface.py index c74567265a..9d7797a569 100644 --- a/dimos/hardware/interface.py +++ b/dimos/hardware/interface.py @@ -20,7 +20,10 @@ class HardwareInterface: def __init__( - self, end_effector: EndEffector = None, sensors: list = None, arm_architecture: UFactory7DOFArm = None + self, + end_effector: EndEffector = None, + sensors: list = None, + arm_architecture: UFactory7DOFArm = None, ): self.end_effector = end_effector self.sensors = sensors if sensors is not None else [] diff --git a/dimos/models/Detic/configs/BoxSup_ViLD_200e.py b/dimos/models/Detic/configs/BoxSup_ViLD_200e.py index 6a13101b28..b0bc16c30b 100644 --- a/dimos/models/Detic/configs/BoxSup_ViLD_200e.py +++ b/dimos/models/Detic/configs/BoxSup_ViLD_200e.py @@ -61,7 +61,9 @@ ) for (w1, w2) in [(10, 5)] ], - proposal_matchers=[L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) for th in [0.5]], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) for th in [0.5] + ], ) model.roi_heads.mask_head.num_classes = 1 @@ -73,7 +75,9 @@ ) image_size = 896 dataloader.train.mapper.augmentations = [ - L(T.ResizeScale)(min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size), + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), L(T.FixedSizeCrop)(crop_size=(image_size, image_size)), L(T.RandomFlip)(horizontal=True), ] diff --git a/dimos/models/Detic/configs/Detic_ViLD_200e.py b/dimos/models/Detic/configs/Detic_ViLD_200e.py index b2c4193769..c0983e291c 100644 --- a/dimos/models/Detic/configs/Detic_ViLD_200e.py +++ b/dimos/models/Detic/configs/Detic_ViLD_200e.py @@ -69,7 +69,9 @@ ) for (w1, w2) in [(10, 5)] ], - proposal_matchers=[L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) for th in [0.5]], + proposal_matchers=[ + L(Matcher)(thresholds=[th], labels=[0, 1], allow_low_quality_matches=False) for th in [0.5] + ], with_image_labels=True, ws_num_props=128, ) @@ -97,13 +99,18 @@ use_diff_bs_size=True, dataset_augs=[ [ - L(T.ResizeScale)(min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size), + L(T.ResizeScale)( + min_scale=0.1, max_scale=2.0, target_height=image_size, target_width=image_size + ), L(T.FixedSizeCrop)(crop_size=(image_size, image_size)), L(T.RandomFlip)(horizontal=True), ], [ L(T.ResizeScale)( - min_scale=0.5, max_scale=1.5, target_height=image_size_weak, target_width=image_size_weak + min_scale=0.5, + max_scale=1.5, + target_height=image_size_weak, + target_width=image_size_weak, ), L(T.FixedSizeCrop)(crop_size=(image_size_weak, image_size_weak)), L(T.RandomFlip)(horizontal=True), diff --git a/dimos/models/Detic/demo.py b/dimos/models/Detic/demo.py index 429cae7a89..80efc99884 100755 --- a/dimos/models/Detic/demo.py +++ b/dimos/models/Detic/demo.py @@ -192,7 +192,9 @@ def test_opencv_video_format(codec, file_ext): frames_per_second = video.get(cv2.CAP_PROP_FPS) num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) basename = os.path.basename(args.video_input) - codec, file_ext = ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") + codec, file_ext = ( + ("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4") + ) if codec == ".mp4v": warnings.warn("x264 codec not available, switching to mp4v") if args.output: diff --git a/dimos/models/Detic/detic/config.py b/dimos/models/Detic/detic/config.py index 80eb65a8a6..eb8882f3b2 100644 --- a/dimos/models/Detic/detic/config.py +++ b/dimos/models/Detic/detic/config.py @@ -8,7 +8,9 @@ def add_detic_config(cfg): _C.WITH_IMAGE_LABELS = False # Turn on co-training with classification data # Open-vocabulary classifier - _C.MODEL.ROI_BOX_HEAD.USE_ZEROSHOT_CLS = False # Use fixed classifier for open-vocabulary detection + _C.MODEL.ROI_BOX_HEAD.USE_ZEROSHOT_CLS = ( + False # Use fixed classifier for open-vocabulary detection + ) _C.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = "datasets/metadata/lvis_v1_clip_a+cname.npy" _C.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_DIM = 512 _C.MODEL.ROI_BOX_HEAD.NORM_WEIGHT = True diff --git a/dimos/models/Detic/detic/data/custom_dataset_dataloader.py b/dimos/models/Detic/detic/data/custom_dataset_dataloader.py index a41cfb8f9d..bfbab55733 100644 --- a/dimos/models/Detic/detic/data/custom_dataset_dataloader.py +++ b/dimos/models/Detic/detic/data/custom_dataset_dataloader.py @@ -29,14 +29,18 @@ def _custom_train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler= dataset_dicts = get_detection_dataset_dicts_with_source( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, - min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, + min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + if cfg.MODEL.KEYPOINT_ON + else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) else: dataset_dicts = get_detection_dataset_dicts( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, - min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, + min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + if cfg.MODEL.KEYPOINT_ON + else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) @@ -128,7 +132,9 @@ def build_multi_dataset_batch_data_loader( """ """ world_size = get_world_size() assert total_batch_size > 0 and total_batch_size % world_size == 0, ( - "Total batch size ({}) must be divisible by the number of gpus ({}).".format(total_batch_size, world_size) + "Total batch size ({}) must be divisible by the number of gpus ({}).".format( + total_batch_size, world_size + ) ) batch_size = total_batch_size // world_size @@ -146,7 +152,9 @@ def build_multi_dataset_batch_data_loader( return MDAspectRatioGroupedDataset(data_loader, batch_size, num_datasets) -def get_detection_dataset_dicts_with_source(dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None): +def get_detection_dataset_dicts_with_source( + dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None +): assert len(dataset_names) dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names] for dataset_name, dicts in zip(dataset_names, dataset_dicts): @@ -195,7 +203,9 @@ def __init__( print("dataset sizes", sizes) self.sizes = sizes assert len(dataset_ratio) == len(sizes), ( - "length of dataset ratio {} should be equal to number if dataset {}".format(len(dataset_ratio), len(sizes)) + "length of dataset ratio {} should be equal to number if dataset {}".format( + len(dataset_ratio), len(sizes) + ) ) if seed is None: seed = comm.shared_random_seed() @@ -203,7 +213,9 @@ def __init__( self._rank = comm.get_rank() self._world_size = comm.get_world_size() - self.dataset_ids = torch.tensor([d["dataset_source"] for d in dataset_dicts], dtype=torch.long) + self.dataset_ids = torch.tensor( + [d["dataset_source"] for d in dataset_dicts], dtype=torch.long + ) dataset_weight = [ torch.ones(s) * max(sizes) / s * r / sum(dataset_ratio) @@ -238,7 +250,9 @@ def _infinite_indices(self): g = torch.Generator() g.manual_seed(self._seed) while True: - ids = torch.multinomial(self.weights, self.sample_epoch_size, generator=g, replacement=True) + ids = torch.multinomial( + self.weights, self.sample_epoch_size, generator=g, replacement=True + ) nums = [(self.dataset_ids[ids] == i).sum().int().item() for i in range(len(self.sizes))] yield from ids @@ -292,7 +306,10 @@ def repeat_factors_from_tag_frequency(dataset_dicts, repeat_thresh): for k, v in category_freq.items(): category_freq[k] = v / num_images - category_rep = {cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) for cat_id, cat_freq in category_freq.items()} + category_rep = { + cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) + for cat_id, cat_freq in category_freq.items() + } rep_factors = [] for dataset_dict in dataset_dicts: diff --git a/dimos/models/Detic/detic/data/custom_dataset_mapper.py b/dimos/models/Detic/detic/data/custom_dataset_mapper.py index 8eb8c94cfb..ed8e6ade59 100644 --- a/dimos/models/Detic/detic/data/custom_dataset_mapper.py +++ b/dimos/models/Detic/detic/data/custom_dataset_mapper.py @@ -125,7 +125,9 @@ def __call__(self, dataset_dict): # USER: Remove if you don't use pre-computed proposals. # Most users would not need this feature. if self.proposal_topk is not None: - utils.transform_proposals(dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk) + utils.transform_proposals( + dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk + ) if not self.is_train: # USER: Modify this if you want to keep them for some reason. @@ -155,7 +157,9 @@ def __call__(self, dataset_dict): for obj in dataset_dict.pop("annotations") ] annos = [ann[0] for ann in all_annos if ann[1] == 0] - instances = utils.annotations_to_instances(annos, image_shape, mask_format=self.instance_mask_format) + instances = utils.annotations_to_instances( + annos, image_shape, mask_format=self.instance_mask_format + ) del all_annos if self.recompute_boxes: @@ -164,8 +168,12 @@ def __call__(self, dataset_dict): if self.with_ann_type: dataset_dict["pos_category_ids"] = dataset_dict.get("pos_category_ids", []) dataset_dict["ann_type"] = self.dataset_ann[dataset_dict["dataset_source"]] - if self.is_debug and (("pos_category_ids" not in dataset_dict) or (dataset_dict["pos_category_ids"] == [])): - dataset_dict["pos_category_ids"] = [x for x in sorted(set(dataset_dict["instances"].gt_classes.tolist()))] + if self.is_debug and ( + ("pos_category_ids" not in dataset_dict) or (dataset_dict["pos_category_ids"] == []) + ): + dataset_dict["pos_category_ids"] = [ + x for x in sorted(set(dataset_dict["instances"].gt_classes.tolist())) + ] return dataset_dict @@ -181,7 +189,9 @@ def build_transform_gen(cfg, is_train): max_size = cfg.INPUT.MAX_SIZE_TEST sample_style = "choice" if sample_style == "range": - assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format(len(min_size)) + assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format( + len(min_size) + ) logger = logging.getLogger(__name__) tfm_gens = [] @@ -216,7 +226,9 @@ def __init__(self, cfg, is_train=True): self.mask_on = cfg.MODEL.MASK_ON self.tfm_gens = build_transform_gen(cfg, is_train) logging.getLogger(__name__).info( - "Full TransformGens used in training: {}, crop: {}".format(str(self.tfm_gens), str(self.crop_gen)) + "Full TransformGens used in training: {}, crop: {}".format( + str(self.tfm_gens), str(self.crop_gen) + ) ) self.img_format = cfg.INPUT.FORMAT diff --git a/dimos/models/Detic/detic/data/datasets/coco_zeroshot.py b/dimos/models/Detic/detic/data/datasets/coco_zeroshot.py index f61cdfb9fe..caf169adc9 100644 --- a/dimos/models/Detic/detic/data/datasets/coco_zeroshot.py +++ b/dimos/models/Detic/detic/data/datasets/coco_zeroshot.py @@ -88,14 +88,33 @@ def _get_metadata(cat): thing_dataset_id_to_contiguous_id = {x: i for i, x in enumerate(sorted(id_to_name))} thing_classes = [id_to_name[k] for k in sorted(id_to_name)] - return {"thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes} + return { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + } _PREDEFINED_SPLITS_COCO = { - "coco_zeroshot_train": ("coco/train2017", "coco/zero-shot/instances_train2017_seen_2.json", "seen"), - "coco_zeroshot_val": ("coco/val2017", "coco/zero-shot/instances_val2017_unseen_2.json", "unseen"), - "coco_not_zeroshot_val": ("coco/val2017", "coco/zero-shot/instances_val2017_seen_2.json", "seen"), - "coco_generalized_zeroshot_val": ("coco/val2017", "coco/zero-shot/instances_val2017_all_2_oriorder.json", "all"), + "coco_zeroshot_train": ( + "coco/train2017", + "coco/zero-shot/instances_train2017_seen_2.json", + "seen", + ), + "coco_zeroshot_val": ( + "coco/val2017", + "coco/zero-shot/instances_val2017_unseen_2.json", + "unseen", + ), + "coco_not_zeroshot_val": ( + "coco/val2017", + "coco/zero-shot/instances_val2017_seen_2.json", + "seen", + ), + "coco_generalized_zeroshot_val": ( + "coco/val2017", + "coco/zero-shot/instances_val2017_all_2_oriorder.json", + "all", + ), "coco_zeroshot_train_oriorder": ( "coco/train2017", "coco/zero-shot/instances_train2017_seen_2_oriorder.json", @@ -113,7 +132,10 @@ def _get_metadata(cat): _CUSTOM_SPLITS_COCO = { "cc3m_coco_train_tags": ("cc3m/training/", "cc3m/coco_train_image_info_tags.json"), - "coco_caption_train_tags": ("coco/train2017/", "coco/annotations/captions_train2017_tags_allcaps.json"), + "coco_caption_train_tags": ( + "coco/train2017/", + "coco/annotations/captions_train2017_tags_allcaps.json", + ), } for key, (image_root, json_file) in _CUSTOM_SPLITS_COCO.items(): diff --git a/dimos/models/Detic/detic/data/datasets/imagenet.py b/dimos/models/Detic/detic/data/datasets/imagenet.py index 8a96823e37..9b893a704e 100644 --- a/dimos/models/Detic/detic/data/datasets/imagenet.py +++ b/dimos/models/Detic/detic/data/datasets/imagenet.py @@ -9,11 +9,16 @@ def custom_register_imagenet_instances(name, metadata, json_file, image_root): """ """ DatasetCatalog.register(name, lambda: custom_load_lvis_json(json_file, image_root, name)) - MetadataCatalog.get(name).set(json_file=json_file, image_root=image_root, evaluator_type="imagenet", **metadata) + MetadataCatalog.get(name).set( + json_file=json_file, image_root=image_root, evaluator_type="imagenet", **metadata + ) _CUSTOM_SPLITS_IMAGENET = { - "imagenet_lvis_v1": ("imagenet/ImageNet-LVIS/", "imagenet/annotations/imagenet_lvis_image_info.json"), + "imagenet_lvis_v1": ( + "imagenet/ImageNet-LVIS/", + "imagenet/annotations/imagenet_lvis_image_info.json", + ), } for key, (image_root, json_file) in _CUSTOM_SPLITS_IMAGENET.items(): @@ -26,7 +31,10 @@ def custom_register_imagenet_instances(name, metadata, json_file, image_root): _CUSTOM_SPLITS_IMAGENET_22K = { - "imagenet_lvis-22k": ("imagenet/ImageNet-LVIS/", "imagenet/annotations/imagenet-22k_image_info_lvis-22k.json"), + "imagenet_lvis-22k": ( + "imagenet/ImageNet-LVIS/", + "imagenet/annotations/imagenet-22k_image_info_lvis-22k.json", + ), } for key, (image_root, json_file) in _CUSTOM_SPLITS_IMAGENET_22K.items(): diff --git a/dimos/models/Detic/detic/data/datasets/lvis_22k_categories.py b/dimos/models/Detic/detic/data/datasets/lvis_22k_categories.py index 2420b82ebe..2e10b5dd23 100644 --- a/dimos/models/Detic/detic/data/datasets/lvis_22k_categories.py +++ b/dimos/models/Detic/detic/data/datasets/lvis_22k_categories.py @@ -228,7 +228,12 @@ {"name": "cayenne_(spice)", "id": 227, "frequency": "c", "synset": "cayenne.n.02"}, {"name": "CD_player", "id": 228, "frequency": "c", "synset": "cd_player.n.01"}, {"name": "celery", "id": 229, "frequency": "f", "synset": "celery.n.01"}, - {"name": "cellular_telephone", "id": 230, "frequency": "f", "synset": "cellular_telephone.n.01"}, + { + "name": "cellular_telephone", + "id": 230, + "frequency": "f", + "synset": "cellular_telephone.n.01", + }, {"name": "chain_mail", "id": 231, "frequency": "r", "synset": "chain_mail.n.01"}, {"name": "chair", "id": 232, "frequency": "f", "synset": "chair.n.01"}, {"name": "chaise_longue", "id": 233, "frequency": "r", "synset": "chaise_longue.n.01"}, @@ -372,7 +377,12 @@ {"name": "dishrag", "id": 371, "frequency": "c", "synset": "dishrag.n.01"}, {"name": "dishtowel", "id": 372, "frequency": "f", "synset": "dishtowel.n.01"}, {"name": "dishwasher", "id": 373, "frequency": "f", "synset": "dishwasher.n.01"}, - {"name": "dishwasher_detergent", "id": 374, "frequency": "r", "synset": "dishwasher_detergent.n.01"}, + { + "name": "dishwasher_detergent", + "id": 374, + "frequency": "r", + "synset": "dishwasher_detergent.n.01", + }, {"name": "dispenser", "id": 375, "frequency": "f", "synset": "dispenser.n.01"}, {"name": "diving_board", "id": 376, "frequency": "r", "synset": "diving_board.n.01"}, {"name": "Dixie_cup", "id": 377, "frequency": "f", "synset": "dixie_cup.n.01"}, @@ -459,7 +469,12 @@ {"name": "fleece", "id": 458, "frequency": "r", "synset": "fleece.n.03"}, {"name": "flip-flop_(sandal)", "id": 459, "frequency": "f", "synset": "flip-flop.n.02"}, {"name": "flipper_(footwear)", "id": 460, "frequency": "c", "synset": "flipper.n.01"}, - {"name": "flower_arrangement", "id": 461, "frequency": "f", "synset": "flower_arrangement.n.01"}, + { + "name": "flower_arrangement", + "id": 461, + "frequency": "f", + "synset": "flower_arrangement.n.01", + }, {"name": "flute_glass", "id": 462, "frequency": "c", "synset": "flute.n.02"}, {"name": "foal", "id": 463, "frequency": "c", "synset": "foal.n.01"}, {"name": "folding_chair", "id": 464, "frequency": "c", "synset": "folding_chair.n.01"}, @@ -653,7 +668,12 @@ {"name": "lizard", "id": 652, "frequency": "c", "synset": "lizard.n.01"}, {"name": "log", "id": 653, "frequency": "f", "synset": "log.n.01"}, {"name": "lollipop", "id": 654, "frequency": "c", "synset": "lollipop.n.02"}, - {"name": "speaker_(stero_equipment)", "id": 655, "frequency": "f", "synset": "loudspeaker.n.01"}, + { + "name": "speaker_(stero_equipment)", + "id": 655, + "frequency": "f", + "synset": "loudspeaker.n.01", + }, {"name": "loveseat", "id": 656, "frequency": "c", "synset": "love_seat.n.01"}, {"name": "machine_gun", "id": 657, "frequency": "r", "synset": "machine_gun.n.01"}, {"name": "magazine", "id": 658, "frequency": "f", "synset": "magazine.n.02"}, @@ -696,7 +716,12 @@ {"name": "mitten", "id": 695, "frequency": "c", "synset": "mitten.n.01"}, {"name": "mixer_(kitchen_tool)", "id": 696, "frequency": "c", "synset": "mixer.n.04"}, {"name": "money", "id": 697, "frequency": "c", "synset": "money.n.03"}, - {"name": "monitor_(computer_equipment) computer_monitor", "id": 698, "frequency": "f", "synset": "monitor.n.04"}, + { + "name": "monitor_(computer_equipment) computer_monitor", + "id": 698, + "frequency": "f", + "synset": "monitor.n.04", + }, {"name": "monkey", "id": 699, "frequency": "c", "synset": "monkey.n.01"}, {"name": "motor", "id": 700, "frequency": "f", "synset": "motor.n.01"}, {"name": "motor_scooter", "id": 701, "frequency": "f", "synset": "motor_scooter.n.01"}, @@ -709,7 +734,12 @@ {"name": "mug", "id": 708, "frequency": "f", "synset": "mug.n.04"}, {"name": "mushroom", "id": 709, "frequency": "f", "synset": "mushroom.n.02"}, {"name": "music_stool", "id": 710, "frequency": "r", "synset": "music_stool.n.01"}, - {"name": "musical_instrument", "id": 711, "frequency": "c", "synset": "musical_instrument.n.01"}, + { + "name": "musical_instrument", + "id": 711, + "frequency": "c", + "synset": "musical_instrument.n.01", + }, {"name": "nailfile", "id": 712, "frequency": "r", "synset": "nailfile.n.01"}, {"name": "napkin", "id": 713, "frequency": "f", "synset": "napkin.n.01"}, {"name": "neckerchief", "id": 714, "frequency": "r", "synset": "neckerchief.n.01"}, @@ -766,7 +796,12 @@ {"name": "parka", "id": 765, "frequency": "c", "synset": "parka.n.01"}, {"name": "parking_meter", "id": 766, "frequency": "f", "synset": "parking_meter.n.01"}, {"name": "parrot", "id": 767, "frequency": "c", "synset": "parrot.n.01"}, - {"name": "passenger_car_(part_of_a_train)", "id": 768, "frequency": "c", "synset": "passenger_car.n.01"}, + { + "name": "passenger_car_(part_of_a_train)", + "id": 768, + "frequency": "c", + "synset": "passenger_car.n.01", + }, {"name": "passenger_ship", "id": 769, "frequency": "r", "synset": "passenger_ship.n.01"}, {"name": "passport", "id": 770, "frequency": "c", "synset": "passport.n.02"}, {"name": "pastry", "id": 771, "frequency": "f", "synset": "pastry.n.02"}, @@ -775,7 +810,12 @@ {"name": "peach", "id": 774, "frequency": "c", "synset": "peach.n.03"}, {"name": "peanut_butter", "id": 775, "frequency": "c", "synset": "peanut_butter.n.01"}, {"name": "pear", "id": 776, "frequency": "f", "synset": "pear.n.01"}, - {"name": "peeler_(tool_for_fruit_and_vegetables)", "id": 777, "frequency": "c", "synset": "peeler.n.03"}, + { + "name": "peeler_(tool_for_fruit_and_vegetables)", + "id": 777, + "frequency": "c", + "synset": "peeler.n.03", + }, {"name": "wooden_leg", "id": 778, "frequency": "r", "synset": "peg.n.04"}, {"name": "pegboard", "id": 779, "frequency": "r", "synset": "pegboard.n.01"}, {"name": "pelican", "id": 780, "frequency": "c", "synset": "pelican.n.01"}, @@ -1047,7 +1087,12 @@ {"name": "sword", "id": 1046, "frequency": "c", "synset": "sword.n.01"}, {"name": "syringe", "id": 1047, "frequency": "r", "synset": "syringe.n.01"}, {"name": "Tabasco_sauce", "id": 1048, "frequency": "r", "synset": "tabasco.n.02"}, - {"name": "table-tennis_table", "id": 1049, "frequency": "r", "synset": "table-tennis_table.n.01"}, + { + "name": "table-tennis_table", + "id": 1049, + "frequency": "r", + "synset": "table-tennis_table.n.01", + }, {"name": "table", "id": 1050, "frequency": "f", "synset": "table.n.02"}, {"name": "table_lamp", "id": 1051, "frequency": "c", "synset": "table_lamp.n.01"}, {"name": "tablecloth", "id": 1052, "frequency": "f", "synset": "tablecloth.n.01"}, @@ -1117,7 +1162,12 @@ {"name": "trampoline", "id": 1116, "frequency": "r", "synset": "trampoline.n.01"}, {"name": "tray", "id": 1117, "frequency": "f", "synset": "tray.n.01"}, {"name": "trench_coat", "id": 1118, "frequency": "r", "synset": "trench_coat.n.01"}, - {"name": "triangle_(musical_instrument)", "id": 1119, "frequency": "r", "synset": "triangle.n.05"}, + { + "name": "triangle_(musical_instrument)", + "id": 1119, + "frequency": "r", + "synset": "triangle.n.05", + }, {"name": "tricycle", "id": 1120, "frequency": "c", "synset": "tricycle.n.01"}, {"name": "tripod", "id": 1121, "frequency": "f", "synset": "tripod.n.01"}, {"name": "trousers", "id": 1122, "frequency": "f", "synset": "trouser.n.01"}, @@ -1472,7 +1522,11 @@ {"id": 1471, "synset": "spirillum.n.01", "name": "spirillum"}, {"id": 1472, "synset": "francisella.n.01", "name": "Francisella"}, {"id": 1473, "synset": "gonococcus.n.01", "name": "gonococcus"}, - {"id": 1474, "synset": "corynebacterium_diphtheriae.n.01", "name": "Corynebacterium_diphtheriae"}, + { + "id": 1474, + "synset": "corynebacterium_diphtheriae.n.01", + "name": "Corynebacterium_diphtheriae", + }, {"id": 1475, "synset": "enteric_bacteria.n.01", "name": "enteric_bacteria"}, {"id": 1476, "synset": "klebsiella.n.01", "name": "klebsiella"}, {"id": 1477, "synset": "salmonella_typhimurium.n.01", "name": "Salmonella_typhimurium"}, @@ -2108,8 +2162,16 @@ {"id": 2107, "synset": "pacific_giant_salamander.n.01", "name": "Pacific_giant_salamander"}, {"id": 2108, "synset": "olympic_salamander.n.01", "name": "olympic_salamander"}, {"id": 2109, "synset": "lungless_salamander.n.01", "name": "lungless_salamander"}, - {"id": 2110, "synset": "eastern_red-backed_salamander.n.01", "name": "eastern_red-backed_salamander"}, - {"id": 2111, "synset": "western_red-backed_salamander.n.01", "name": "western_red-backed_salamander"}, + { + "id": 2110, + "synset": "eastern_red-backed_salamander.n.01", + "name": "eastern_red-backed_salamander", + }, + { + "id": 2111, + "synset": "western_red-backed_salamander.n.01", + "name": "western_red-backed_salamander", + }, {"id": 2112, "synset": "dusky_salamander.n.01", "name": "dusky_salamander"}, {"id": 2113, "synset": "climbing_salamander.n.01", "name": "climbing_salamander"}, {"id": 2114, "synset": "arboreal_salamander.n.01", "name": "arboreal_salamander"}, @@ -2165,8 +2227,16 @@ {"id": 2164, "synset": "eastern_cricket_frog.n.01", "name": "eastern_cricket_frog"}, {"id": 2165, "synset": "chorus_frog.n.01", "name": "chorus_frog"}, {"id": 2166, "synset": "lowland_burrowing_treefrog.n.01", "name": "lowland_burrowing_treefrog"}, - {"id": 2167, "synset": "western_narrow-mouthed_toad.n.01", "name": "western_narrow-mouthed_toad"}, - {"id": 2168, "synset": "eastern_narrow-mouthed_toad.n.01", "name": "eastern_narrow-mouthed_toad"}, + { + "id": 2167, + "synset": "western_narrow-mouthed_toad.n.01", + "name": "western_narrow-mouthed_toad", + }, + { + "id": 2168, + "synset": "eastern_narrow-mouthed_toad.n.01", + "name": "eastern_narrow-mouthed_toad", + }, {"id": 2169, "synset": "sheep_frog.n.01", "name": "sheep_frog"}, {"id": 2170, "synset": "tongueless_frog.n.01", "name": "tongueless_frog"}, {"id": 2171, "synset": "surinam_toad.n.01", "name": "Surinam_toad"}, @@ -2243,7 +2313,11 @@ {"id": 2242, "synset": "whiptail.n.01", "name": "whiptail"}, {"id": 2243, "synset": "racerunner.n.01", "name": "racerunner"}, {"id": 2244, "synset": "plateau_striped_whiptail.n.01", "name": "plateau_striped_whiptail"}, - {"id": 2245, "synset": "chihuahuan_spotted_whiptail.n.01", "name": "Chihuahuan_spotted_whiptail"}, + { + "id": 2245, + "synset": "chihuahuan_spotted_whiptail.n.01", + "name": "Chihuahuan_spotted_whiptail", + }, {"id": 2246, "synset": "western_whiptail.n.01", "name": "western_whiptail"}, {"id": 2247, "synset": "checkered_whiptail.n.01", "name": "checkered_whiptail"}, {"id": 2248, "synset": "teju.n.01", "name": "teju"}, @@ -2449,7 +2523,11 @@ {"id": 2448, "synset": "whip-scorpion.n.01", "name": "whip-scorpion"}, {"id": 2449, "synset": "vinegarroon.n.01", "name": "vinegarroon"}, {"id": 2450, "synset": "orb-weaving_spider.n.01", "name": "orb-weaving_spider"}, - {"id": 2451, "synset": "black_and_gold_garden_spider.n.01", "name": "black_and_gold_garden_spider"}, + { + "id": 2451, + "synset": "black_and_gold_garden_spider.n.01", + "name": "black_and_gold_garden_spider", + }, {"id": 2452, "synset": "barn_spider.n.01", "name": "barn_spider"}, {"id": 2453, "synset": "garden_spider.n.01", "name": "garden_spider"}, {"id": 2454, "synset": "comb-footed_spider.n.01", "name": "comb-footed_spider"}, @@ -3278,7 +3356,11 @@ {"id": 3277, "synset": "bottle-nosed_whale.n.01", "name": "bottle-nosed_whale"}, {"id": 3278, "synset": "common_dolphin.n.01", "name": "common_dolphin"}, {"id": 3279, "synset": "bottlenose_dolphin.n.01", "name": "bottlenose_dolphin"}, - {"id": 3280, "synset": "atlantic_bottlenose_dolphin.n.01", "name": "Atlantic_bottlenose_dolphin"}, + { + "id": 3280, + "synset": "atlantic_bottlenose_dolphin.n.01", + "name": "Atlantic_bottlenose_dolphin", + }, {"id": 3281, "synset": "pacific_bottlenose_dolphin.n.01", "name": "Pacific_bottlenose_dolphin"}, {"id": 3282, "synset": "porpoise.n.01", "name": "porpoise"}, {"id": 3283, "synset": "harbor_porpoise.n.01", "name": "harbor_porpoise"}, @@ -3376,7 +3458,11 @@ {"id": 3375, "synset": "terrier.n.01", "name": "terrier"}, {"id": 3376, "synset": "bullterrier.n.01", "name": "bullterrier"}, {"id": 3377, "synset": "staffordshire_bullterrier.n.01", "name": "Staffordshire_bullterrier"}, - {"id": 3378, "synset": "american_staffordshire_terrier.n.01", "name": "American_Staffordshire_terrier"}, + { + "id": 3378, + "synset": "american_staffordshire_terrier.n.01", + "name": "American_Staffordshire_terrier", + }, {"id": 3379, "synset": "bedlington_terrier.n.01", "name": "Bedlington_terrier"}, {"id": 3380, "synset": "border_terrier.n.01", "name": "Border_terrier"}, {"id": 3381, "synset": "kerry_blue_terrier.n.01", "name": "Kerry_blue_terrier"}, @@ -3408,8 +3494,16 @@ {"id": 3407, "synset": "silky_terrier.n.01", "name": "silky_terrier"}, {"id": 3408, "synset": "skye_terrier.n.01", "name": "Skye_terrier"}, {"id": 3409, "synset": "clydesdale_terrier.n.01", "name": "Clydesdale_terrier"}, - {"id": 3410, "synset": "soft-coated_wheaten_terrier.n.01", "name": "soft-coated_wheaten_terrier"}, - {"id": 3411, "synset": "west_highland_white_terrier.n.01", "name": "West_Highland_white_terrier"}, + { + "id": 3410, + "synset": "soft-coated_wheaten_terrier.n.01", + "name": "soft-coated_wheaten_terrier", + }, + { + "id": 3411, + "synset": "west_highland_white_terrier.n.01", + "name": "West_Highland_white_terrier", + }, {"id": 3412, "synset": "lhasa.n.02", "name": "Lhasa"}, {"id": 3413, "synset": "sporting_dog.n.01", "name": "sporting_dog"}, {"id": 3414, "synset": "bird_dog.n.01", "name": "bird_dog"}, @@ -3421,7 +3515,11 @@ {"id": 3420, "synset": "labrador_retriever.n.01", "name": "Labrador_retriever"}, {"id": 3421, "synset": "chesapeake_bay_retriever.n.01", "name": "Chesapeake_Bay_retriever"}, {"id": 3422, "synset": "pointer.n.04", "name": "pointer"}, - {"id": 3423, "synset": "german_short-haired_pointer.n.01", "name": "German_short-haired_pointer"}, + { + "id": 3423, + "synset": "german_short-haired_pointer.n.01", + "name": "German_short-haired_pointer", + }, {"id": 3424, "synset": "setter.n.02", "name": "setter"}, {"id": 3425, "synset": "vizsla.n.01", "name": "vizsla"}, {"id": 3426, "synset": "english_setter.n.01", "name": "English_setter"}, @@ -3855,7 +3953,11 @@ {"id": 3854, "synset": "dry-wood_termite.n.01", "name": "dry-wood_termite"}, {"id": 3855, "synset": "reticulitermes_lucifugus.n.01", "name": "Reticulitermes_lucifugus"}, {"id": 3856, "synset": "mastotermes_darwiniensis.n.01", "name": "Mastotermes_darwiniensis"}, - {"id": 3857, "synset": "mastotermes_electrodominicus.n.01", "name": "Mastotermes_electrodominicus"}, + { + "id": 3857, + "synset": "mastotermes_electrodominicus.n.01", + "name": "Mastotermes_electrodominicus", + }, {"id": 3858, "synset": "powder-post_termite.n.01", "name": "powder-post_termite"}, {"id": 3859, "synset": "orthopterous_insect.n.01", "name": "orthopterous_insect"}, {"id": 3860, "synset": "grasshopper.n.01", "name": "grasshopper"}, @@ -4231,7 +4333,11 @@ {"id": 4230, "synset": "brown_lemming.n.01", "name": "brown_lemming"}, {"id": 4231, "synset": "grey_lemming.n.01", "name": "grey_lemming"}, {"id": 4232, "synset": "pied_lemming.n.01", "name": "pied_lemming"}, - {"id": 4233, "synset": "hudson_bay_collared_lemming.n.01", "name": "Hudson_bay_collared_lemming"}, + { + "id": 4233, + "synset": "hudson_bay_collared_lemming.n.01", + "name": "Hudson_bay_collared_lemming", + }, {"id": 4234, "synset": "southern_bog_lemming.n.01", "name": "southern_bog_lemming"}, {"id": 4235, "synset": "northern_bog_lemming.n.01", "name": "northern_bog_lemming"}, {"id": 4236, "synset": "porcupine.n.01", "name": "porcupine"}, @@ -5530,7 +5636,11 @@ {"id": 5529, "synset": "auxiliary_boiler.n.01", "name": "auxiliary_boiler"}, {"id": 5530, "synset": "auxiliary_engine.n.01", "name": "auxiliary_engine"}, {"id": 5531, "synset": "auxiliary_pump.n.01", "name": "auxiliary_pump"}, - {"id": 5532, "synset": "auxiliary_research_submarine.n.01", "name": "auxiliary_research_submarine"}, + { + "id": 5532, + "synset": "auxiliary_research_submarine.n.01", + "name": "auxiliary_research_submarine", + }, {"id": 5533, "synset": "auxiliary_storage.n.01", "name": "auxiliary_storage"}, {"id": 5534, "synset": "aviary.n.01", "name": "aviary"}, {"id": 5535, "synset": "awl.n.01", "name": "awl"}, @@ -5669,7 +5779,11 @@ {"id": 5668, "synset": "baseball_equipment.n.01", "name": "baseball_equipment"}, {"id": 5669, "synset": "basement.n.01", "name": "basement"}, {"id": 5670, "synset": "basement.n.02", "name": "basement"}, - {"id": 5671, "synset": "basic_point_defense_missile_system.n.01", "name": "basic_point_defense_missile_system"}, + { + "id": 5671, + "synset": "basic_point_defense_missile_system.n.01", + "name": "basic_point_defense_missile_system", + }, {"id": 5672, "synset": "basilica.n.02", "name": "basilica"}, {"id": 5673, "synset": "basilica.n.01", "name": "basilica"}, {"id": 5674, "synset": "basilisk.n.02", "name": "basilisk"}, @@ -6480,7 +6594,11 @@ {"id": 6479, "synset": "chapterhouse.n.01", "name": "chapterhouse"}, {"id": 6480, "synset": "character_printer.n.01", "name": "character_printer"}, {"id": 6481, "synset": "charcuterie.n.01", "name": "charcuterie"}, - {"id": 6482, "synset": "charge-exchange_accelerator.n.01", "name": "charge-exchange_accelerator"}, + { + "id": 6482, + "synset": "charge-exchange_accelerator.n.01", + "name": "charge-exchange_accelerator", + }, {"id": 6483, "synset": "charger.n.02", "name": "charger"}, {"id": 6484, "synset": "chariot.n.01", "name": "chariot"}, {"id": 6485, "synset": "chariot.n.02", "name": "chariot"}, @@ -6668,7 +6786,11 @@ {"id": 6667, "synset": "coaxial_cable.n.01", "name": "coaxial_cable"}, {"id": 6668, "synset": "cobweb.n.03", "name": "cobweb"}, {"id": 6669, "synset": "cobweb.n.01", "name": "cobweb"}, - {"id": 6670, "synset": "cockcroft_and_walton_accelerator.n.01", "name": "Cockcroft_and_Walton_accelerator"}, + { + "id": 6670, + "synset": "cockcroft_and_walton_accelerator.n.01", + "name": "Cockcroft_and_Walton_accelerator", + }, {"id": 6671, "synset": "cocked_hat.n.01", "name": "cocked_hat"}, {"id": 6672, "synset": "cockhorse.n.01", "name": "cockhorse"}, {"id": 6673, "synset": "cockleshell.n.01", "name": "cockleshell"}, @@ -7149,7 +7271,11 @@ {"id": 7144, "synset": "die.n.03", "name": "die"}, {"id": 7145, "synset": "diesel.n.02", "name": "diesel"}, {"id": 7146, "synset": "diesel-electric_locomotive.n.01", "name": "diesel-electric_locomotive"}, - {"id": 7147, "synset": "diesel-hydraulic_locomotive.n.01", "name": "diesel-hydraulic_locomotive"}, + { + "id": 7147, + "synset": "diesel-hydraulic_locomotive.n.01", + "name": "diesel-hydraulic_locomotive", + }, {"id": 7148, "synset": "diesel_locomotive.n.01", "name": "diesel_locomotive"}, {"id": 7149, "synset": "diestock.n.01", "name": "diestock"}, {"id": 7150, "synset": "differential_analyzer.n.01", "name": "differential_analyzer"}, @@ -7465,7 +7591,11 @@ {"id": 7460, "synset": "electric_range.n.01", "name": "electric_range"}, {"id": 7461, "synset": "electric_toothbrush.n.01", "name": "electric_toothbrush"}, {"id": 7462, "synset": "electric_typewriter.n.01", "name": "electric_typewriter"}, - {"id": 7463, "synset": "electro-acoustic_transducer.n.01", "name": "electro-acoustic_transducer"}, + { + "id": 7463, + "synset": "electro-acoustic_transducer.n.01", + "name": "electro-acoustic_transducer", + }, {"id": 7464, "synset": "electrode.n.01", "name": "electrode"}, {"id": 7465, "synset": "electrodynamometer.n.01", "name": "electrodynamometer"}, {"id": 7466, "synset": "electroencephalograph.n.01", "name": "electroencephalograph"}, @@ -7673,7 +7803,11 @@ {"id": 7664, "synset": "field_house.n.01", "name": "field_house"}, {"id": 7665, "synset": "field_lens.n.01", "name": "field_lens"}, {"id": 7666, "synset": "field_magnet.n.01", "name": "field_magnet"}, - {"id": 7667, "synset": "field-sequential_color_television.n.01", "name": "field-sequential_color_television"}, + { + "id": 7667, + "synset": "field-sequential_color_television.n.01", + "name": "field-sequential_color_television", + }, {"id": 7668, "synset": "field_tent.n.01", "name": "field_tent"}, {"id": 7669, "synset": "fieldwork.n.01", "name": "fieldwork"}, {"id": 7670, "synset": "fife.n.01", "name": "fife"}, @@ -7770,7 +7904,11 @@ {"id": 7761, "synset": "flat_panel_display.n.01", "name": "flat_panel_display"}, {"id": 7762, "synset": "flats.n.01", "name": "flats"}, {"id": 7763, "synset": "flat_tip_screwdriver.n.01", "name": "flat_tip_screwdriver"}, - {"id": 7764, "synset": "fleet_ballistic_missile_submarine.n.01", "name": "fleet_ballistic_missile_submarine"}, + { + "id": 7764, + "synset": "fleet_ballistic_missile_submarine.n.01", + "name": "fleet_ballistic_missile_submarine", + }, {"id": 7765, "synset": "fleur-de-lis.n.02", "name": "fleur-de-lis"}, {"id": 7766, "synset": "flight_simulator.n.01", "name": "flight_simulator"}, {"id": 7767, "synset": "flintlock.n.02", "name": "flintlock"}, @@ -8558,7 +8696,11 @@ {"id": 8549, "synset": "implement.n.01", "name": "implement"}, {"id": 8550, "synset": "impression.n.07", "name": "impression"}, {"id": 8551, "synset": "imprint.n.05", "name": "imprint"}, - {"id": 8552, "synset": "improvised_explosive_device.n.01", "name": "improvised_explosive_device"}, + { + "id": 8552, + "synset": "improvised_explosive_device.n.01", + "name": "improvised_explosive_device", + }, {"id": 8553, "synset": "impulse_turbine.n.01", "name": "impulse_turbine"}, {"id": 8554, "synset": "in-basket.n.01", "name": "in-basket"}, {"id": 8555, "synset": "incendiary_bomb.n.01", "name": "incendiary_bomb"}, @@ -8601,7 +8743,11 @@ {"id": 8592, "synset": "interceptor.n.01", "name": "interceptor"}, {"id": 8593, "synset": "interchange.n.01", "name": "interchange"}, {"id": 8594, "synset": "intercommunication_system.n.01", "name": "intercommunication_system"}, - {"id": 8595, "synset": "intercontinental_ballistic_missile.n.01", "name": "intercontinental_ballistic_missile"}, + { + "id": 8595, + "synset": "intercontinental_ballistic_missile.n.01", + "name": "intercontinental_ballistic_missile", + }, {"id": 8596, "synset": "interface.n.04", "name": "interface"}, {"id": 8597, "synset": "interferometer.n.01", "name": "interferometer"}, {"id": 8598, "synset": "interior_door.n.01", "name": "interior_door"}, @@ -8679,7 +8825,11 @@ {"id": 8670, "synset": "jodhpur.n.01", "name": "jodhpur"}, {"id": 8671, "synset": "joinery.n.01", "name": "joinery"}, {"id": 8672, "synset": "joint.n.05", "name": "joint"}, - {"id": 8673, "synset": "joint_direct_attack_munition.n.01", "name": "Joint_Direct_Attack_Munition"}, + { + "id": 8673, + "synset": "joint_direct_attack_munition.n.01", + "name": "Joint_Direct_Attack_Munition", + }, {"id": 8674, "synset": "jointer.n.01", "name": "jointer"}, {"id": 8675, "synset": "joist.n.01", "name": "joist"}, {"id": 8676, "synset": "jolly_boat.n.01", "name": "jolly_boat"}, @@ -9170,7 +9320,11 @@ {"id": 9161, "synset": "mausoleum.n.01", "name": "mausoleum"}, {"id": 9162, "synset": "maxi.n.01", "name": "maxi"}, {"id": 9163, "synset": "maxim_gun.n.01", "name": "Maxim_gun"}, - {"id": 9164, "synset": "maximum_and_minimum_thermometer.n.01", "name": "maximum_and_minimum_thermometer"}, + { + "id": 9164, + "synset": "maximum_and_minimum_thermometer.n.01", + "name": "maximum_and_minimum_thermometer", + }, {"id": 9165, "synset": "maypole.n.01", "name": "maypole"}, {"id": 9166, "synset": "maze.n.01", "name": "maze"}, {"id": 9167, "synset": "mazer.n.01", "name": "mazer"}, @@ -9233,8 +9387,16 @@ {"id": 9224, "synset": "micrometer.n.02", "name": "micrometer"}, {"id": 9225, "synset": "microprocessor.n.01", "name": "microprocessor"}, {"id": 9226, "synset": "microtome.n.01", "name": "microtome"}, - {"id": 9227, "synset": "microwave_diathermy_machine.n.01", "name": "microwave_diathermy_machine"}, - {"id": 9228, "synset": "microwave_linear_accelerator.n.01", "name": "microwave_linear_accelerator"}, + { + "id": 9227, + "synset": "microwave_diathermy_machine.n.01", + "name": "microwave_diathermy_machine", + }, + { + "id": 9228, + "synset": "microwave_linear_accelerator.n.01", + "name": "microwave_linear_accelerator", + }, {"id": 9229, "synset": "middy.n.01", "name": "middy"}, {"id": 9230, "synset": "midiron.n.01", "name": "midiron"}, {"id": 9231, "synset": "mihrab.n.02", "name": "mihrab"}, @@ -9561,7 +9723,11 @@ {"id": 9552, "synset": "ordinary.n.04", "name": "ordinary"}, {"id": 9553, "synset": "organ.n.05", "name": "organ"}, {"id": 9554, "synset": "organdy.n.01", "name": "organdy"}, - {"id": 9555, "synset": "organic_light-emitting_diode.n.01", "name": "organic_light-emitting_diode"}, + { + "id": 9555, + "synset": "organic_light-emitting_diode.n.01", + "name": "organic_light-emitting_diode", + }, {"id": 9556, "synset": "organ_loft.n.01", "name": "organ_loft"}, {"id": 9557, "synset": "organ_pipe.n.01", "name": "organ_pipe"}, {"id": 9558, "synset": "organza.n.01", "name": "organza"}, @@ -10301,7 +10467,11 @@ {"id": 10288, "synset": "rattrap.n.03", "name": "rattrap"}, {"id": 10289, "synset": "rayon.n.01", "name": "rayon"}, {"id": 10290, "synset": "razor.n.01", "name": "razor"}, - {"id": 10291, "synset": "reaction-propulsion_engine.n.01", "name": "reaction-propulsion_engine"}, + { + "id": 10291, + "synset": "reaction-propulsion_engine.n.01", + "name": "reaction-propulsion_engine", + }, {"id": 10292, "synset": "reaction_turbine.n.01", "name": "reaction_turbine"}, {"id": 10293, "synset": "reactor.n.01", "name": "reactor"}, {"id": 10294, "synset": "reading_lamp.n.01", "name": "reading_lamp"}, @@ -10738,7 +10908,11 @@ {"id": 10725, "synset": "selector.n.02", "name": "selector"}, {"id": 10726, "synset": "selenium_cell.n.01", "name": "selenium_cell"}, {"id": 10727, "synset": "self-propelled_vehicle.n.01", "name": "self-propelled_vehicle"}, - {"id": 10728, "synset": "self-registering_thermometer.n.01", "name": "self-registering_thermometer"}, + { + "id": 10728, + "synset": "self-registering_thermometer.n.01", + "name": "self-registering_thermometer", + }, {"id": 10729, "synset": "self-starter.n.02", "name": "self-starter"}, {"id": 10730, "synset": "selsyn.n.01", "name": "selsyn"}, {"id": 10731, "synset": "selvage.n.02", "name": "selvage"}, @@ -10777,7 +10951,11 @@ {"id": 10764, "synset": "settle.n.01", "name": "settle"}, {"id": 10765, "synset": "settlement_house.n.01", "name": "settlement_house"}, {"id": 10766, "synset": "seventy-eight.n.02", "name": "seventy-eight"}, - {"id": 10767, "synset": "seven_wonders_of_the_ancient_world.n.01", "name": "Seven_Wonders_of_the_Ancient_World"}, + { + "id": 10767, + "synset": "seven_wonders_of_the_ancient_world.n.01", + "name": "Seven_Wonders_of_the_Ancient_World", + }, {"id": 10768, "synset": "sewage_disposal_plant.n.01", "name": "sewage_disposal_plant"}, {"id": 10769, "synset": "sewer.n.01", "name": "sewer"}, {"id": 10770, "synset": "sewing_basket.n.01", "name": "sewing_basket"}, @@ -10866,7 +11044,11 @@ {"id": 10849, "synset": "short_circuit.n.01", "name": "short_circuit"}, {"id": 10850, "synset": "short_iron.n.01", "name": "short_iron"}, {"id": 10851, "synset": "short_sleeve.n.01", "name": "short_sleeve"}, - {"id": 10852, "synset": "shortwave_diathermy_machine.n.01", "name": "shortwave_diathermy_machine"}, + { + "id": 10852, + "synset": "shortwave_diathermy_machine.n.01", + "name": "shortwave_diathermy_machine", + }, {"id": 10853, "synset": "shot.n.12", "name": "shot"}, {"id": 10854, "synset": "shotgun.n.01", "name": "shotgun"}, {"id": 10855, "synset": "shotgun_shell.n.01", "name": "shotgun_shell"}, @@ -11017,7 +11199,11 @@ {"id": 11000, "synset": "sluice.n.01", "name": "sluice"}, {"id": 11001, "synset": "smack.n.03", "name": "smack"}, {"id": 11002, "synset": "small_boat.n.01", "name": "small_boat"}, - {"id": 11003, "synset": "small_computer_system_interface.n.01", "name": "small_computer_system_interface"}, + { + "id": 11003, + "synset": "small_computer_system_interface.n.01", + "name": "small_computer_system_interface", + }, {"id": 11004, "synset": "small_ship.n.01", "name": "small_ship"}, {"id": 11005, "synset": "small_stores.n.01", "name": "small_stores"}, {"id": 11006, "synset": "smart_bomb.n.01", "name": "smart_bomb"}, @@ -11162,7 +11348,11 @@ {"id": 11145, "synset": "spinning_rod.n.01", "name": "spinning_rod"}, {"id": 11146, "synset": "spinning_wheel.n.01", "name": "spinning_wheel"}, {"id": 11147, "synset": "spiral_bandage.n.01", "name": "spiral_bandage"}, - {"id": 11148, "synset": "spiral_ratchet_screwdriver.n.01", "name": "spiral_ratchet_screwdriver"}, + { + "id": 11148, + "synset": "spiral_ratchet_screwdriver.n.01", + "name": "spiral_ratchet_screwdriver", + }, {"id": 11149, "synset": "spiral_spring.n.01", "name": "spiral_spring"}, {"id": 11150, "synset": "spirit_lamp.n.01", "name": "spirit_lamp"}, {"id": 11151, "synset": "spirit_stove.n.01", "name": "spirit_stove"}, @@ -11448,7 +11638,11 @@ {"id": 11431, "synset": "sunsuit.n.01", "name": "sunsuit"}, {"id": 11432, "synset": "supercharger.n.01", "name": "supercharger"}, {"id": 11433, "synset": "supercomputer.n.01", "name": "supercomputer"}, - {"id": 11434, "synset": "superconducting_supercollider.n.01", "name": "superconducting_supercollider"}, + { + "id": 11434, + "synset": "superconducting_supercollider.n.01", + "name": "superconducting_supercollider", + }, {"id": 11435, "synset": "superhighway.n.02", "name": "superhighway"}, {"id": 11436, "synset": "supermarket.n.01", "name": "supermarket"}, {"id": 11437, "synset": "superstructure.n.01", "name": "superstructure"}, @@ -11469,7 +11663,11 @@ {"id": 11452, "synset": "surface_search_radar.n.01", "name": "surface_search_radar"}, {"id": 11453, "synset": "surface_ship.n.01", "name": "surface_ship"}, {"id": 11454, "synset": "surface-to-air_missile.n.01", "name": "surface-to-air_missile"}, - {"id": 11455, "synset": "surface-to-air_missile_system.n.01", "name": "surface-to-air_missile_system"}, + { + "id": 11455, + "synset": "surface-to-air_missile_system.n.01", + "name": "surface-to-air_missile_system", + }, {"id": 11456, "synset": "surfboat.n.01", "name": "surfboat"}, {"id": 11457, "synset": "surcoat.n.01", "name": "surcoat"}, {"id": 11458, "synset": "surgeon's_knot.n.01", "name": "surgeon's_knot"}, @@ -11686,7 +11884,11 @@ {"id": 11669, "synset": "thermal_printer.n.01", "name": "thermal_printer"}, {"id": 11670, "synset": "thermal_reactor.n.01", "name": "thermal_reactor"}, {"id": 11671, "synset": "thermocouple.n.01", "name": "thermocouple"}, - {"id": 11672, "synset": "thermoelectric_thermometer.n.01", "name": "thermoelectric_thermometer"}, + { + "id": 11672, + "synset": "thermoelectric_thermometer.n.01", + "name": "thermoelectric_thermometer", + }, {"id": 11673, "synset": "thermograph.n.02", "name": "thermograph"}, {"id": 11674, "synset": "thermograph.n.01", "name": "thermograph"}, {"id": 11675, "synset": "thermohydrometer.n.01", "name": "thermohydrometer"}, @@ -11739,7 +11941,11 @@ {"id": 11722, "synset": "time_bomb.n.02", "name": "time_bomb"}, {"id": 11723, "synset": "time_capsule.n.01", "name": "time_capsule"}, {"id": 11724, "synset": "time_clock.n.01", "name": "time_clock"}, - {"id": 11725, "synset": "time-delay_measuring_instrument.n.01", "name": "time-delay_measuring_instrument"}, + { + "id": 11725, + "synset": "time-delay_measuring_instrument.n.01", + "name": "time-delay_measuring_instrument", + }, {"id": 11726, "synset": "time-fuse.n.01", "name": "time-fuse"}, {"id": 11727, "synset": "timepiece.n.01", "name": "timepiece"}, {"id": 11728, "synset": "timer.n.03", "name": "timer"}, @@ -12285,7 +12491,11 @@ {"id": 12268, "synset": "wire_cloth.n.01", "name": "wire_cloth"}, {"id": 12269, "synset": "wire_cutter.n.01", "name": "wire_cutter"}, {"id": 12270, "synset": "wire_gauge.n.01", "name": "wire_gauge"}, - {"id": 12271, "synset": "wireless_local_area_network.n.01", "name": "wireless_local_area_network"}, + { + "id": 12271, + "synset": "wireless_local_area_network.n.01", + "name": "wireless_local_area_network", + }, {"id": 12272, "synset": "wire_matrix_printer.n.01", "name": "wire_matrix_printer"}, {"id": 12273, "synset": "wire_recorder.n.01", "name": "wire_recorder"}, {"id": 12274, "synset": "wire_stripper.n.01", "name": "wire_stripper"}, @@ -12374,7 +12584,11 @@ {"id": 12357, "synset": "glint.n.02", "name": "glint"}, {"id": 12358, "synset": "opalescence.n.01", "name": "opalescence"}, {"id": 12359, "synset": "polish.n.01", "name": "polish"}, - {"id": 12360, "synset": "primary_color_for_pigments.n.01", "name": "primary_color_for_pigments"}, + { + "id": 12360, + "synset": "primary_color_for_pigments.n.01", + "name": "primary_color_for_pigments", + }, {"id": 12361, "synset": "primary_color_for_light.n.01", "name": "primary_color_for_light"}, {"id": 12362, "synset": "colorlessness.n.01", "name": "colorlessness"}, {"id": 12363, "synset": "mottle.n.01", "name": "mottle"}, @@ -12580,7 +12794,11 @@ {"id": 12563, "synset": "radio.n.01", "name": "radio"}, {"id": 12564, "synset": "television.n.01", "name": "television"}, {"id": 12565, "synset": "cable_television.n.01", "name": "cable_television"}, - {"id": 12566, "synset": "high-definition_television.n.01", "name": "high-definition_television"}, + { + "id": 12566, + "synset": "high-definition_television.n.01", + "name": "high-definition_television", + }, {"id": 12567, "synset": "reception.n.03", "name": "reception"}, {"id": 12568, "synset": "signal_detection.n.01", "name": "signal_detection"}, {"id": 12569, "synset": "hakham.n.01", "name": "Hakham"}, @@ -13116,7 +13334,11 @@ {"id": 13099, "synset": "sloppy_joe.n.01", "name": "Sloppy_Joe"}, {"id": 13100, "synset": "bomber.n.03", "name": "bomber"}, {"id": 13101, "synset": "gyro.n.01", "name": "gyro"}, - {"id": 13102, "synset": "bacon-lettuce-tomato_sandwich.n.01", "name": "bacon-lettuce-tomato_sandwich"}, + { + "id": 13102, + "synset": "bacon-lettuce-tomato_sandwich.n.01", + "name": "bacon-lettuce-tomato_sandwich", + }, {"id": 13103, "synset": "reuben.n.02", "name": "Reuben"}, {"id": 13104, "synset": "western.n.02", "name": "western"}, {"id": 13105, "synset": "wrap.n.02", "name": "wrap"}, @@ -14672,7 +14894,11 @@ {"id": 14655, "synset": "uphill.n.01", "name": "uphill"}, {"id": 14656, "synset": "urolith.n.01", "name": "urolith"}, {"id": 14657, "synset": "valley.n.01", "name": "valley"}, - {"id": 14658, "synset": "vehicle-borne_transmission.n.01", "name": "vehicle-borne_transmission"}, + { + "id": 14658, + "synset": "vehicle-borne_transmission.n.01", + "name": "vehicle-borne_transmission", + }, {"id": 14659, "synset": "vein.n.04", "name": "vein"}, {"id": 14660, "synset": "volcanic_crater.n.01", "name": "volcanic_crater"}, {"id": 14661, "synset": "volcano.n.02", "name": "volcano"}, @@ -15380,7 +15606,11 @@ {"id": 15363, "synset": "centenarian.n.01", "name": "centenarian"}, {"id": 15364, "synset": "centrist.n.01", "name": "centrist"}, {"id": 15365, "synset": "centurion.n.01", "name": "centurion"}, - {"id": 15366, "synset": "certified_public_accountant.n.01", "name": "certified_public_accountant"}, + { + "id": 15366, + "synset": "certified_public_accountant.n.01", + "name": "certified_public_accountant", + }, {"id": 15367, "synset": "chachka.n.01", "name": "chachka"}, {"id": 15368, "synset": "chambermaid.n.01", "name": "chambermaid"}, {"id": 15369, "synset": "chameleon.n.01", "name": "chameleon"}, @@ -15474,7 +15704,11 @@ {"id": 15457, "synset": "commanding_officer.n.01", "name": "commanding_officer"}, {"id": 15458, "synset": "commissar.n.01", "name": "commissar"}, {"id": 15459, "synset": "commissioned_officer.n.01", "name": "commissioned_officer"}, - {"id": 15460, "synset": "commissioned_military_officer.n.01", "name": "commissioned_military_officer"}, + { + "id": 15460, + "synset": "commissioned_military_officer.n.01", + "name": "commissioned_military_officer", + }, {"id": 15461, "synset": "commissioner.n.01", "name": "commissioner"}, {"id": 15462, "synset": "commissioner.n.02", "name": "commissioner"}, {"id": 15463, "synset": "committee_member.n.01", "name": "committee_member"}, @@ -16740,7 +16974,11 @@ {"id": 16723, "synset": "presentist.n.01", "name": "presentist"}, {"id": 16724, "synset": "preserver.n.03", "name": "preserver"}, {"id": 16725, "synset": "president.n.03", "name": "president"}, - {"id": 16726, "synset": "president_of_the_united_states.n.01", "name": "President_of_the_United_States"}, + { + "id": 16726, + "synset": "president_of_the_united_states.n.01", + "name": "President_of_the_United_States", + }, {"id": 16727, "synset": "president.n.05", "name": "president"}, {"id": 16728, "synset": "press_agent.n.01", "name": "press_agent"}, {"id": 16729, "synset": "press_photographer.n.01", "name": "press_photographer"}, @@ -17592,7 +17830,11 @@ {"id": 17571, "synset": "rocky_mountain_pinon.n.01", "name": "Rocky_mountain_pinon"}, {"id": 17572, "synset": "single-leaf.n.01", "name": "single-leaf"}, {"id": 17573, "synset": "bishop_pine.n.01", "name": "bishop_pine"}, - {"id": 17574, "synset": "california_single-leaf_pinyon.n.01", "name": "California_single-leaf_pinyon"}, + { + "id": 17574, + "synset": "california_single-leaf_pinyon.n.01", + "name": "California_single-leaf_pinyon", + }, {"id": 17575, "synset": "parry's_pinyon.n.01", "name": "Parry's_pinyon"}, {"id": 17576, "synset": "spruce_pine.n.04", "name": "spruce_pine"}, {"id": 17577, "synset": "black_pine.n.05", "name": "black_pine"}, @@ -18349,7 +18591,11 @@ {"id": 18328, "synset": "everlasting.n.01", "name": "everlasting"}, {"id": 18329, "synset": "achillea.n.01", "name": "achillea"}, {"id": 18330, "synset": "yarrow.n.01", "name": "yarrow"}, - {"id": 18331, "synset": "pink-and-white_everlasting.n.01", "name": "pink-and-white_everlasting"}, + { + "id": 18331, + "synset": "pink-and-white_everlasting.n.01", + "name": "pink-and-white_everlasting", + }, {"id": 18332, "synset": "white_snakeroot.n.01", "name": "white_snakeroot"}, {"id": 18333, "synset": "ageratum.n.02", "name": "ageratum"}, {"id": 18334, "synset": "common_ageratum.n.01", "name": "common_ageratum"}, @@ -18420,7 +18666,11 @@ {"id": 18399, "synset": "flat-topped_white_aster.n.01", "name": "flat-topped_white_aster"}, {"id": 18400, "synset": "late_purple_aster.n.01", "name": "late_purple_aster"}, {"id": 18401, "synset": "panicled_aster.n.01", "name": "panicled_aster"}, - {"id": 18402, "synset": "perennial_salt_marsh_aster.n.01", "name": "perennial_salt_marsh_aster"}, + { + "id": 18402, + "synset": "perennial_salt_marsh_aster.n.01", + "name": "perennial_salt_marsh_aster", + }, {"id": 18403, "synset": "purple-stemmed_aster.n.01", "name": "purple-stemmed_aster"}, {"id": 18404, "synset": "rough-leaved_aster.n.01", "name": "rough-leaved_aster"}, {"id": 18405, "synset": "rush_aster.n.01", "name": "rush_aster"}, @@ -18648,7 +18898,11 @@ {"id": 18627, "synset": "arrowleaf_groundsel.n.01", "name": "arrowleaf_groundsel"}, {"id": 18628, "synset": "black_salsify.n.01", "name": "black_salsify"}, {"id": 18629, "synset": "white-topped_aster.n.01", "name": "white-topped_aster"}, - {"id": 18630, "synset": "narrow-leaved_white-topped_aster.n.01", "name": "narrow-leaved_white-topped_aster"}, + { + "id": 18630, + "synset": "narrow-leaved_white-topped_aster.n.01", + "name": "narrow-leaved_white-topped_aster", + }, {"id": 18631, "synset": "silver_sage.n.01", "name": "silver_sage"}, {"id": 18632, "synset": "sea_wormwood.n.01", "name": "sea_wormwood"}, {"id": 18633, "synset": "sawwort.n.01", "name": "sawwort"}, @@ -18766,7 +19020,11 @@ {"id": 18745, "synset": "common_lady's-slipper.n.01", "name": "common_lady's-slipper"}, {"id": 18746, "synset": "ram's-head.n.01", "name": "ram's-head"}, {"id": 18747, "synset": "yellow_lady's_slipper.n.01", "name": "yellow_lady's_slipper"}, - {"id": 18748, "synset": "large_yellow_lady's_slipper.n.01", "name": "large_yellow_lady's_slipper"}, + { + "id": 18748, + "synset": "large_yellow_lady's_slipper.n.01", + "name": "large_yellow_lady's_slipper", + }, {"id": 18749, "synset": "california_lady's_slipper.n.01", "name": "California_lady's_slipper"}, {"id": 18750, "synset": "clustered_lady's_slipper.n.01", "name": "clustered_lady's_slipper"}, {"id": 18751, "synset": "mountain_lady's_slipper.n.01", "name": "mountain_lady's_slipper"}, @@ -18785,7 +19043,11 @@ {"id": 18764, "synset": "tongueflower.n.01", "name": "tongueflower"}, {"id": 18765, "synset": "rattlesnake_plantain.n.01", "name": "rattlesnake_plantain"}, {"id": 18766, "synset": "fragrant_orchid.n.01", "name": "fragrant_orchid"}, - {"id": 18767, "synset": "short-spurred_fragrant_orchid.n.01", "name": "short-spurred_fragrant_orchid"}, + { + "id": 18767, + "synset": "short-spurred_fragrant_orchid.n.01", + "name": "short-spurred_fragrant_orchid", + }, {"id": 18768, "synset": "fringed_orchis.n.01", "name": "fringed_orchis"}, {"id": 18769, "synset": "frog_orchid.n.01", "name": "frog_orchid"}, {"id": 18770, "synset": "rein_orchid.n.01", "name": "rein_orchid"}, @@ -18829,7 +19091,11 @@ {"id": 18808, "synset": "rattlesnake_orchid.n.01", "name": "rattlesnake_orchid"}, {"id": 18809, "synset": "lesser_butterfly_orchid.n.01", "name": "lesser_butterfly_orchid"}, {"id": 18810, "synset": "greater_butterfly_orchid.n.01", "name": "greater_butterfly_orchid"}, - {"id": 18811, "synset": "prairie_white-fringed_orchid.n.01", "name": "prairie_white-fringed_orchid"}, + { + "id": 18811, + "synset": "prairie_white-fringed_orchid.n.01", + "name": "prairie_white-fringed_orchid", + }, {"id": 18812, "synset": "tangle_orchid.n.01", "name": "tangle_orchid"}, {"id": 18813, "synset": "indian_crocus.n.01", "name": "Indian_crocus"}, {"id": 18814, "synset": "pleurothallis.n.01", "name": "pleurothallis"}, @@ -19696,7 +19962,11 @@ {"id": 19675, "synset": "richweed.n.02", "name": "richweed"}, {"id": 19676, "synset": "artillery_plant.n.01", "name": "artillery_plant"}, {"id": 19677, "synset": "friendship_plant.n.01", "name": "friendship_plant"}, - {"id": 19678, "synset": "queensland_grass-cloth_plant.n.01", "name": "Queensland_grass-cloth_plant"}, + { + "id": 19678, + "synset": "queensland_grass-cloth_plant.n.01", + "name": "Queensland_grass-cloth_plant", + }, {"id": 19679, "synset": "pipturus_albidus.n.01", "name": "Pipturus_albidus"}, {"id": 19680, "synset": "cannabis.n.01", "name": "cannabis"}, {"id": 19681, "synset": "indian_hemp.n.01", "name": "Indian_hemp"}, @@ -20035,7 +20305,11 @@ {"id": 20014, "synset": "licorice.n.01", "name": "licorice"}, {"id": 20015, "synset": "wild_licorice.n.02", "name": "wild_licorice"}, {"id": 20016, "synset": "licorice_root.n.01", "name": "licorice_root"}, - {"id": 20017, "synset": "western_australia_coral_pea.n.01", "name": "Western_Australia_coral_pea"}, + { + "id": 20017, + "synset": "western_australia_coral_pea.n.01", + "name": "Western_Australia_coral_pea", + }, {"id": 20018, "synset": "sweet_vetch.n.01", "name": "sweet_vetch"}, {"id": 20019, "synset": "french_honeysuckle.n.02", "name": "French_honeysuckle"}, {"id": 20020, "synset": "anil.n.02", "name": "anil"}, @@ -20058,7 +20332,11 @@ {"id": 20037, "synset": "sericea_lespedeza.n.01", "name": "sericea_lespedeza"}, {"id": 20038, "synset": "lentil.n.03", "name": "lentil"}, {"id": 20039, "synset": "lentil.n.02", "name": "lentil"}, - {"id": 20040, "synset": "prairie_bird's-foot_trefoil.n.01", "name": "prairie_bird's-foot_trefoil"}, + { + "id": 20040, + "synset": "prairie_bird's-foot_trefoil.n.01", + "name": "prairie_bird's-foot_trefoil", + }, {"id": 20041, "synset": "bird's_foot_trefoil.n.02", "name": "bird's_foot_trefoil"}, {"id": 20042, "synset": "winged_pea.n.02", "name": "winged_pea"}, {"id": 20043, "synset": "lupine.n.01", "name": "lupine"}, @@ -20231,7 +20509,11 @@ {"id": 20210, "synset": "water_star_grass.n.01", "name": "water_star_grass"}, {"id": 20211, "synset": "naiad.n.01", "name": "naiad"}, {"id": 20212, "synset": "water_plantain.n.01", "name": "water_plantain"}, - {"id": 20213, "synset": "narrow-leaved_water_plantain.n.01", "name": "narrow-leaved_water_plantain"}, + { + "id": 20213, + "synset": "narrow-leaved_water_plantain.n.01", + "name": "narrow-leaved_water_plantain", + }, {"id": 20214, "synset": "hydrilla.n.01", "name": "hydrilla"}, {"id": 20215, "synset": "american_frogbit.n.01", "name": "American_frogbit"}, {"id": 20216, "synset": "waterweed.n.01", "name": "waterweed"}, @@ -20769,7 +21051,11 @@ {"id": 20748, "synset": "five-point_bishop's_cap.n.01", "name": "five-point_bishop's_cap"}, {"id": 20749, "synset": "parnassia.n.01", "name": "parnassia"}, {"id": 20750, "synset": "bog_star.n.01", "name": "bog_star"}, - {"id": 20751, "synset": "fringed_grass_of_parnassus.n.01", "name": "fringed_grass_of_Parnassus"}, + { + "id": 20751, + "synset": "fringed_grass_of_parnassus.n.01", + "name": "fringed_grass_of_Parnassus", + }, {"id": 20752, "synset": "false_alumroot.n.01", "name": "false_alumroot"}, {"id": 20753, "synset": "foamflower.n.01", "name": "foamflower"}, {"id": 20754, "synset": "false_miterwort.n.01", "name": "false_miterwort"}, @@ -20835,7 +21121,11 @@ {"id": 20814, "synset": "man-of-the-earth.n.01", "name": "man-of-the-earth"}, {"id": 20815, "synset": "scammony.n.01", "name": "scammony"}, {"id": 20816, "synset": "japanese_morning_glory.n.01", "name": "Japanese_morning_glory"}, - {"id": 20817, "synset": "imperial_japanese_morning_glory.n.01", "name": "imperial_Japanese_morning_glory"}, + { + "id": 20817, + "synset": "imperial_japanese_morning_glory.n.01", + "name": "imperial_Japanese_morning_glory", + }, {"id": 20818, "synset": "gesneriad.n.01", "name": "gesneriad"}, {"id": 20819, "synset": "gesneria.n.01", "name": "gesneria"}, {"id": 20820, "synset": "achimenes.n.01", "name": "achimenes"}, @@ -21301,12 +21591,20 @@ {"id": 21280, "synset": "pholiota_flammans.n.01", "name": "Pholiota_flammans"}, {"id": 21281, "synset": "pholiota_flavida.n.01", "name": "Pholiota_flavida"}, {"id": 21282, "synset": "nameko.n.01", "name": "nameko"}, - {"id": 21283, "synset": "pholiota_squarrosa-adiposa.n.01", "name": "Pholiota_squarrosa-adiposa"}, + { + "id": 21283, + "synset": "pholiota_squarrosa-adiposa.n.01", + "name": "Pholiota_squarrosa-adiposa", + }, {"id": 21284, "synset": "pholiota_squarrosa.n.01", "name": "Pholiota_squarrosa"}, {"id": 21285, "synset": "pholiota_squarrosoides.n.01", "name": "Pholiota_squarrosoides"}, {"id": 21286, "synset": "stropharia_ambigua.n.01", "name": "Stropharia_ambigua"}, {"id": 21287, "synset": "stropharia_hornemannii.n.01", "name": "Stropharia_hornemannii"}, - {"id": 21288, "synset": "stropharia_rugoso-annulata.n.01", "name": "Stropharia_rugoso-annulata"}, + { + "id": 21288, + "synset": "stropharia_rugoso-annulata.n.01", + "name": "Stropharia_rugoso-annulata", + }, {"id": 21289, "synset": "gill_fungus.n.01", "name": "gill_fungus"}, {"id": 21290, "synset": "entoloma_lividum.n.01", "name": "Entoloma_lividum"}, {"id": 21291, "synset": "entoloma_aprile.n.01", "name": "Entoloma_aprile"}, @@ -21480,13 +21778,21 @@ {"id": 21459, "synset": "hygrophorus_sordidus.n.01", "name": "Hygrophorus_sordidus"}, {"id": 21460, "synset": "hygrophorus_tennesseensis.n.01", "name": "Hygrophorus_tennesseensis"}, {"id": 21461, "synset": "hygrophorus_turundus.n.01", "name": "Hygrophorus_turundus"}, - {"id": 21462, "synset": "neohygrophorus_angelesianus.n.01", "name": "Neohygrophorus_angelesianus"}, + { + "id": 21462, + "synset": "neohygrophorus_angelesianus.n.01", + "name": "Neohygrophorus_angelesianus", + }, {"id": 21463, "synset": "cortinarius_armillatus.n.01", "name": "Cortinarius_armillatus"}, {"id": 21464, "synset": "cortinarius_atkinsonianus.n.01", "name": "Cortinarius_atkinsonianus"}, {"id": 21465, "synset": "cortinarius_corrugatus.n.01", "name": "Cortinarius_corrugatus"}, {"id": 21466, "synset": "cortinarius_gentilis.n.01", "name": "Cortinarius_gentilis"}, {"id": 21467, "synset": "cortinarius_mutabilis.n.01", "name": "Cortinarius_mutabilis"}, - {"id": 21468, "synset": "cortinarius_semisanguineus.n.01", "name": "Cortinarius_semisanguineus"}, + { + "id": 21468, + "synset": "cortinarius_semisanguineus.n.01", + "name": "Cortinarius_semisanguineus", + }, {"id": 21469, "synset": "cortinarius_subfoetidus.n.01", "name": "Cortinarius_subfoetidus"}, {"id": 21470, "synset": "cortinarius_violaceus.n.01", "name": "Cortinarius_violaceus"}, {"id": 21471, "synset": "gymnopilus_spectabilis.n.01", "name": "Gymnopilus_spectabilis"}, @@ -21728,7 +22034,11 @@ {"id": 21707, "synset": "silver_tree_fern.n.01", "name": "silver_tree_fern"}, {"id": 21708, "synset": "davallia.n.01", "name": "davallia"}, {"id": 21709, "synset": "hare's-foot_fern.n.01", "name": "hare's-foot_fern"}, - {"id": 21710, "synset": "canary_island_hare's_foot_fern.n.01", "name": "Canary_Island_hare's_foot_fern"}, + { + "id": 21710, + "synset": "canary_island_hare's_foot_fern.n.01", + "name": "Canary_Island_hare's_foot_fern", + }, {"id": 21711, "synset": "squirrel's-foot_fern.n.01", "name": "squirrel's-foot_fern"}, {"id": 21712, "synset": "bracken.n.01", "name": "bracken"}, {"id": 21713, "synset": "soft_tree_fern.n.01", "name": "soft_tree_fern"}, @@ -21994,7 +22304,11 @@ {"id": 21973, "synset": "chyme.n.01", "name": "chyme"}, {"id": 21974, "synset": "ragweed_pollen.n.01", "name": "ragweed_pollen"}, {"id": 21975, "synset": "pina_cloth.n.01", "name": "pina_cloth"}, - {"id": 21976, "synset": "chlorobenzylidenemalononitrile.n.01", "name": "chlorobenzylidenemalononitrile"}, + { + "id": 21976, + "synset": "chlorobenzylidenemalononitrile.n.01", + "name": "chlorobenzylidenemalononitrile", + }, {"id": 21977, "synset": "carbon.n.01", "name": "carbon"}, {"id": 21978, "synset": "charcoal.n.01", "name": "charcoal"}, {"id": 21979, "synset": "rock.n.02", "name": "rock"}, diff --git a/dimos/models/Detic/detic/data/datasets/lvis_v1.py b/dimos/models/Detic/detic/data/datasets/lvis_v1.py index 6fc0afc6d3..3eb88bb4a1 100644 --- a/dimos/models/Detic/detic/data/datasets/lvis_v1.py +++ b/dimos/models/Detic/detic/data/datasets/lvis_v1.py @@ -16,7 +16,9 @@ def custom_register_lvis_instances(name, metadata, json_file, image_root): """ """ DatasetCatalog.register(name, lambda: custom_load_lvis_json(json_file, image_root, name)) - MetadataCatalog.get(name).set(json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata) + MetadataCatalog.get(name).set( + json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata + ) def custom_load_lvis_json(json_file, image_root, dataset_name=None): @@ -35,7 +37,10 @@ def custom_load_lvis_json(json_file, image_root, dataset_name=None): if timer.seconds() > 1: logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) - catid2contid = {x["id"]: i for i, x in enumerate(sorted(lvis_api.dataset["categories"], key=lambda x: x["id"]))} + catid2contid = { + x["id"]: i + for i, x in enumerate(sorted(lvis_api.dataset["categories"], key=lambda x: x["id"])) + } if len(lvis_api.dataset["categories"]) == 1203: for x in lvis_api.dataset["categories"]: assert catid2contid[x["id"]] == x["id"] - 1 @@ -44,7 +49,9 @@ def custom_load_lvis_json(json_file, image_root, dataset_name=None): anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] - assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format(json_file) + assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format( + json_file + ) imgs_anns = list(zip(imgs, anns)) logger.info("Loaded {} images in the LVIS v1 format from {}".format(len(imgs_anns), json_file)) @@ -72,7 +79,9 @@ def custom_load_lvis_json(json_file, image_root, dataset_name=None): # NOTE: modified by Xingyi: convert to 0-based record["neg_category_ids"] = [catid2contid[x] for x in record["neg_category_ids"]] if "pos_category_ids" in img_dict: - record["pos_category_ids"] = [catid2contid[x] for x in img_dict.get("pos_category_ids", [])] + record["pos_category_ids"] = [ + catid2contid[x] for x in img_dict.get("pos_category_ids", []) + ] if "captions" in img_dict: record["captions"] = img_dict["captions"] if "caption_features" in img_dict: @@ -122,7 +131,9 @@ def get_lvis_22k_meta(): from .lvis_22k_categories import CATEGORIES cat_ids = [k["id"] for k in CATEGORIES] - assert min(cat_ids) == 1 and max(cat_ids) == len(cat_ids), "Category ids are not in [1, #categories], as expected" + assert min(cat_ids) == 1 and max(cat_ids) == len(cat_ids), ( + "Category ids are not in [1, #categories], as expected" + ) # Ensure that the category list is sorted by id lvis_categories = sorted(CATEGORIES, key=lambda x: x["id"]) thing_classes = [k["name"] for k in lvis_categories] diff --git a/dimos/models/Detic/detic/data/datasets/objects365.py b/dimos/models/Detic/detic/data/datasets/objects365.py index 71489e1865..6e0a45044e 100644 --- a/dimos/models/Detic/detic/data/datasets/objects365.py +++ b/dimos/models/Detic/detic/data/datasets/objects365.py @@ -749,14 +749,26 @@ def _get_builtin_metadata(): x["id"]: i for i, x in enumerate(sorted(categories_v2_fix, key=lambda x: x["id"])) } thing_classes = [id_to_name[k] for k in sorted(id_to_name)] - return {"thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes} + return { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + } _PREDEFINED_SPLITS_OBJECTS365 = { - "objects365_v2_train": ("objects365/train", "objects365/annotations/zhiyuan_objv2_train_fixname_fixmiss.json"), + "objects365_v2_train": ( + "objects365/train", + "objects365/annotations/zhiyuan_objv2_train_fixname_fixmiss.json", + ), # 80,000 images, 1,240,587 annotations - "objects365_v2_val": ("objects365/val", "objects365/annotations/zhiyuan_objv2_val_fixname.json"), - "objects365_v2_val_rare": ("objects365/val", "objects365/annotations/zhiyuan_objv2_val_fixname_rare.json"), + "objects365_v2_val": ( + "objects365/val", + "objects365/annotations/zhiyuan_objv2_val_fixname.json", + ), + "objects365_v2_val_rare": ( + "objects365/val", + "objects365/annotations/zhiyuan_objv2_val_fixname_rare.json", + ), } for key, (image_root, json_file) in _PREDEFINED_SPLITS_OBJECTS365.items(): diff --git a/dimos/models/Detic/detic/data/datasets/oid.py b/dimos/models/Detic/detic/data/datasets/oid.py index 6ca72f24c9..d3a6fd14b2 100644 --- a/dimos/models/Detic/detic/data/datasets/oid.py +++ b/dimos/models/Detic/detic/data/datasets/oid.py @@ -511,7 +511,10 @@ def _get_builtin_metadata(cats): id_to_name = {x["id"]: x["name"] for x in cats} thing_dataset_id_to_contiguous_id = {i + 1: i for i in range(len(cats))} thing_classes = [x["name"] for x in sorted(cats, key=lambda x: x["id"])] - return {"thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes} + return { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + } _PREDEFINED_SPLITS_OID = { @@ -520,8 +523,14 @@ def _get_builtin_metadata(cats): # "expanded" duplicates annotations to their father classes based on the official # hierarchy. This is used in the official evaulation protocol. # https://storage.googleapis.com/openimages/web/evaluation.html - "oid_val_expanded": ("oid/images/validation/", "oid/annotations/oid_challenge_2019_val_expanded.json"), - "oid_val_expanded_rare": ("oid/images/validation/", "oid/annotations/oid_challenge_2019_val_expanded_rare.json"), + "oid_val_expanded": ( + "oid/images/validation/", + "oid/annotations/oid_challenge_2019_val_expanded.json", + ), + "oid_val_expanded_rare": ( + "oid/images/validation/", + "oid/annotations/oid_challenge_2019_val_expanded_rare.json", + ), } diff --git a/dimos/models/Detic/detic/data/datasets/register_oid.py b/dimos/models/Detic/detic/data/datasets/register_oid.py index 47d456946c..59a4da9ab7 100644 --- a/dimos/models/Detic/detic/data/datasets/register_oid.py +++ b/dimos/models/Detic/detic/data/datasets/register_oid.py @@ -27,10 +27,14 @@ def register_oid_instances(name, metadata, json_file, image_root): # 2. Optionally, add metadata about this dataset, # since they might be useful in evaluation, visualization or logging - MetadataCatalog.get(name).set(json_file=json_file, image_root=image_root, evaluator_type="oid", **metadata) + MetadataCatalog.get(name).set( + json_file=json_file, image_root=image_root, evaluator_type="oid", **metadata + ) -def load_coco_json_mem_efficient(json_file, image_root, dataset_name=None, extra_annotation_keys=None): +def load_coco_json_mem_efficient( + json_file, image_root, dataset_name=None, extra_annotation_keys=None +): """ Actually not mem efficient """ diff --git a/dimos/models/Detic/detic/data/tar_dataset.py b/dimos/models/Detic/detic/data/tar_dataset.py index aabebfb4d7..323ef7dbb1 100644 --- a/dimos/models/Detic/detic/data/tar_dataset.py +++ b/dimos/models/Detic/detic/data/tar_dataset.py @@ -48,7 +48,9 @@ def __init__( labels = np.zeros(self.dataset_lens.sum(), dtype=np.int64) sI = 0 for k in range(len(self.dataset_lens)): - assert (sI + self.dataset_lens[k]) <= len(labels), f"{k} {sI + self.dataset_lens[k]} vs. {len(labels)}" + assert (sI + self.dataset_lens[k]) <= len(labels), ( + f"{k} {sI + self.dataset_lens[k]} vs. {len(labels)}" + ) labels[sI : (sI + self.dataset_lens[k])] = k sI += self.dataset_lens[k] self.labels = labels @@ -65,7 +67,9 @@ def __getitem__(self, index): if index in self.dataset_cumsums: d_index += 1 - assert d_index == self.labels[index], f"{d_index} vs. {self.labels[index]} mismatch for {index}" + assert d_index == self.labels[index], ( + f"{d_index} vs. {self.labels[index]} mismatch for {index}" + ) # change index to local dataset index if d_index == 0: diff --git a/dimos/models/Detic/detic/evaluation/custom_coco_eval.py b/dimos/models/Detic/detic/evaluation/custom_coco_eval.py index 95c27e181e..b4bbc9fc94 100644 --- a/dimos/models/Detic/detic/evaluation/custom_coco_eval.py +++ b/dimos/models/Detic/detic/evaluation/custom_coco_eval.py @@ -29,7 +29,9 @@ def _derive_coco_results(self, coco_eval, iou_type, class_names=None): metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") for idx, metric in enumerate(metrics) } - self._logger.info("Evaluation results for {}: \n".format(iou_type) + create_small_table(results)) + self._logger.info( + "Evaluation results for {}: \n".format(iou_type) + create_small_table(results) + ) if not np.isfinite(sum(results.values())): self._logger.info("Some metrics cannot be computed and is shown as NaN.") @@ -102,5 +104,7 @@ def _derive_coco_results(self, coco_eval, iou_type, class_names=None): results.update({"AP-" + name: ap for name, ap in results_per_category}) results["AP50-seen"] = sum(results_per_category50_seen) / len(results_per_category50_seen) - results["AP50-unseen"] = sum(results_per_category50_unseen) / len(results_per_category50_unseen) + results["AP50-unseen"] = sum(results_per_category50_unseen) / len( + results_per_category50_unseen + ) return results diff --git a/dimos/models/Detic/detic/evaluation/oideval.py b/dimos/models/Detic/detic/evaluation/oideval.py index 854c96a162..d52a151371 100644 --- a/dimos/models/Detic/detic/evaluation/oideval.py +++ b/dimos/models/Detic/detic/evaluation/oideval.py @@ -178,8 +178,12 @@ def _prepare(self): cat_ids = self.params.cat_ids if self.params.cat_ids else None - gts = self.lvis_gt.load_anns(self.lvis_gt.get_ann_ids(img_ids=self.params.img_ids, cat_ids=cat_ids)) - dts = self.lvis_dt.load_anns(self.lvis_dt.get_ann_ids(img_ids=self.params.img_ids, cat_ids=cat_ids)) + gts = self.lvis_gt.load_anns( + self.lvis_gt.get_ann_ids(img_ids=self.params.img_ids, cat_ids=cat_ids) + ) + dts = self.lvis_dt.load_anns( + self.lvis_dt.get_ann_ids(img_ids=self.params.img_ids, cat_ids=cat_ids) + ) # convert ground truth to mask if iou_type == 'segm' if self.params.iou_type == "segm": self._to_mask(gts, self.lvis_gt) @@ -228,7 +232,9 @@ def evaluate(self): self._prepare() self.ious = { - (img_id, cat_id): self.compute_iou(img_id, cat_id) for img_id in self.params.img_ids for cat_id in cat_ids + (img_id, cat_id): self.compute_iou(img_id, cat_id) + for img_id in self.params.img_ids + for cat_id in cat_ids } # loop through images, area range, max detection number @@ -321,7 +327,9 @@ def compute_match_iou(iou): is_gt_detected = np.zeros(iou.shape[1], dtype=bool) for i in range(num_detected_boxes): gt_id = max_overlap_gt_ids[i] - is_evaluatable = not tp_fp_labels[i] and iou[i, gt_id] >= 0.5 and not is_matched_to_group_of[i] + is_evaluatable = ( + not tp_fp_labels[i] and iou[i, gt_id] >= 0.5 and not is_matched_to_group_of[i] + ) if is_evaluatable: if not is_gt_detected[gt_id]: tp_fp_labels[i] = True @@ -333,7 +341,9 @@ def compute_match_ioa(ioa): max_overlap_group_of_gt_ids = np.argmax(ioa, axis=1) for i in range(num_detected_boxes): gt_id = max_overlap_group_of_gt_ids[i] - is_evaluatable = not tp_fp_labels[i] and ioa[i, gt_id] >= 0.5 and not is_matched_to_group_of[i] + is_evaluatable = ( + not tp_fp_labels[i] and ioa[i, gt_id] >= 0.5 and not is_matched_to_group_of[i] + ) if is_evaluatable: is_matched_to_group_of[i] = True scores_group_of[gt_id] = max(scores_group_of[gt_id], scores[i]) @@ -355,13 +365,17 @@ def compute_match_ioa(ioa): valid_entries = ~is_matched_to_group_of scores = np.concatenate((scores[valid_entries], scores_box_group_of)) - tp_fps = np.concatenate((tp_fp_labels[valid_entries].astype(float), tp_fp_labels_box_group_of)) + tp_fps = np.concatenate( + (tp_fp_labels[valid_entries].astype(float), tp_fp_labels_box_group_of) + ) return { "image_id": img_id, "category_id": cat_id, "area_rng": area_rng, - "dt_matches": np.array([1 if x > 0 else 0 for x in tp_fps], dtype=np.int32).reshape(1, -1), + "dt_matches": np.array([1 if x > 0 else 0 for x in tp_fps], dtype=np.int32).reshape( + 1, -1 + ), "dt_scores": [x for x in scores], "dt_ignore": np.array([0 for x in scores], dtype=np.int32).reshape(1, -1), "num_gt": len(gt), @@ -521,7 +535,9 @@ def __init__(self, iou_type): self.cat_ids = [] # np.arange causes trouble. the data point on arange is slightly # larger than the true value - self.iou_thrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True) + self.iou_thrs = np.linspace( + 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True + ) self.google_style = True # print('Using google style PR curve') self.iou_thrs = self.iou_thrs[:1] @@ -644,7 +660,8 @@ def _evaluate_predictions_on_oid(oid_gt, oid_results_path, eval_seg=False, class results_per_category.append( ( "{} {}".format( - name.replace(" ", "_"), inst_num if inst_num < 1000 else "{:.1f}k".format(inst_num / 1000) + name.replace(" ", "_"), + inst_num if inst_num < 1000 else "{:.1f}k".format(inst_num / 1000), ), float(ap * 100), ) diff --git a/dimos/models/Detic/detic/modeling/backbone/swintransformer.py b/dimos/models/Detic/detic/modeling/backbone/swintransformer.py index b538212226..541d3c99dc 100644 --- a/dimos/models/Detic/detic/modeling/backbone/swintransformer.py +++ b/dimos/models/Detic/detic/modeling/backbone/swintransformer.py @@ -29,7 +29,9 @@ class Mlp(nn.Module): """Multilayer perceptron.""" - def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0): + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features @@ -90,7 +92,16 @@ class WindowAttention(nn.Module): proj_drop (float, optional): Dropout ratio of output. Default: 0.0 """ - def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0.0, proj_drop=0.0): + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): super().__init__() self.dim = dim self.window_size = window_size # Wh, Ww @@ -131,16 +142,24 @@ def forward(self, x, mask=None): mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None """ B_, N, C = x.shape - qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + qkv = ( + self.qkv(x) + .reshape(B_, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = q @ k.transpose(-2, -1) - relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 ) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: @@ -213,7 +232,9 @@ def __init__( self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() self.norm2 = norm_layer(dim) mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) self.H = None self.W = None @@ -249,8 +270,12 @@ def forward(self, x, mask_matrix): attn_mask = None # partition windows - x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C # W-MSA/SW-MSA attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C @@ -412,10 +437,14 @@ def forward(self, x, H, W): img_mask[:, h, w, :] = cnt cnt += 1 - mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 mask_windows = mask_windows.view(-1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) for blk in self.blocks: blk.H, blk.W = H, W @@ -545,7 +574,10 @@ def __init__( if self.ape: pretrain_img_size = to_2tuple(pretrain_img_size) patch_size = to_2tuple(patch_size) - patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]] + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] self.absolute_pos_embed = nn.Parameter( torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) @@ -555,7 +587,9 @@ def __init__( self.pos_drop = nn.Dropout(p=drop_rate) # stochastic depth - dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule # build layers self.layers = nn.ModuleList() @@ -588,7 +622,9 @@ def __init__( self._freeze_stages() self._out_features = ["swin{}".format(i) for i in self.out_indices] - self._out_feature_channels = {"swin{}".format(i): self.embed_dim * 2**i for i in self.out_indices} + self._out_feature_channels = { + "swin{}".format(i): self.embed_dim * 2**i for i in self.out_indices + } self._out_feature_strides = {"swin{}".format(i): 2 ** (i + 2) for i in self.out_indices} self._size_devisibility = 32 @@ -640,7 +676,9 @@ def forward(self, x): Wh, Ww = x.size(2), x.size(3) if self.ape: # interpolate the position embedding to the corresponding size - absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic") + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C else: x = x.flatten(2).transpose(1, 2) diff --git a/dimos/models/Detic/detic/modeling/backbone/timm.py b/dimos/models/Detic/detic/modeling/backbone/timm.py index 704bb941fb..8b7dd00006 100644 --- a/dimos/models/Detic/detic/modeling/backbone/timm.py +++ b/dimos/models/Detic/detic/modeling/backbone/timm.py @@ -125,7 +125,9 @@ def __init__(self, base_name, out_levels, freeze_at=0, norm="FrozenBN", pretrain if base_name in model_params: self.base = create_timm_resnet(base_name, out_indices=out_indices, pretrained=False) elif "eff" in base_name or "resnet" in base_name or "regnet" in base_name: - self.base = create_model(base_name, features_only=True, out_indices=out_indices, pretrained=pretrained) + self.base = create_model( + base_name, features_only=True, out_indices=out_indices, pretrained=pretrained + ) elif "convnext" in base_name: drop_path_rate = 0.2 if ("tiny" in base_name or "small" in base_name) else 0.3 self.base = create_model( @@ -138,11 +140,16 @@ def __init__(self, base_name, out_levels, freeze_at=0, norm="FrozenBN", pretrain else: assert 0, base_name feature_info = [ - dict(num_chs=f["num_chs"], reduction=f["reduction"]) for i, f in enumerate(self.base.feature_info) + dict(num_chs=f["num_chs"], reduction=f["reduction"]) + for i, f in enumerate(self.base.feature_info) ] self._out_features = ["layer{}".format(x) for x in out_levels] - self._out_feature_channels = {"layer{}".format(l): feature_info[l - 1]["num_chs"] for l in out_levels} - self._out_feature_strides = {"layer{}".format(l): feature_info[l - 1]["reduction"] for l in out_levels} + self._out_feature_channels = { + "layer{}".format(l): feature_info[l - 1]["num_chs"] for l in out_levels + } + self._out_feature_strides = { + "layer{}".format(l): feature_info[l - 1]["reduction"] for l in out_levels + } self._size_divisibility = max(self._out_feature_strides.values()) if "resnet" in base_name: self.freeze(freeze_at) diff --git a/dimos/models/Detic/detic/modeling/debug.py b/dimos/models/Detic/detic/modeling/debug.py index 9ff2e65425..21136de2f0 100644 --- a/dimos/models/Detic/detic/modeling/debug.py +++ b/dimos/models/Detic/detic/modeling/debug.py @@ -11,7 +11,9 @@ def _get_color_image(heatmap): heatmap = heatmap.reshape(heatmap.shape[0], heatmap.shape[1], heatmap.shape[2], 1) if heatmap.shape[0] == 1: - color_map = (heatmap * np.ones((1, 1, 1, 3), np.uint8) * 255).max(axis=0).astype(np.uint8) # H, W, 3 + color_map = ( + (heatmap * np.ones((1, 1, 1, 3), np.uint8) * 255).max(axis=0).astype(np.uint8) + ) # H, W, 3 else: color_map = (heatmap * COLORS[: heatmap.shape[0]]).max(axis=0).astype(np.uint8) # H, W, 3 @@ -74,7 +76,15 @@ def _ind2il(ind, shapes_per_level, N): def debug_train( - images, gt_instances, flattened_hms, reg_targets, labels, pos_inds, shapes_per_level, locations, strides + images, + gt_instances, + flattened_hms, + reg_targets, + labels, + pos_inds, + shapes_per_level, + locations, + strides, ): """ images: N x 3 x H x W @@ -104,7 +114,12 @@ def debug_train( for j in range(len(bboxes)): bbox = bboxes[j] cv2.rectangle( - blend, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 3, cv2.LINE_AA + blend, + (int(bbox[0]), int(bbox[1])), + (int(bbox[2]), int(bbox[3])), + (0, 0, 255), + 3, + cv2.LINE_AA, ) for j in range(len(pos_inds)): @@ -112,7 +127,9 @@ def debug_train( if image_id != i: continue loc = locations[pos_inds[j]] - cv2.drawMarker(blend, (int(loc[0]), int(loc[1])), (0, 255, 255), markerSize=(l + 1) * 16) + cv2.drawMarker( + blend, (int(loc[0]), int(loc[1])), (0, 255, 255), markerSize=(l + 1) * 16 + ) for j in range(len(reg_inds)): image_id, l = _ind2il(reg_inds[j], shapes_per_level, N) @@ -123,7 +140,12 @@ def debug_train( loc = locations[reg_inds[j]] bbox = [(loc[0] - ltrb[0]), (loc[1] - ltrb[1]), (loc[0] + ltrb[2]), (loc[1] + ltrb[3])] cv2.rectangle( - blend, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255, 0, 0), 1, cv2.LINE_AA + blend, + (int(bbox[0]), int(bbox[1])), + (int(bbox[2]), int(bbox[3])), + (255, 0, 0), + 1, + cv2.LINE_AA, ) cv2.circle(blend, (int(loc[0]), int(loc[1])), 2, (255, 0, 0), -1) @@ -132,7 +154,14 @@ def debug_train( def debug_test( - images, logits_pred, reg_pred, agn_hm_pred=[], preds=[], vis_thresh=0.3, debug_show_name=False, mult_agn=False + images, + logits_pred, + reg_pred, + agn_hm_pred=[], + preds=[], + vis_thresh=0.3, + debug_show_name=False, + mult_agn=False, ): """ images: N x 3 x H x W @@ -170,7 +199,11 @@ def debug_test( cat2name = [x["name"] for x in LVIS_CATEGORIES] for j in range(len(preds[i].scores) if preds is not None else 0): if preds[i].scores[j] > vis_thresh: - bbox = preds[i].proposal_boxes[j] if preds[i].has("proposal_boxes") else preds[i].pred_boxes[j] + bbox = ( + preds[i].proposal_boxes[j] + if preds[i].has("proposal_boxes") + else preds[i].pred_boxes[j] + ) bbox = bbox.tensor[0].detach().cpu().numpy().astype(np.int32) cat = int(preds[i].pred_classes[j]) if preds[i].has("pred_classes") else 0 cl = COLORS[cat, 0, 0] @@ -183,7 +216,9 @@ def debug_test( cv2.LINE_AA, ) if debug_show_name: - txt = "{}{:.1f}".format(cat2name[cat] if cat > 0 else "", preds[i].scores[j]) + txt = "{}{:.1f}".format( + cat2name[cat] if cat > 0 else "", preds[i].scores[j] + ) font = cv2.FONT_HERSHEY_SIMPLEX cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0] cv2.rectangle( @@ -255,7 +290,14 @@ def debug_second_stage( bbox = bboxes[j] cl = COLORS[cats[j], 0, 0] cl = (int(cl[0]), int(cl[1]), int(cl[2])) - cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), cl, 2, cv2.LINE_AA) + cv2.rectangle( + image, + (int(bbox[0]), int(bbox[1])), + (int(bbox[2]), int(bbox[3])), + cl, + 2, + cv2.LINE_AA, + ) if debug_show_name: cat = cats[j] txt = "{}{:.1f}".format(cat2name[cat] if cat > 0 else "", scores[j]) @@ -279,7 +321,9 @@ def debug_second_stage( lineType=cv2.LINE_AA, ) if proposals is not None: - proposal_image = images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy() + proposal_image = ( + images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy() + ) if bgr: proposal_image = proposal_image.copy() else: @@ -305,7 +349,12 @@ def debug_second_stage( cl = (0, 0, 0xA4) th = 4 cv2.rectangle( - proposal_image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), cl, th, cv2.LINE_AA + proposal_image, + (int(bbox[0]), int(bbox[1])), + (int(bbox[2]), int(bbox[3])), + cl, + th, + cv2.LINE_AA, ) if selected[j] >= 0 and debug_show_name: cat = selected[j].item() diff --git a/dimos/models/Detic/detic/modeling/meta_arch/custom_rcnn.py b/dimos/models/Detic/detic/modeling/meta_arch/custom_rcnn.py index ebab5d5023..5711c87beb 100644 --- a/dimos/models/Detic/detic/modeling/meta_arch/custom_rcnn.py +++ b/dimos/models/Detic/detic/modeling/meta_arch/custom_rcnn.py @@ -134,13 +134,18 @@ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): caps = [x["captions"][ind] for ind, x in zip(inds, batched_inputs)] caption_features = self.text_encoder(caps).float() if self.sync_caption_batch: - caption_features = self._sync_caption_features(caption_features, ann_type, len(batched_inputs)) + caption_features = self._sync_caption_features( + caption_features, ann_type, len(batched_inputs) + ) if self.dynamic_classifier and ann_type != "caption": cls_inds = self._sample_cls_inds(gt_instances, ann_type) # inds, inv_inds ind_with_bg = cls_inds[0].tolist() + [-1] cls_features = ( - self.roi_heads.box_predictor[0].cls_score.zs_weight[:, ind_with_bg].permute(1, 0).contiguous() + self.roi_heads.box_predictor[0] + .cls_score.zs_weight[:, ind_with_bg] + .permute(1, 0) + .contiguous() ) classifier_info = cls_features, cls_inds, caption_features @@ -150,7 +155,12 @@ def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): proposals, detector_losses = self.roi_heads(images, features, proposals, gt_instances) else: proposals, detector_losses = self.roi_heads( - images, features, proposals, gt_instances, ann_type=ann_type, classifier_info=classifier_info + images, + features, + proposals, + gt_instances, + ann_type=ann_type, + classifier_info=classifier_info, ) if self.vis_period > 0: @@ -188,7 +198,9 @@ def _sync_caption_features(self, caption_features, ann_type, BS): caption_features = torch.cat([caption_features, rank], dim=1) global_caption_features = comm.all_gather(caption_features) caption_features = ( - torch.cat([x.to(self.device) for x in global_caption_features], dim=0) if has_caption_feature else None + torch.cat([x.to(self.device) for x in global_caption_features], dim=0) + if has_caption_feature + else None ) # (NB) x (D + 1) return caption_features @@ -199,7 +211,10 @@ def _sample_cls_inds(self, gt_instances, ann_type="box"): freq_weight = self.freq_weight else: gt_classes = torch.cat( - [torch.tensor(x._pos_category_ids, dtype=torch.long, device=x.gt_classes.device) for x in gt_instances] + [ + torch.tensor(x._pos_category_ids, dtype=torch.long, device=x.gt_classes.device) + for x in gt_instances + ] ) C = self.num_classes freq_weight = None diff --git a/dimos/models/Detic/detic/modeling/meta_arch/d2_deformable_detr.py b/dimos/models/Detic/detic/modeling/meta_arch/d2_deformable_detr.py index 99990f7e2d..636adb1f44 100644 --- a/dimos/models/Detic/detic/modeling/meta_arch/d2_deformable_detr.py +++ b/dimos/models/Detic/detic/modeling/meta_arch/d2_deformable_detr.py @@ -21,7 +21,9 @@ class CustomSetCriterion(SetCriterion): - def __init__(self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25, use_fed_loss=False): + def __init__( + self, num_classes, matcher, weight_dict, losses, focal_alpha=0.25, use_fed_loss=False + ): super().__init__(num_classes, matcher, weight_dict, losses, focal_alpha) self.use_fed_loss = use_fed_loss if self.use_fed_loss: @@ -36,7 +38,9 @@ def loss_labels(self, outputs, targets, indices, num_boxes, log=True): idx = self._get_src_permutation_idx(indices) target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) target_classes[idx] = target_classes_o target_classes_onehot = torch.zeros( @@ -67,7 +71,9 @@ def loss_labels(self, outputs, targets, indices, num_boxes, log=True): ) else: loss_ce = ( - sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + sigmoid_focal_loss( + src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2 + ) * src_logits.shape[1] ) losses = {"loss_ce": loss_ce} @@ -169,7 +175,9 @@ def __init__(self, cfg): if self.mask_on: assert 0, "Mask is not supported yet :(" - matcher = HungarianMatcher(cost_class=cls_weight, cost_bbox=l1_weight, cost_giou=giou_weight) + matcher = HungarianMatcher( + cost_class=cls_weight, cost_bbox=l1_weight, cost_giou=giou_weight + ) weight_dict = {"loss_ce": cls_weight, "loss_bbox": l1_weight} weight_dict["loss_giou"] = giou_weight if deep_supervision: @@ -212,13 +220,17 @@ def forward(self, batched_inputs): loss_dict[k] *= weight_dict[k] if self.with_image_labels: if batched_inputs[0]["ann_type"] in ["image", "captiontag"]: - loss_dict["loss_image"] = self.weak_weight * self._weak_loss(output, batched_inputs) + loss_dict["loss_image"] = self.weak_weight * self._weak_loss( + output, batched_inputs + ) else: loss_dict["loss_image"] = images[0].new_zeros([1], dtype=torch.float32)[0] # import pdb; pdb.set_trace() return loss_dict else: - image_sizes = output["pred_boxes"].new_tensor([(t["height"], t["width"]) for t in batched_inputs]) + image_sizes = output["pred_boxes"].new_tensor( + [(t["height"], t["width"]) for t in batched_inputs] + ) results = self.post_process(output, image_sizes) return results @@ -245,7 +257,9 @@ def post_process(self, outputs, target_sizes): assert target_sizes.shape[1] == 2 prob = out_logits.sigmoid() - topk_values, topk_indexes = torch.topk(prob.view(out_logits.shape[0], -1), self.test_topk, dim=1) + topk_values, topk_indexes = torch.topk( + prob.view(out_logits.shape[0], -1), self.test_topk, dim=1 + ) scores = topk_values topk_boxes = topk_indexes // out_logits.shape[2] labels = topk_indexes % out_logits.shape[2] @@ -299,5 +313,7 @@ def _max_size_loss(self, logits, boxes, label): target[:, label] = 1.0 sizes = boxes[..., 2] * boxes[..., 3] # L x N ind = sizes.argmax(dim=1) # L - loss = F.binary_cross_entropy_with_logits(logits[range(len(ind)), ind], target, reduction="sum") + loss = F.binary_cross_entropy_with_logits( + logits[range(len(ind)), ind], target, reduction="sum" + ) return loss diff --git a/dimos/models/Detic/detic/modeling/roi_heads/detic_fast_rcnn.py b/dimos/models/Detic/detic/modeling/roi_heads/detic_fast_rcnn.py index f64bd0128d..6d4d2e786e 100644 --- a/dimos/models/Detic/detic/modeling/roi_heads/detic_fast_rcnn.py +++ b/dimos/models/Detic/detic/modeling/roi_heads/detic_fast_rcnn.py @@ -85,7 +85,10 @@ def __init__( # assert self.num_classes == 11493 print("Extending federated loss weight") self.freq_weight = torch.cat( - [self.freq_weight, self.freq_weight.new_zeros(self.num_classes - len(self.freq_weight))] + [ + self.freq_weight, + self.freq_weight.new_zeros(self.num_classes - len(self.freq_weight)), + ] ) assert (not self.dynamic_classifier) or (not self.use_fed_loss) @@ -143,12 +146,16 @@ def from_config(cls, cfg, input_shape): ret["cls_score"] = ZeroShotClassifier(cfg, input_shape) return ret - def losses(self, predictions, proposals, use_advanced_loss=True, classifier_info=(None, None, None)): + def losses( + self, predictions, proposals, use_advanced_loss=True, classifier_info=(None, None, None) + ): """ enable advanced loss """ scores, proposal_deltas = predictions - gt_classes = cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0) + gt_classes = ( + cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0) + ) num_classes = self.num_classes if self.dynamic_classifier: _, cls_id_map = classifier_info[1] @@ -205,7 +212,9 @@ def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes): weight = weight * w.view(1, C).expand(B, C) # import pdb; pdb.set_trace() - cls_loss = F.binary_cross_entropy_with_logits(pred_class_logits[:, :-1], target, reduction="none") # B x C + cls_loss = F.binary_cross_entropy_with_logits( + pred_class_logits[:, :-1], target, reduction="none" + ) # B x C loss = torch.sum(cls_loss * weight) / B return loss @@ -217,8 +226,12 @@ def softmax_cross_entropy_loss(self, pred_class_logits, gt_classes): return pred_class_logits.new_zeros([1])[0] if self.ignore_zero_cats and (self.freq_weight is not None): - zero_weight = torch.cat([(self.freq_weight.view(-1) > 1e-4).float(), self.freq_weight.new_ones(1)]) # C + 1 - loss = F.cross_entropy(pred_class_logits, gt_classes, weight=zero_weight, reduction="mean") + zero_weight = torch.cat( + [(self.freq_weight.view(-1) > 1e-4).float(), self.freq_weight.new_ones(1)] + ) # C + 1 + loss = F.cross_entropy( + pred_class_logits, gt_classes, weight=zero_weight, reduction="mean" + ) elif self.use_fed_loss and (self.freq_weight is not None): # fedloss C = pred_class_logits.shape[1] - 1 appeared = get_fed_loss_inds( @@ -227,7 +240,9 @@ def softmax_cross_entropy_loss(self, pred_class_logits, gt_classes): appeared_mask = appeared.new_zeros(C + 1).float() appeared_mask[appeared] = 1.0 # C + 1 appeared_mask[C] = 1.0 - loss = F.cross_entropy(pred_class_logits, gt_classes, weight=appeared_mask, reduction="mean") + loss = F.cross_entropy( + pred_class_logits, gt_classes, weight=appeared_mask, reduction="mean" + ) else: loss = F.cross_entropy(pred_class_logits, gt_classes, reduction="mean") return loss @@ -242,16 +257,22 @@ def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes, num_cl if pred_deltas.shape[1] == box_dim: # cls-agnostic regression fg_pred_deltas = pred_deltas[fg_inds] else: - fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[fg_inds, gt_classes[fg_inds]] + fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[ + fg_inds, gt_classes[fg_inds] + ] if self.box_reg_loss_type == "smooth_l1": gt_pred_deltas = self.box2box_transform.get_deltas( proposal_boxes[fg_inds], gt_boxes[fg_inds], ) - loss_box_reg = smooth_l1_loss(fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="sum") + loss_box_reg = smooth_l1_loss( + fg_pred_deltas, gt_pred_deltas, self.smooth_l1_beta, reduction="sum" + ) elif self.box_reg_loss_type == "giou": - fg_pred_boxes = self.box2box_transform.apply_deltas(fg_pred_deltas, proposal_boxes[fg_inds]) + fg_pred_boxes = self.box2box_transform.apply_deltas( + fg_pred_deltas, proposal_boxes[fg_inds] + ) loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum") else: raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'") @@ -291,7 +312,12 @@ def predict_probs(self, predictions, proposals): return probs.split(num_inst_per_image, dim=0) def image_label_losses( - self, predictions, proposals, image_labels, classifier_info=(None, None, None), ann_type="image" + self, + predictions, + proposals, + image_labels, + classifier_info=(None, None, None), + ann_type="image", ): """ Inputs: @@ -314,7 +340,9 @@ def image_label_losses( storage = get_event_storage() loss = scores[0].new_zeros([1])[0] caption_loss = scores[0].new_zeros([1])[0] - for idx, (score, labels, prop_score, p) in enumerate(zip(scores, image_labels, prop_scores, proposals)): + for idx, (score, labels, prop_score, p) in enumerate( + zip(scores, image_labels, prop_scores, proposals) + ): if score.shape[0] == 0: loss += score.new_zeros([1])[0] continue @@ -354,13 +382,19 @@ def image_label_losses( p.selected[ind_i] = label else: img_box_count = ind - select_size_count = p[ind].proposal_boxes.area() / (p.image_size[0] * p.image_size[1]) + select_size_count = p[ind].proposal_boxes.area() / ( + p.image_size[0] * p.image_size[1] + ) max_score_count = score[ind, label].sigmoid() select_x_count = ( - (p.proposal_boxes.tensor[ind, 0] + p.proposal_boxes.tensor[ind, 2]) / 2 / p.image_size[1] + (p.proposal_boxes.tensor[ind, 0] + p.proposal_boxes.tensor[ind, 2]) + / 2 + / p.image_size[1] ) select_y_count = ( - (p.proposal_boxes.tensor[ind, 1] + p.proposal_boxes.tensor[ind, 3]) / 2 / p.image_size[0] + (p.proposal_boxes.tensor[ind, 1] + p.proposal_boxes.tensor[ind, 3]) + / 2 + / p.image_size[0] ) if self.debug: p.selected[ind] = label @@ -423,7 +457,9 @@ def _caption_loss(self, score, classifier_info, idx, B): score, caption_score = score.split([cls_and_cap_num - cap_num, cap_num], dim=1) # n x (C + 1), n x B caption_score = caption_score[-1:] # 1 x B # -1: image level box - caption_target = caption_score.new_zeros(caption_score.shape) # 1 x B or 1 x MB, M: num machines + caption_target = caption_score.new_zeros( + caption_score.shape + ) # 1 x B or 1 x MB, M: num machines if self.sync_caption_batch: # caption_target: 1 x MB rank = comm.get_rank() @@ -439,7 +475,9 @@ def _caption_loss(self, score, classifier_info, idx, B): else: assert caption_score.shape[1] == B caption_target[:, idx] = 1.0 - caption_loss_img = F.binary_cross_entropy_with_logits(caption_score, caption_target, reduction="none") + caption_loss_img = F.binary_cross_entropy_with_logits( + caption_score, caption_target, reduction="none" + ) if self.sync_caption_batch: fg_mask = (caption_target > 0.5).float() assert (fg_mask.sum().item() - 1.0) ** 2 < 1e-8, "{} {}".format(fg_mask.shape, fg_mask) @@ -503,7 +541,9 @@ def _max_size_loss(self, score, label, p): ind = sizes[:-1].argmax().item() if len(sizes) > 1 else 0 if self.softmax_weak_loss: loss += F.cross_entropy( - score[ind : ind + 1], score.new_tensor(label, dtype=torch.long).view(1), reduction="sum" + score[ind : ind + 1], + score.new_tensor(label, dtype=torch.long).view(1), + reduction="sum", ) else: loss += F.binary_cross_entropy_with_logits(score[ind], target, reduction="sum") @@ -513,7 +553,9 @@ def _max_size_loss(self, score, label, p): def put_label_distribution(storage, hist_name, hist_counts, num_classes): """ """ ht_min, ht_max = 0, num_classes - hist_edges = torch.linspace(start=ht_min, end=ht_max, steps=num_classes + 1, dtype=torch.float32) + hist_edges = torch.linspace( + start=ht_min, end=ht_max, steps=num_classes + 1, dtype=torch.float32 + ) hist_params = dict( tag=hist_name, diff --git a/dimos/models/Detic/detic/modeling/roi_heads/detic_roi_heads.py b/dimos/models/Detic/detic/modeling/roi_heads/detic_roi_heads.py index 4f1c0f6ac7..8fa0e3f538 100644 --- a/dimos/models/Detic/detic/modeling/roi_heads/detic_roi_heads.py +++ b/dimos/models/Detic/detic/modeling/roi_heads/detic_roi_heads.py @@ -12,7 +12,6 @@ from .detic_fast_rcnn import DeticFastRCNNOutputLayers - @ROI_HEADS_REGISTRY.register() class DeticCascadeROIHeads(CascadeROIHeads): @configurable @@ -65,13 +64,17 @@ def _init_box_head(self, cfg, input_shape): for box_head, bbox_reg_weights in zip(ret["box_heads"], cascade_bbox_reg_weights): box_predictors.append( DeticFastRCNNOutputLayers( - cfg, box_head.output_shape, box2box_transform=Box2BoxTransform(weights=bbox_reg_weights) + cfg, + box_head.output_shape, + box2box_transform=Box2BoxTransform(weights=bbox_reg_weights), ) ) ret["box_predictors"] = box_predictors return ret - def _forward_box(self, features, proposals, targets=None, ann_type="box", classifier_info=(None, None, None)): + def _forward_box( + self, features, proposals, targets=None, ann_type="box", classifier_info=(None, None, None) + ): """ Add mult proposal scores at testing Add ann_type @@ -95,7 +98,9 @@ def _forward_box(self, features, proposals, targets=None, ann_type="box", classi if self.training and ann_type in ["box"]: proposals = self._match_and_label_boxes(proposals, k, targets) predictions = self._run_stage(features, proposals, k, classifier_info=classifier_info) - prev_pred_boxes = self.box_predictor[k].predict_boxes((predictions[0], predictions[1]), proposals) + prev_pred_boxes = self.box_predictor[k].predict_boxes( + (predictions[0], predictions[1]), proposals + ) head_outputs.append((self.box_predictor[k], predictions, proposals)) if self.training: @@ -108,12 +113,18 @@ def _forward_box(self, features, proposals, targets=None, ann_type="box", classi if ann_type in ["image", "caption", "captiontag"]: image_labels = [x._pos_category_ids for x in targets] weak_losses = predictor.image_label_losses( - predictions, proposals, image_labels, classifier_info=classifier_info, ann_type=ann_type + predictions, + proposals, + image_labels, + classifier_info=classifier_info, + ann_type=ann_type, ) stage_losses.update(weak_losses) else: # supervised stage_losses = predictor.losses( - (predictions[0], predictions[1]), proposals, classifier_info=classifier_info + (predictions[0], predictions[1]), + proposals, + classifier_info=classifier_info, ) if self.with_image_labels: stage_losses["image_loss"] = predictions[0].new_zeros([1])[0] @@ -142,7 +153,15 @@ def _forward_box(self, features, proposals, targets=None, ann_type="box", classi ) return pred_instances - def forward(self, images, features, proposals, targets=None, ann_type="box", classifier_info=(None, None, None)): + def forward( + self, + images, + features, + proposals, + targets=None, + ann_type="box", + classifier_info=(None, None, None), + ): """ enable debug and image labels classifier_info is shared across the batch @@ -153,14 +172,18 @@ def forward(self, images, features, proposals, targets=None, ann_type="box", cla else: proposals = self.get_top_proposals(proposals) - losses = self._forward_box(features, proposals, targets, ann_type=ann_type, classifier_info=classifier_info) + losses = self._forward_box( + features, proposals, targets, ann_type=ann_type, classifier_info=classifier_info + ) if ann_type == "box" and targets[0].has("gt_masks"): mask_losses = self._forward_mask(features, proposals) losses.update({k: v * self.mask_weight for k, v in mask_losses.items()}) losses.update(self._forward_keypoint(features, proposals)) else: losses.update( - self._get_empty_mask_loss(features, proposals, device=proposals[0].objectness_logits.device) + self._get_empty_mask_loss( + features, proposals, device=proposals[0].objectness_logits.device + ) ) return proposals, losses else: @@ -185,7 +208,12 @@ def _add_image_box(self, p): f = self.image_box_size image_box.proposal_boxes = Boxes( p.proposal_boxes.tensor.new_tensor( - [w * (1.0 - f) / 2.0, h * (1.0 - f) / 2.0, w * (1.0 - (1.0 - f) / 2.0), h * (1.0 - (1.0 - f) / 2.0)] + [ + w * (1.0 - f) / 2.0, + h * (1.0 - f) / 2.0, + w * (1.0 - (1.0 - f) / 2.0), + h * (1.0 - (1.0 - f) / 2.0), + ] ).view(n, 4) ) image_box.objectness_logits = p.objectness_logits.new_ones(n) diff --git a/dimos/models/Detic/detic/modeling/roi_heads/res5_roi_heads.py b/dimos/models/Detic/detic/modeling/roi_heads/res5_roi_heads.py index 8b0917fb4c..d05a5d0537 100644 --- a/dimos/models/Detic/detic/modeling/roi_heads/res5_roi_heads.py +++ b/dimos/models/Detic/detic/modeling/roi_heads/res5_roi_heads.py @@ -11,7 +11,6 @@ from ..debug import debug_second_stage - @ROI_HEADS_REGISTRY.register() class CustomRes5ROIHeads(Res5ROIHeads): @configurable @@ -26,15 +25,21 @@ def __init__(self, **kwargs): self.add_image_box = cfg.MODEL.ROI_BOX_HEAD.ADD_IMAGE_BOX self.add_feature_to_prop = cfg.MODEL.ROI_BOX_HEAD.ADD_FEATURE_TO_PROP self.image_box_size = cfg.MODEL.ROI_BOX_HEAD.IMAGE_BOX_SIZE - self.box_predictor = DeticFastRCNNOutputLayers(cfg, ShapeSpec(channels=out_channels, height=1, width=1)) + self.box_predictor = DeticFastRCNNOutputLayers( + cfg, ShapeSpec(channels=out_channels, height=1, width=1) + ) self.save_debug = cfg.SAVE_DEBUG self.save_debug_path = cfg.SAVE_DEBUG_PATH if self.save_debug: self.debug_show_name = cfg.DEBUG_SHOW_NAME self.vis_thresh = cfg.VIS_THRESH - self.pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) - self.pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + self.pixel_mean = ( + torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + ) + self.pixel_std = ( + torch.Tensor(cfg.MODEL.PIXEL_STD).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + ) self.bgr = cfg.INPUT.FORMAT == "BGR" @classmethod @@ -43,7 +48,15 @@ def from_config(cls, cfg, input_shape): ret["cfg"] = cfg return ret - def forward(self, images, features, proposals, targets=None, ann_type="box", classifier_info=(None, None, None)): + def forward( + self, + images, + features, + proposals, + targets=None, + ann_type="box", + classifier_info=(None, None, None), + ): """ enable debug and image labels classifier_info is shared across the batch @@ -58,11 +71,17 @@ def forward(self, images, features, proposals, targets=None, ann_type="box", cla proposals = self.get_top_proposals(proposals) proposal_boxes = [x.proposal_boxes for x in proposals] - box_features = self._shared_roi_transform([features[f] for f in self.in_features], proposal_boxes) - predictions = self.box_predictor(box_features.mean(dim=[2, 3]), classifier_info=classifier_info) + box_features = self._shared_roi_transform( + [features[f] for f in self.in_features], proposal_boxes + ) + predictions = self.box_predictor( + box_features.mean(dim=[2, 3]), classifier_info=classifier_info + ) if self.add_feature_to_prop: - feats_per_image = box_features.mean(dim=[2, 3]).split([len(p) for p in proposals], dim=0) + feats_per_image = box_features.mean(dim=[2, 3]).split( + [len(p) for p in proposals], dim=0 + ) for feat, p in zip(feats_per_image, proposals): p.feat = feat @@ -71,7 +90,11 @@ def forward(self, images, features, proposals, targets=None, ann_type="box", cla if ann_type != "box": image_labels = [x._pos_category_ids for x in targets] losses = self.box_predictor.image_label_losses( - predictions, proposals, image_labels, classifier_info=classifier_info, ann_type=ann_type + predictions, + proposals, + image_labels, + classifier_info=classifier_info, + ann_type=ann_type, ) else: losses = self.box_predictor.losses((predictions[0], predictions[1]), proposals) @@ -131,11 +154,18 @@ def _add_image_box(self, p, use_score=False): f = self.image_box_size image_box.proposal_boxes = Boxes( p.proposal_boxes.tensor.new_tensor( - [w * (1.0 - f) / 2.0, h * (1.0 - f) / 2.0, w * (1.0 - (1.0 - f) / 2.0), h * (1.0 - (1.0 - f) / 2.0)] + [ + w * (1.0 - f) / 2.0, + h * (1.0 - f) / 2.0, + w * (1.0 - (1.0 - f) / 2.0), + h * (1.0 - (1.0 - f) / 2.0), + ] ).view(n, 4) ) else: - image_box.proposal_boxes = Boxes(p.proposal_boxes.tensor.new_tensor([0, 0, w, h]).view(n, 4)) + image_box.proposal_boxes = Boxes( + p.proposal_boxes.tensor.new_tensor([0, 0, w, h]).view(n, 4) + ) if use_score: image_box.scores = p.objectness_logits.new_ones(n) image_box.pred_classes = p.objectness_logits.new_zeros(n, dtype=torch.long) diff --git a/dimos/models/Detic/detic/modeling/roi_heads/zero_shot_classifier.py b/dimos/models/Detic/detic/modeling/roi_heads/zero_shot_classifier.py index 4bd24680a1..7dfe0d7097 100644 --- a/dimos/models/Detic/detic/modeling/roi_heads/zero_shot_classifier.py +++ b/dimos/models/Detic/detic/modeling/roi_heads/zero_shot_classifier.py @@ -37,8 +37,14 @@ def __init__( zs_weight = torch.randn((zs_weight_dim, num_classes)) nn.init.normal_(zs_weight, std=0.01) else: - zs_weight = torch.tensor(np.load(zs_weight_path), dtype=torch.float32).permute(1, 0).contiguous() # D x C - zs_weight = torch.cat([zs_weight, zs_weight.new_zeros((zs_weight_dim, 1))], dim=1) # D x (C + 1) + zs_weight = ( + torch.tensor(np.load(zs_weight_path), dtype=torch.float32) + .permute(1, 0) + .contiguous() + ) # D x C + zs_weight = torch.cat( + [zs_weight, zs_weight.new_zeros((zs_weight_dim, 1))], dim=1 + ) # D x (C + 1) if self.norm_weight: zs_weight = F.normalize(zs_weight, p=2, dim=0) diff --git a/dimos/models/Detic/detic/modeling/text/text_encoder.py b/dimos/models/Detic/detic/modeling/text/text_encoder.py index 1ad3bd6f08..ff58592bd8 100644 --- a/dimos/models/Detic/detic/modeling/text/text_encoder.py +++ b/dimos/models/Detic/detic/modeling/text/text_encoder.py @@ -47,7 +47,11 @@ def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): self.attn_mask = attn_mask def attention(self, x: torch.Tensor): - self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None + self.attn_mask = ( + self.attn_mask.to(dtype=x.dtype, device=x.device) + if self.attn_mask is not None + else None + ) return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] def forward(self, x: torch.Tensor): @@ -61,7 +65,9 @@ def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor super().__init__() self.width = width self.layers = layers - self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) + self.resblocks = nn.Sequential( + *[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)] + ) def forward(self, x: torch.Tensor): return self.resblocks(x) @@ -92,7 +98,9 @@ def __init__( self.vocab_size = vocab_size self.token_embedding = nn.Embedding(vocab_size, transformer_width) - self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width) + ) self.ln_final = LayerNorm(transformer_width) self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) diff --git a/dimos/models/Detic/detic/modeling/utils.py b/dimos/models/Detic/detic/modeling/utils.py index 4f93ec4256..a028e9246d 100644 --- a/dimos/models/Detic/detic/modeling/utils.py +++ b/dimos/models/Detic/detic/modeling/utils.py @@ -29,10 +29,14 @@ def reset_cls_test(model, cls_path, num_classes): model.roi_heads.num_classes = num_classes if type(cls_path) == str: print("Resetting zs_weight", cls_path) - zs_weight = torch.tensor(np.load(cls_path), dtype=torch.float32).permute(1, 0).contiguous() # D x C + zs_weight = ( + torch.tensor(np.load(cls_path), dtype=torch.float32).permute(1, 0).contiguous() + ) # D x C else: zs_weight = cls_path - zs_weight = torch.cat([zs_weight, zs_weight.new_zeros((zs_weight.shape[0], 1))], dim=1) # D x (C + 1) + zs_weight = torch.cat( + [zs_weight, zs_weight.new_zeros((zs_weight.shape[0], 1))], dim=1 + ) # D x (C + 1) if model.roi_heads.box_predictor[0].cls_score.norm_weight: zs_weight = F.normalize(zs_weight, p=2, dim=0) zs_weight = zs_weight.to(model.device) diff --git a/dimos/models/Detic/detic/predictor.py b/dimos/models/Detic/detic/predictor.py index 35035d3589..9985c2d854 100644 --- a/dimos/models/Detic/detic/predictor.py +++ b/dimos/models/Detic/detic/predictor.py @@ -85,10 +85,14 @@ def run_on_image(self, image): visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) if "panoptic_seg" in predictions: panoptic_seg, segments_info = predictions["panoptic_seg"] - vis_output = visualizer.draw_panoptic_seg_predictions(panoptic_seg.to(self.cpu_device), segments_info) + vis_output = visualizer.draw_panoptic_seg_predictions( + panoptic_seg.to(self.cpu_device), segments_info + ) else: if "sem_seg" in predictions: - vis_output = visualizer.draw_sem_seg(predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)) + vis_output = visualizer.draw_sem_seg( + predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) + ) if "instances" in predictions: instances = predictions["instances"].to(self.cpu_device) vis_output = visualizer.draw_instance_predictions(predictions=instances) @@ -201,7 +205,9 @@ def __init__(self, cfg, num_gpus: int = 1): cfg = cfg.clone() cfg.defrost() cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" - self.procs.append(AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)) + self.procs.append( + AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) + ) self.put_idx = 0 self.get_idx = 0 diff --git a/dimos/models/Detic/lazy_train_net.py b/dimos/models/Detic/lazy_train_net.py index 79f137cc49..d6c4e7e841 100644 --- a/dimos/models/Detic/lazy_train_net.py +++ b/dimos/models/Detic/lazy_train_net.py @@ -35,7 +35,9 @@ def do_test(cfg, model): if "evaluator" in cfg.dataloader: - ret = inference_on_dataset(model, instantiate(cfg.dataloader.test), instantiate(cfg.dataloader.evaluator)) + ret = inference_on_dataset( + model, instantiate(cfg.dataloader.test), instantiate(cfg.dataloader.evaluator) + ) print_csv_format(ret) return ret @@ -80,7 +82,9 @@ def do_train(args, cfg): train_hooks = [ hooks.IterationTimer(), hooks.LRScheduler(scheduler=instantiate(cfg.lr_multiplier)), - hooks.PeriodicCheckpointer(checkpointer, **cfg.train.checkpointer) if comm.is_main_process() else None, + hooks.PeriodicCheckpointer(checkpointer, **cfg.train.checkpointer) + if comm.is_main_process() + else None, hooks.EvalHook(cfg.train.eval_period, lambda: do_test(cfg, model)), hooks.PeriodicWriter( default_writers(cfg.train.output_dir, cfg.train.max_iter), diff --git a/dimos/models/Detic/predict.py b/dimos/models/Detic/predict.py index 82e20db6c3..4091bec3b9 100644 --- a/dimos/models/Detic/predict.py +++ b/dimos/models/Detic/predict.py @@ -81,7 +81,9 @@ def predict(self, image, vocabulary, custom_vocabulary): # Reset visualization threshold output_score_threshold = 0.3 for cascade_stages in range(len(self.predictor.model.roi_heads.box_predictor)): - self.predictor.model.roi_heads.box_predictor[cascade_stages].test_score_thresh = output_score_threshold + self.predictor.model.roi_heads.box_predictor[ + cascade_stages + ].test_score_thresh = output_score_threshold outputs = self.predictor(image) v = Visualizer(image[:, :, ::-1], metadata) diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/data/custom_build_augmentation.py b/dimos/models/Detic/third_party/CenterNet2/centernet/data/custom_build_augmentation.py index 13afd5ab12..72e399fa40 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/data/custom_build_augmentation.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/data/custom_build_augmentation.py @@ -1,5 +1,3 @@ - - from detectron2.data import transforms as T from .transforms.custom_augmentation_impl import EfficientDetResizeCrop diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/data/custom_dataset_dataloader.py b/dimos/models/Detic/third_party/CenterNet2/centernet/data/custom_dataset_dataloader.py index c18adf87bb..b8776789cf 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/data/custom_dataset_dataloader.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/data/custom_dataset_dataloader.py @@ -30,7 +30,9 @@ def build_custom_train_loader(cfg, mapper=None): dataset_dicts = get_detection_dataset_dicts_with_source( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, - min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, + min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + if cfg.MODEL.KEYPOINT_ON + else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) sizes = [0 for _ in range(len(cfg.DATASETS.TRAIN))] @@ -41,7 +43,9 @@ def build_custom_train_loader(cfg, mapper=None): dataset_dicts = get_detection_dataset_dicts( cfg.DATASETS.TRAIN, filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, - min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE if cfg.MODEL.KEYPOINT_ON else 0, + min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + if cfg.MODEL.KEYPOINT_ON + else 0, proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, ) dataset = DatasetFromList(dataset_dicts, copy=False) @@ -123,7 +127,9 @@ def _get_class_balance_factor(self, dataset_dicts, l=1.0): return torch.tensor(ret).float() -def get_detection_dataset_dicts_with_source(dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None): +def get_detection_dataset_dicts_with_source( + dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None +): assert len(dataset_names) dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names] for dataset_name, dicts in zip(dataset_names, dataset_dicts): @@ -168,7 +174,9 @@ def __init__(self, cfg, sizes, dataset_dicts, seed: Optional[int] = None): dataset_ratio = cfg.DATALOADER.DATASET_RATIO self._batch_size = cfg.SOLVER.IMS_PER_BATCH assert len(dataset_ratio) == len(sizes), ( - "length of dataset ratio {} should be equal to number if dataset {}".format(len(dataset_ratio), len(sizes)) + "length of dataset ratio {} should be equal to number if dataset {}".format( + len(dataset_ratio), len(sizes) + ) ) if seed is None: seed = comm.shared_random_seed() @@ -177,7 +185,9 @@ def __init__(self, cfg, sizes, dataset_dicts, seed: Optional[int] = None): self._world_size = comm.get_world_size() self._ims_per_gpu = self._batch_size // self._world_size - self.dataset_ids = torch.tensor([d["dataset_source"] for d in dataset_dicts], dtype=torch.long) + self.dataset_ids = torch.tensor( + [d["dataset_source"] for d in dataset_dicts], dtype=torch.long + ) dataset_weight = [ torch.ones(s) * max(sizes) / s * r / sum(dataset_ratio) @@ -195,7 +205,9 @@ def _infinite_indices(self): g = torch.Generator() g.manual_seed(self._seed) while True: - ids = torch.multinomial(self.weights, self.sample_epoch_size, generator=g, replacement=True) + ids = torch.multinomial( + self.weights, self.sample_epoch_size, generator=g, replacement=True + ) nums = [(self.dataset_ids[ids] == i).sum().int().item() for i in range(len(self.sizes))] print("_rank, len, nums", self._rank, len(ids), nums, flush=True) # print('_rank, len, nums, self.dataset_ids[ids[:10]], ', diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/coco.py b/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/coco.py index de23b1e5a0..93f0a13428 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/coco.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/coco.py @@ -14,11 +14,15 @@ def register_distill_coco_instances(name, metadata, json_file, image_root): assert isinstance(json_file, (str, os.PathLike)), json_file assert isinstance(image_root, (str, os.PathLike)), image_root # 1. register a function which returns dicts - DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name, extra_annotation_keys=["score"])) + DatasetCatalog.register( + name, lambda: load_coco_json(json_file, image_root, name, extra_annotation_keys=["score"]) + ) # 2. Optionally, add metadata about this dataset, # since they might be useful in evaluation, visualization or logging - MetadataCatalog.get(name).set(json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata) + MetadataCatalog.get(name).set( + json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata + ) _PREDEFINED_SPLITS_COCO = { @@ -34,7 +38,10 @@ def register_distill_coco_instances(name, metadata, json_file, image_root): ) _PREDEFINED_SPLITS_DISTILL_COCO = { - "coco_un_yolov4_55_0.5": ("coco/unlabeled2017", "coco/annotations/yolov4_cocounlabeled_55_ann0.5.json"), + "coco_un_yolov4_55_0.5": ( + "coco/unlabeled2017", + "coco/annotations/yolov4_cocounlabeled_55_ann0.5.json", + ), } for key, (image_root, json_file) in _PREDEFINED_SPLITS_DISTILL_COCO.items(): diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/nuimages.py b/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/nuimages.py index 965a0d1f4b..22b80828c0 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/nuimages.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/nuimages.py @@ -19,7 +19,10 @@ def _get_builtin_metadata(): id_to_name = {x["id"]: x["name"] for x in categories} thing_dataset_id_to_contiguous_id = {i: i for i in range(len(categories))} thing_classes = [id_to_name[k] for k in sorted(id_to_name)] - return {"thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes} + return { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + } _PREDEFINED_SPLITS = { diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/objects365.py b/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/objects365.py index ae2296e65f..22a017444f 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/objects365.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/data/datasets/objects365.py @@ -377,7 +377,10 @@ def _get_builtin_metadata(version): assert 0, version thing_dataset_id_to_contiguous_id = {i + 1: i for i in range(365)} thing_classes = [id_to_name[k] for k in sorted(id_to_name)] - return {"thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, "thing_classes": thing_classes} + return { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + } _PREDEFINED_SPLITS_OBJECTS365 = { diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/bifpn.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/bifpn.py index 7ea340f2a4..dd66c1f0c3 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/bifpn.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/bifpn.py @@ -277,7 +277,10 @@ def forward(self, x): elif self.weight_method == "fastattn": edge_weights = nn.functional.relu(self.edge_weights.type(dtype)) weights_sum = torch.sum(edge_weights) - x = torch.stack([(nodes[i] * edge_weights[i]) / (weights_sum + 0.0001) for i in range(len(nodes))], dim=-1) + x = torch.stack( + [(nodes[i] * edge_weights[i]) / (weights_sum + 0.0001) for i in range(len(nodes))], + dim=-1, + ) elif self.weight_method == "sum": x = torch.stack(nodes, dim=-1) else: @@ -351,7 +354,9 @@ def __init__( norm=norm, act_layer=conv_act, ) - after_combine["conv"] = SeparableConv2d(**conv_kwargs) if separable_conv else ConvBnAct2d(**conv_kwargs) + after_combine["conv"] = ( + SeparableConv2d(**conv_kwargs) if separable_conv else ConvBnAct2d(**conv_kwargs) + ) fnode_layers["after_combine"] = nn.Sequential(after_combine) self.fnode.add_module(str(i), nn.Sequential(fnode_layers)) @@ -401,7 +406,8 @@ def __init__( # print('self._out_feature_channels', self._out_feature_channels) feature_info = [ - {"num_chs": in_channels[level], "reduction": in_strides[level]} for level in range(len(self.in_features)) + {"num_chs": in_channels[level], "reduction": in_strides[level]} + for level in range(len(self.in_features)) ] # self.config = config fpn_config = get_fpn_config() diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/bifpn_fcos.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/bifpn_fcos.py index 7df59bcd49..67c7b67b9e 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/bifpn_fcos.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/bifpn_fcos.py @@ -61,8 +61,12 @@ def __init__(self, backbone, out_channels, num_top_levels, norm=""): self.backbone = backbone backbone_output_shape = backbone.output_shape() - self._out_feature_channels = {name: shape.channels for name, shape in backbone_output_shape.items()} - self._out_feature_strides = {name: shape.stride for name, shape in backbone_output_shape.items()} + self._out_feature_channels = { + name: shape.channels for name, shape in backbone_output_shape.items() + } + self._out_feature_strides = { + name: shape.stride for name, shape in backbone_output_shape.items() + } self._out_features = list(self._out_feature_strides.keys()) last_feature_name = max(self._out_feature_strides.keys(), key=lambda x: split_name(x)[1]) @@ -154,7 +158,9 @@ def __init__(self, in_channels_list, out_channels, norm=""): in_channels = node_info[input_offset] if in_channels != out_channels: - lateral_conv = Conv2d(in_channels, out_channels, kernel_size=1, norm=get_norm(norm, out_channels)) + lateral_conv = Conv2d( + in_channels, out_channels, kernel_size=1, norm=get_norm(norm, out_channels) + ) self.add_module("lateral_{}_f{}".format(input_offset, feat_level), lateral_conv) node_info.append(out_channels) num_output_connections.append(0) @@ -162,7 +168,10 @@ def __init__(self, in_channels_list, out_channels, norm=""): # generate attention weights name = "weights_f{}_{}".format(feat_level, inputs_offsets_str) self.__setattr__( - name, nn.Parameter(torch.ones(len(inputs_offsets), dtype=torch.float32), requires_grad=True) + name, + nn.Parameter( + torch.ones(len(inputs_offsets), dtype=torch.float32), requires_grad=True + ), ) # generate convolutions after combination @@ -223,7 +232,9 @@ def forward(self, feats): ) elif h <= target_h and w <= target_w: if h < target_h or w < target_w: - input_node = F.interpolate(input_node, size=(target_h, target_w), mode="nearest") + input_node = F.interpolate( + input_node, size=(target_h, target_w), mode="nearest" + ) else: raise NotImplementedError() input_nodes.append(input_node) @@ -340,7 +351,9 @@ def _assert_strides_are_log2_contiguous(strides): Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2". """ for i, stride in enumerate(strides[1:], 1): - assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format(stride, strides[i - 1]) + assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format( + stride, strides[i - 1] + ) @BACKBONE_REGISTRY.register() diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/dla.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/dla.py index f652781b34..1cb2fa51e8 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/dla.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/dla.py @@ -42,11 +42,19 @@ class BasicBlock(nn.Module): def __init__(self, inplanes, planes, stride=1, dilation=1, norm="BN"): super(BasicBlock, self).__init__() self.conv1 = nn.Conv2d( - inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias=False, + dilation=dilation, ) self.bn1 = get_norm(norm, planes) self.relu = nn.ReLU(inplace=True) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=dilation, bias=False, dilation=dilation) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, stride=1, padding=dilation, bias=False, dilation=dilation + ) self.bn2 = get_norm(norm, planes) self.stride = stride @@ -77,7 +85,13 @@ def __init__(self, inplanes, planes, stride=1, dilation=1, norm="BN"): self.conv1 = nn.Conv2d(inplanes, bottle_planes, kernel_size=1, bias=False) self.bn1 = get_norm(norm, bottle_planes) self.conv2 = nn.Conv2d( - bottle_planes, bottle_planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation + bottle_planes, + bottle_planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias=False, + dilation=dilation, ) self.bn2 = get_norm(norm, bottle_planes) self.conv3 = nn.Conv2d(bottle_planes, planes, kernel_size=1, bias=False) @@ -109,7 +123,9 @@ def forward(self, x, residual=None): class Root(nn.Module): def __init__(self, in_channels, out_channels, kernel_size, residual, norm="BN"): super(Root, self).__init__() - self.conv = nn.Conv2d(in_channels, out_channels, 1, stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.conv = nn.Conv2d( + in_channels, out_channels, 1, stride=1, bias=False, padding=(kernel_size - 1) // 2 + ) self.bn = get_norm(norm, out_channels) self.relu = nn.ReLU(inplace=True) self.residual = residual @@ -183,7 +199,8 @@ def __init__( self.downsample = nn.MaxPool2d(stride, stride=stride) if in_channels != out_channels: self.project = nn.Sequential( - nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False), get_norm(norm, out_channels) + nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False), + get_norm(norm, out_channels), ) def forward(self, x, residual=None, children=None): @@ -203,7 +220,9 @@ def forward(self, x, residual=None, children=None): class DLA(nn.Module): - def __init__(self, num_layers, levels, channels, block=BasicBlock, residual_root=False, norm="BN"): + def __init__( + self, num_layers, levels, channels, block=BasicBlock, residual_root=False, norm="BN" + ): """ Args: """ @@ -218,24 +237,56 @@ def __init__(self, num_layers, levels, channels, block=BasicBlock, residual_root self.level0 = self._make_conv_level(channels[0], channels[0], levels[0]) self.level1 = self._make_conv_level(channels[0], channels[1], levels[1], stride=2) self.level2 = Tree( - levels[2], block, channels[1], channels[2], 2, level_root=False, root_residual=residual_root, norm=norm + levels[2], + block, + channels[1], + channels[2], + 2, + level_root=False, + root_residual=residual_root, + norm=norm, ) self.level3 = Tree( - levels[3], block, channels[2], channels[3], 2, level_root=True, root_residual=residual_root, norm=norm + levels[3], + block, + channels[2], + channels[3], + 2, + level_root=True, + root_residual=residual_root, + norm=norm, ) self.level4 = Tree( - levels[4], block, channels[3], channels[4], 2, level_root=True, root_residual=residual_root, norm=norm + levels[4], + block, + channels[3], + channels[4], + 2, + level_root=True, + root_residual=residual_root, + norm=norm, ) self.level5 = Tree( - levels[5], block, channels[4], channels[5], 2, level_root=True, root_residual=residual_root, norm=norm + levels[5], + block, + channels[4], + channels[5], + 2, + level_root=True, + root_residual=residual_root, + norm=norm, + ) + self.load_pretrained_model( + data="imagenet", name="dla{}".format(num_layers), hash=HASH[num_layers] ) - self.load_pretrained_model(data="imagenet", name="dla{}".format(num_layers), hash=HASH[num_layers]) def load_pretrained_model(self, data, name, hash): model_url = get_model_url(data, name, hash) model_weights = model_zoo.load_url(model_url) num_classes = len(model_weights[list(model_weights.keys())[-1]]) - self.fc = nn.Conv2d(self.channels[-1], num_classes, kernel_size=1, stride=1, padding=0, bias=True) + self.fc = nn.Conv2d( + self.channels[-1], num_classes, kernel_size=1, stride=1, padding=0, bias=True + ) print("Loading pretrained") self.load_state_dict(model_weights, strict=False) @@ -286,7 +337,9 @@ def __init__(self, chi, cho, norm="BN"): self.actf = nn.Sequential(get_norm(norm, cho), nn.ReLU(inplace=True)) if DCNV1: self.offset = Conv2d(chi, 18, kernel_size=3, stride=1, padding=1, dilation=1) - self.conv = DeformConv(chi, cho, kernel_size=(3, 3), stride=1, padding=1, dilation=1, deformable_groups=1) + self.conv = DeformConv( + chi, cho, kernel_size=(3, 3), stride=1, padding=1, dilation=1, deformable_groups=1 + ) else: self.offset = Conv2d(chi, 27, kernel_size=3, stride=1, padding=1, dilation=1) self.conv = ModulatedDeformConv( @@ -318,7 +371,9 @@ def __init__(self, o, channels, up_f, norm="BN"): proj = _DeformConv(c, o, norm=norm) node = _DeformConv(o, o, norm=norm) - up = nn.ConvTranspose2d(o, o, f * 2, stride=f, padding=f // 2, output_padding=0, groups=o, bias=False) + up = nn.ConvTranspose2d( + o, o, f * 2, stride=f, padding=f // 2, output_padding=0, groups=o, bias=False + ) fill_up_weights(up) setattr(self, "proj_" + str(i), proj) @@ -345,7 +400,11 @@ def __init__(self, startp, channels, scales, in_channels=None, norm="BN"): scales = np.array(scales, dtype=int) for i in range(len(channels) - 1): j = -i - 2 - setattr(self, "ida_{}".format(i), IDAUp(channels[j], in_channels[j:], scales[j:] // scales[j], norm=norm)) + setattr( + self, + "ida_{}".format(i), + IDAUp(channels[j], in_channels[j:], scales[j:] // scales[j], norm=norm), + ) scales[j + 1 :] = scales[j] in_channels[j + 1 :] = [channels[j] for _ in channels[j + 1 :]] @@ -369,7 +428,9 @@ def __init__(self, num_layers, out_features, use_dla_up=True, ms_output=False, n super(DLASeg, self).__init__() # depth = 34 levels, channels, Block = DLA_CONFIGS[num_layers] - self.base = DLA(num_layers=num_layers, levels=levels, channels=channels, block=Block, norm=norm) + self.base = DLA( + num_layers=num_layers, levels=levels, channels=channels, block=Block, norm=norm + ) down_ratio = 4 self.first_level = int(np.log2(down_ratio)) self.ms_output = ms_output diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/dlafpn.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/dlafpn.py index 5a2c948521..8cc478ece9 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/dlafpn.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/dlafpn.py @@ -35,11 +35,19 @@ class BasicBlock(nn.Module): def __init__(self, cfg, inplanes, planes, stride=1, dilation=1): super(BasicBlock, self).__init__() self.conv1 = nn.Conv2d( - inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias=False, + dilation=dilation, ) self.bn1 = get_norm(cfg.MODEL.DLA.NORM, planes) self.relu = nn.ReLU(inplace=True) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=dilation, bias=False, dilation=dilation) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, stride=1, padding=dilation, bias=False, dilation=dilation + ) self.bn2 = get_norm(cfg.MODEL.DLA.NORM, planes) self.stride = stride @@ -70,7 +78,13 @@ def __init__(self, cfg, inplanes, planes, stride=1, dilation=1): self.conv1 = nn.Conv2d(inplanes, bottle_planes, kernel_size=1, bias=False) self.bn1 = get_norm(cfg.MODEL.DLA.NORM, bottle_planes) self.conv2 = nn.Conv2d( - bottle_planes, bottle_planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation + bottle_planes, + bottle_planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias=False, + dilation=dilation, ) self.bn2 = get_norm(cfg.MODEL.DLA.NORM, bottle_planes) self.conv3 = nn.Conv2d(bottle_planes, planes, kernel_size=1, bias=False) @@ -103,7 +117,12 @@ class Root(nn.Module): def __init__(self, cfg, in_channels, out_channels, kernel_size, residual): super(Root, self).__init__() self.conv = nn.Conv2d( - in_channels, out_channels, kernel_size, stride=1, bias=False, padding=(kernel_size - 1) // 2 + in_channels, + out_channels, + kernel_size, + stride=1, + bias=False, + padding=(kernel_size - 1) // 2, ) self.bn = get_norm(cfg.MODEL.DLA.NORM, out_channels) self.relu = nn.ReLU(inplace=True) @@ -218,16 +237,44 @@ def __init__(self, cfg, levels, channels, block=BasicBlock, residual_root=False) self.level0 = self._make_conv_level(channels[0], channels[0], levels[0]) self.level1 = self._make_conv_level(channels[0], channels[1], levels[1], stride=2) self.level2 = Tree( - cfg, levels[2], block, channels[1], channels[2], 2, level_root=False, root_residual=residual_root + cfg, + levels[2], + block, + channels[1], + channels[2], + 2, + level_root=False, + root_residual=residual_root, ) self.level3 = Tree( - cfg, levels[3], block, channels[2], channels[3], 2, level_root=True, root_residual=residual_root + cfg, + levels[3], + block, + channels[2], + channels[3], + 2, + level_root=True, + root_residual=residual_root, ) self.level4 = Tree( - cfg, levels[4], block, channels[3], channels[4], 2, level_root=True, root_residual=residual_root + cfg, + levels[4], + block, + channels[3], + channels[4], + 2, + level_root=True, + root_residual=residual_root, ) self.level5 = Tree( - cfg, levels[5], block, channels[4], channels[5], 2, level_root=True, root_residual=residual_root + cfg, + levels[5], + block, + channels[4], + channels[5], + 2, + level_root=True, + root_residual=residual_root, ) for m in self.modules(): @@ -291,7 +338,9 @@ class Conv(nn.Module): def __init__(self, chi, cho, norm): super(Conv, self).__init__() self.conv = nn.Sequential( - nn.Conv2d(chi, cho, kernel_size=1, stride=1, bias=False), get_norm(norm, cho), nn.ReLU(inplace=True) + nn.Conv2d(chi, cho, kernel_size=1, stride=1, bias=False), + get_norm(norm, cho), + nn.ReLU(inplace=True), ) def forward(self, x): @@ -303,7 +352,9 @@ def __init__(self, chi, cho, norm): super(DeformConv, self).__init__() self.actf = nn.Sequential(get_norm(norm, cho), nn.ReLU(inplace=True)) self.offset = Conv2d(chi, 27, kernel_size=3, stride=1, padding=1, dilation=1) - self.conv = ModulatedDeformConv(chi, cho, kernel_size=3, stride=1, padding=1, dilation=1, deformable_groups=1) + self.conv = ModulatedDeformConv( + chi, cho, kernel_size=3, stride=1, padding=1, dilation=1, deformable_groups=1 + ) nn.init.constant_(self.offset.weight, 0) nn.init.constant_(self.offset.bias, 0) @@ -326,7 +377,9 @@ def __init__(self, o, channels, up_f, norm="FrozenBN", node_type=Conv): proj = node_type(c, o, norm) node = node_type(o, o, norm) - up = nn.ConvTranspose2d(o, o, f * 2, stride=f, padding=f // 2, output_padding=0, groups=o, bias=False) + up = nn.ConvTranspose2d( + o, o, f * 2, stride=f, padding=f // 2, output_padding=0, groups=o, bias=False + ) fill_up_weights(up) setattr(self, "proj_" + str(i), proj) @@ -360,7 +413,9 @@ def __init__(self, bottom_up, in_features, norm, dlaup_node="conv"): self.in_features = in_features out_features = ["dlaup{}".format(l) for l in in_levels] self._out_features = out_features - self._out_feature_channels = {"dlaup{}".format(l): in_channels[i] for i, l in enumerate(in_levels)} + self._out_feature_channels = { + "dlaup{}".format(l): in_channels[i] for i, l in enumerate(in_levels) + } self._out_feature_strides = {"dlaup{}".format(l): 2**l for l in in_levels} print("self._out_features", self._out_features) @@ -379,7 +434,13 @@ def __init__(self, bottom_up, in_features, norm, dlaup_node="conv"): setattr( self, "ida_{}".format(i), - IDAUp(channels[j], in_channels[j:], scales[j:] // scales[j], norm=norm, node_type=node_type), + IDAUp( + channels[j], + in_channels[j:], + scales[j:] // scales[j], + norm=norm, + node_type=node_type, + ), ) scales[j + 1 :] = scales[j] in_channels[j + 1 :] = [channels[j] for _ in channels[j + 1 :]] diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/res2net.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/res2net.py index ffdde0c2d4..b35f9b2413 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/res2net.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/backbone/res2net.py @@ -142,7 +142,9 @@ def __init__( if in_channels != out_channels: self.shortcut = nn.Sequential( - nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False), + nn.AvgPool2d( + kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False + ), Conv2d( in_channels, out_channels, @@ -299,7 +301,9 @@ def __init__( # norm=get_norm(norm, out_channels), # ) self.shortcut = nn.Sequential( - nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False), + nn.AvgPool2d( + kernel_size=stride, stride=stride, ceil_mode=True, count_include_pad=False + ), Conv2d( in_channels, out_channels, @@ -602,7 +606,9 @@ def __init__(self, stem, stages, num_classes=None, out_features=None): self.add_module(name, stage) self.stages_and_names.append((stage, name)) - self._out_feature_strides[name] = current_stride = int(current_stride * np.prod([k.stride for k in blocks])) + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks]) + ) self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels if num_classes is not None: @@ -642,7 +648,9 @@ def forward(self, x): def output_shape(self): return { - name: ShapeSpec(channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]) + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) for name in self._out_features } @@ -709,7 +717,9 @@ def build_res2net_backbone(cfg, input_shape): if depth in [18, 34]: assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34" - assert not any(deform_on_per_stage), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34" + assert not any(deform_on_per_stage), ( + "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34" + ) assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34" assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34" diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/debug.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/debug.py index 3048633535..247653c23a 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/debug.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/debug.py @@ -9,7 +9,9 @@ def _get_color_image(heatmap): heatmap = heatmap.reshape(heatmap.shape[0], heatmap.shape[1], heatmap.shape[2], 1) if heatmap.shape[0] == 1: - color_map = (heatmap * np.ones((1, 1, 1, 3), np.uint8) * 255).max(axis=0).astype(np.uint8) # H, W, 3 + color_map = ( + (heatmap * np.ones((1, 1, 1, 3), np.uint8) * 255).max(axis=0).astype(np.uint8) + ) # H, W, 3 else: color_map = (heatmap * COLORS[: heatmap.shape[0]]).max(axis=0).astype(np.uint8) # H, W, 3 @@ -72,7 +74,15 @@ def _ind2il(ind, shapes_per_level, N): def debug_train( - images, gt_instances, flattened_hms, reg_targets, labels, pos_inds, shapes_per_level, locations, strides + images, + gt_instances, + flattened_hms, + reg_targets, + labels, + pos_inds, + shapes_per_level, + locations, + strides, ): """ images: N x 3 x H x W @@ -102,7 +112,12 @@ def debug_train( for j in range(len(bboxes)): bbox = bboxes[j] cv2.rectangle( - blend, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 3, cv2.LINE_AA + blend, + (int(bbox[0]), int(bbox[1])), + (int(bbox[2]), int(bbox[3])), + (0, 0, 255), + 3, + cv2.LINE_AA, ) for j in range(len(pos_inds)): @@ -110,7 +125,9 @@ def debug_train( if image_id != i: continue loc = locations[pos_inds[j]] - cv2.drawMarker(blend, (int(loc[0]), int(loc[1])), (0, 255, 255), markerSize=(l + 1) * 16) + cv2.drawMarker( + blend, (int(loc[0]), int(loc[1])), (0, 255, 255), markerSize=(l + 1) * 16 + ) for j in range(len(reg_inds)): image_id, l = _ind2il(reg_inds[j], shapes_per_level, N) @@ -121,7 +138,12 @@ def debug_train( loc = locations[reg_inds[j]] bbox = [(loc[0] - ltrb[0]), (loc[1] - ltrb[1]), (loc[0] + ltrb[2]), (loc[1] + ltrb[3])] cv2.rectangle( - blend, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255, 0, 0), 1, cv2.LINE_AA + blend, + (int(bbox[0]), int(bbox[1])), + (int(bbox[2]), int(bbox[3])), + (255, 0, 0), + 1, + cv2.LINE_AA, ) cv2.circle(blend, (int(loc[0]), int(loc[1])), 2, (255, 0, 0), -1) @@ -130,7 +152,14 @@ def debug_train( def debug_test( - images, logits_pred, reg_pred, agn_hm_pred=[], preds=[], vis_thresh=0.3, debug_show_name=False, mult_agn=False + images, + logits_pred, + reg_pred, + agn_hm_pred=[], + preds=[], + vis_thresh=0.3, + debug_show_name=False, + mult_agn=False, ): """ images: N x 3 x H x W @@ -168,7 +197,11 @@ def debug_test( cat2name = [x["name"] for x in LVIS_CATEGORIES] for j in range(len(preds[i].scores) if preds is not None else 0): if preds[i].scores[j] > vis_thresh: - bbox = preds[i].proposal_boxes[j] if preds[i].has("proposal_boxes") else preds[i].pred_boxes[j] + bbox = ( + preds[i].proposal_boxes[j] + if preds[i].has("proposal_boxes") + else preds[i].pred_boxes[j] + ) bbox = bbox.tensor[0].detach().cpu().numpy().astype(np.int32) cat = int(preds[i].pred_classes[j]) if preds[i].has("pred_classes") else 0 cl = COLORS[cat, 0, 0] @@ -181,7 +214,9 @@ def debug_test( cv2.LINE_AA, ) if debug_show_name: - txt = "{}{:.1f}".format(cat2name[cat] if cat > 0 else "", preds[i].scores[j]) + txt = "{}{:.1f}".format( + cat2name[cat] if cat > 0 else "", preds[i].scores[j] + ) font = cv2.FONT_HERSHEY_SIMPLEX cat_size = cv2.getTextSize(txt, font, 0.5, 2)[0] cv2.rectangle( @@ -216,7 +251,9 @@ def debug_test( cnt = 0 -def debug_second_stage(images, instances, proposals=None, vis_thresh=0.3, save_debug=False, debug_show_name=False): +def debug_second_stage( + images, instances, proposals=None, vis_thresh=0.3, save_debug=False, debug_show_name=False +): images = _imagelist_to_tensor(images) if debug_show_name: from detectron2.data.datasets.lvis_v1_categories import LVIS_CATEGORIES @@ -237,7 +274,14 @@ def debug_second_stage(images, instances, proposals=None, vis_thresh=0.3, save_d bbox = bboxes[j] cl = COLORS[cats[j], 0, 0] cl = (int(cl[0]), int(cl[1]), int(cl[2])) - cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), cl, 2, cv2.LINE_AA) + cv2.rectangle( + image, + (int(bbox[0]), int(bbox[1])), + (int(bbox[2]), int(bbox[3])), + cl, + 2, + cv2.LINE_AA, + ) if debug_show_name: cat = cats[j] txt = "{}{:.1f}".format(cat2name[cat] if cat > 0 else "", scores[j]) @@ -261,7 +305,9 @@ def debug_second_stage(images, instances, proposals=None, vis_thresh=0.3, save_d lineType=cv2.LINE_AA, ) if proposals is not None: - proposal_image = images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy() + proposal_image = ( + images[i].detach().cpu().numpy().transpose(1, 2, 0).astype(np.uint8).copy() + ) bboxes = proposals[i].proposal_boxes.tensor.cpu().numpy() if proposals[i].has("scores"): scores = proposals[i].scores.cpu().numpy() @@ -272,7 +318,12 @@ def debug_second_stage(images, instances, proposals=None, vis_thresh=0.3, save_d bbox = bboxes[j] cl = (209, 159, 83) cv2.rectangle( - proposal_image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), cl, 2, cv2.LINE_AA + proposal_image, + (int(bbox[0]), int(bbox[1])), + (int(bbox[2]), int(bbox[3])), + cl, + 2, + cv2.LINE_AA, ) cv2.imshow("image", image) diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/dense_heads/centernet.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/dense_heads/centernet.py index f5a3f8b426..53b28eb18a 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/dense_heads/centernet.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/dense_heads/centernet.py @@ -166,7 +166,9 @@ def from_config(cls, cfg, input_shape): "pixel_mean": cfg.MODEL.PIXEL_MEAN, "pixel_std": cfg.MODEL.PIXEL_STD, "device": cfg.MODEL.DEVICE, - "centernet_head": CenterNetHead(cfg, [input_shape[f] for f in cfg.MODEL.CENTERNET.IN_FEATURES]), + "centernet_head": CenterNetHead( + cfg, [input_shape[f] for f in cfg.MODEL.CENTERNET.IN_FEATURES] + ), } return ret @@ -174,12 +176,18 @@ def forward(self, images, features_dict, gt_instances): features = [features_dict[f] for f in self.in_features] clss_per_level, reg_pred_per_level, agn_hm_pred_per_level = self.centernet_head(features) grids = self.compute_grids(features) - shapes_per_level = grids[0].new_tensor([(x.shape[2], x.shape[3]) for x in reg_pred_per_level]) + shapes_per_level = grids[0].new_tensor( + [(x.shape[2], x.shape[3]) for x in reg_pred_per_level] + ) if not self.training: - return self.inference(images, clss_per_level, reg_pred_per_level, agn_hm_pred_per_level, grids) + return self.inference( + images, clss_per_level, reg_pred_per_level, agn_hm_pred_per_level, grids + ) else: - pos_inds, labels, reg_targets, flattened_hms = self._get_ground_truth(grids, shapes_per_level, gt_instances) + pos_inds, labels, reg_targets, flattened_hms = self._get_ground_truth( + grids, shapes_per_level, gt_instances + ) # logits_pred: M x F, reg_pred: M x 4, agn_hm_pred: M logits_pred, reg_pred, agn_hm_pred = self._flatten_outputs( clss_per_level, reg_pred_per_level, agn_hm_pred_per_level @@ -191,7 +199,9 @@ def forward(self, images, features_dict, gt_instances): # 2. their regression losses are small ( 0 # M x N is_center3x3 = self.get_center3x3(grids, centers, strides) & is_in_boxes # M x N is_cared_in_the_level = self.assign_reg_fpn(reg_target, reg_size_ranges) # M x N reg_mask = is_center3x3 & is_cared_in_the_level # M x N - dist2 = ((grids.view(M, 1, 2).expand(M, N, 2) - centers_expanded) ** 2).sum(dim=2) # M x N + dist2 = ((grids.view(M, 1, 2).expand(M, N, 2) - centers_expanded) ** 2).sum( + dim=2 + ) # M x N dist2[is_peak] = 0 radius2 = self.delta**2 * 2 * area # N radius2 = torch.clamp(radius2, min=self.min_radius**2) weighted_dist2 = dist2 / radius2.view(1, N).expand(M, N) # M x N - reg_target = self._get_reg_targets(reg_target, weighted_dist2.clone(), reg_mask, area) # M x 4 + reg_target = self._get_reg_targets( + reg_target, weighted_dist2.clone(), reg_mask, area + ) # M x 4 if self.only_proposal: flattened_hm = self._create_agn_heatmaps_from_dist(weighted_dist2.clone()) # M x 1 @@ -464,7 +501,9 @@ def _get_label_inds(self, gt_instances, shapes_per_level): ) # n x L is_cared_in_the_level = self.assign_fpn_level(bboxes) pos_ind = pos_ind[is_cared_in_the_level].view(-1) - label = targets_per_im.gt_classes.view(n, 1).expand(n, L)[is_cared_in_the_level].view(-1) + label = ( + targets_per_im.gt_classes.view(n, 1).expand(n, L)[is_cared_in_the_level].view(-1) + ) pos_inds.append(pos_ind) # n' labels.append(label) # n' @@ -480,12 +519,16 @@ def assign_fpn_level(self, boxes): Return: is_cared_in_the_level: n x L """ - size_ranges = boxes.new_tensor(self.sizes_of_interest).view(len(self.sizes_of_interest), 2) # L x 2 + size_ranges = boxes.new_tensor(self.sizes_of_interest).view( + len(self.sizes_of_interest), 2 + ) # L x 2 crit = ((boxes[:, 2:] - boxes[:, :2]) ** 2).sum(dim=1) ** 0.5 / 2 # n n, L = crit.shape[0], size_ranges.shape[0] crit = crit.view(n, 1).expand(n, L) size_ranges_expand = size_ranges.view(1, L, 2).expand(n, L, 2) - is_cared_in_the_level = (crit >= size_ranges_expand[:, :, 0]) & (crit <= size_ranges_expand[:, :, 1]) + is_cared_in_the_level = (crit >= size_ranges_expand[:, :, 0]) & ( + crit <= size_ranges_expand[:, :, 1] + ) return is_cared_in_the_level def assign_reg_fpn(self, reg_targets_per_im, size_ranges): @@ -495,7 +538,9 @@ def assign_reg_fpn(self, reg_targets_per_im, size_ranges): reg_targets_per_im: M x N x 4 size_ranges: M x 2 """ - crit = ((reg_targets_per_im[:, :, :2] + reg_targets_per_im[:, :, 2:]) ** 2).sum(dim=2) ** 0.5 / 2 # M x N + crit = ((reg_targets_per_im[:, :, :2] + reg_targets_per_im[:, :, 2:]) ** 2).sum( + dim=2 + ) ** 0.5 / 2 # M x N is_cared_in_the_level = (crit >= size_ranges[:, [0]]) & (crit <= size_ranges[:, [1]]) return is_cared_in_the_level @@ -544,10 +589,16 @@ def _create_agn_heatmaps_from_dist(self, dist): def _flatten_outputs(self, clss, reg_pred, agn_hm_pred): # Reshape: (N, F, Hl, Wl) -> (N, Hl, Wl, F) -> (sum_l N*Hl*Wl, F) clss = ( - cat([x.permute(0, 2, 3, 1).reshape(-1, x.shape[1]) for x in clss], dim=0) if clss[0] is not None else None + cat([x.permute(0, 2, 3, 1).reshape(-1, x.shape[1]) for x in clss], dim=0) + if clss[0] is not None + else None ) reg_pred = cat([x.permute(0, 2, 3, 1).reshape(-1, 4) for x in reg_pred], dim=0) - agn_hm_pred = cat([x.permute(0, 2, 3, 1).reshape(-1) for x in agn_hm_pred], dim=0) if self.with_agn_hm else None + agn_hm_pred = ( + cat([x.permute(0, 2, 3, 1).reshape(-1) for x in agn_hm_pred], dim=0) + if self.with_agn_hm + else None + ) return clss, reg_pred, agn_hm_pred def get_center3x3(self, locations, centers, strides): @@ -571,7 +622,9 @@ def get_center3x3(self, locations, centers, strides): @torch.no_grad() def inference(self, images, clss_per_level, reg_pred_per_level, agn_hm_pred_per_level, grids): logits_pred = [x.sigmoid() if x is not None else None for x in clss_per_level] - agn_hm_pred_per_level = [x.sigmoid() if x is not None else None for x in agn_hm_pred_per_level] + agn_hm_pred_per_level = [ + x.sigmoid() if x is not None else None for x in agn_hm_pred_per_level + ] if self.only_proposal: proposals = self.predict_instances( @@ -604,7 +657,9 @@ def inference(self, images, clss_per_level, reg_pred_per_level, agn_hm_pred_per_ return proposals, {} @torch.no_grad() - def predict_instances(self, grids, logits_pred, reg_pred, image_sizes, agn_hm_pred, is_proposal=False): + def predict_instances( + self, grids, logits_pred, reg_pred, image_sizes, agn_hm_pred, is_proposal=False + ): sampled_boxes = [] for l in range(len(grids)): sampled_boxes.append( @@ -624,7 +679,9 @@ def predict_instances(self, grids, logits_pred, reg_pred, image_sizes, agn_hm_pr return boxlists @torch.no_grad() - def predict_single_level(self, grids, heatmap, reg_pred, image_sizes, agn_hm, level, is_proposal=False): + def predict_single_level( + self, grids, heatmap, reg_pred, image_sizes, agn_hm, level, is_proposal=False + ): N, C, H, W = heatmap.shape # put in the same format as grids if self.center_nms: @@ -702,7 +759,9 @@ def nms_and_topK(self, boxlists, nms=True): post_nms_topk = self.post_nms_topk_train if self.training else self.post_nms_topk_test if num_dets > post_nms_topk: cls_scores = result.scores - image_thresh, _ = torch.kthvalue(cls_scores.float().cpu(), num_dets - post_nms_topk + 1) + image_thresh, _ = torch.kthvalue( + cls_scores.float().cpu(), num_dets - post_nms_topk + 1 + ) keep = cls_scores >= image_thresh.item() keep = torch.nonzero(keep).squeeze(1) result = result[keep] @@ -713,7 +772,9 @@ def nms_and_topK(self, boxlists, nms=True): @torch.no_grad() def _add_more_pos(self, reg_pred, gt_instances, shapes_per_level): - labels, level_masks, c33_inds, c33_masks, c33_regs = self._get_c33_inds(gt_instances, shapes_per_level) + labels, level_masks, c33_inds, c33_masks, c33_regs = self._get_c33_inds( + gt_instances, shapes_per_level + ) N, L, K = labels.shape[0], len(self.strides), 9 c33_inds[c33_masks == 0] = 0 reg_pred_c33 = reg_pred[c33_inds].detach() # N x L x K @@ -722,7 +783,9 @@ def _add_more_pos(self, reg_pred, gt_instances, shapes_per_level): if N > 0: with torch.no_grad(): c33_reg_loss = ( - self.iou_loss(reg_pred_c33.view(N * L * K, 4), c33_regs_expand, None, reduction="none") + self.iou_loss( + reg_pred_c33.view(N * L * K, 4), c33_regs_expand, None, reduction="none" + ) .view(N, L, K) .detach() ) # N x L x K diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/dense_heads/centernet_head.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/dense_heads/centernet_head.py index 8374b752f1..3f939233a1 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/dense_heads/centernet_head.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/dense_heads/centernet_head.py @@ -69,7 +69,12 @@ def __init__( conv_func = nn.Conv2d tower.append( conv_func( - in_channels if i == 0 else channel, channel, kernel_size=3, stride=1, padding=1, bias=True + in_channels if i == 0 else channel, + channel, + kernel_size=3, + stride=1, + padding=1, + bias=True, ) ) if norm == "GN" and channel % 32 != 0: @@ -79,7 +84,9 @@ def __init__( tower.append(nn.ReLU()) self.add_module("{}_tower".format(head), nn.Sequential(*tower)) - self.bbox_pred = nn.Conv2d(in_channels, 4, kernel_size=self.out_kernel, stride=1, padding=self.out_kernel // 2) + self.bbox_pred = nn.Conv2d( + in_channels, 4, kernel_size=self.out_kernel, stride=1, padding=self.out_kernel // 2 + ) self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(num_levels)]) @@ -99,7 +106,9 @@ def __init__( bias_value = -math.log((1 - prior_prob) / prior_prob) if self.with_agn_hm: - self.agn_hm = nn.Conv2d(in_channels, 1, kernel_size=self.out_kernel, stride=1, padding=self.out_kernel // 2) + self.agn_hm = nn.Conv2d( + in_channels, 1, kernel_size=self.out_kernel, stride=1, padding=self.out_kernel // 2 + ) torch.nn.init.constant_(self.agn_hm.bias, bias_value) torch.nn.init.normal_(self.agn_hm.weight, std=0.01) diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/layers/deform_conv.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/layers/deform_conv.py index fa0f2f649d..396aa9554a 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/layers/deform_conv.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/layers/deform_conv.py @@ -39,7 +39,10 @@ def __init__( assert len(kernel_size) == 2 assert len(stride) == 2 assert len(dilation) == 2 - padding = (dilation[0] * (kernel_size[0] - 1) // 2, dilation[1] * (kernel_size[1] - 1) // 2) + padding = ( + dilation[0] * (kernel_size[0] - 1) // 2, + dilation[1] * (kernel_size[1] - 1) // 2, + ) offset_base_channels = kernel_size[0] * kernel_size[1] else: padding = dilation * (kernel_size - 1) // 2 @@ -104,7 +107,9 @@ def forward(self, x, return_offset=False): # get output shape output_shape = [ (i + 2 * p - (di * (k - 1) + 1)) // d + 1 - for i, p, di, k, d in zip(x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride) + for i, p, di, k, d in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) ] output_shape = [x.shape[0], self.conv.weight.shape[0]] + output_shape return _NewEmptyTensorOp.apply(x, output_shape) diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/meta_arch/centernet_detector.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/meta_arch/centernet_detector.py index fd7d574b2f..63a1cb13f9 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/meta_arch/centernet_detector.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/meta_arch/centernet_detector.py @@ -42,7 +42,9 @@ def inference(self, batched_inputs, do_postprocess=True): proposals, _ = self.proposal_generator(images, features, None) processed_results = [] - for results_per_image, input_per_image, image_size in zip(proposals, batched_inputs, images.image_sizes): + for results_per_image, input_per_image, image_size in zip( + proposals, batched_inputs, images.image_sizes + ): if do_postprocess: height = input_per_image.get("height", image_size[0]) width = input_per_image.get("width", image_size[1]) diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py index 491a5dc75a..a0c44fec3d 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/roi_heads/custom_fast_rcnn.py @@ -40,7 +40,9 @@ def losses(self, predictions, proposals): enable advanced loss """ scores, proposal_deltas = predictions - gt_classes = cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0) + gt_classes = ( + cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0) + ) num_classes = self.num_classes _log_classification_stats(scores, gt_classes) @@ -60,7 +62,9 @@ def losses(self, predictions, proposals): loss_cls = self.softmax_cross_entropy_loss(scores, gt_classes) return { "loss_cls": loss_cls, - "loss_box_reg": self.box_reg_loss(proposal_boxes, gt_boxes, proposal_deltas, gt_classes), + "loss_box_reg": self.box_reg_loss( + proposal_boxes, gt_boxes, proposal_deltas, gt_classes + ), } def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes): @@ -85,7 +89,9 @@ def sigmoid_cross_entropy_loss(self, pred_class_logits, gt_classes): fed_w = appeared_mask.view(1, C).expand(B, C) weight = weight * fed_w.float() - cls_loss = F.binary_cross_entropy_with_logits(pred_class_logits[:, :-1], target, reduction="none") # B x C + cls_loss = F.binary_cross_entropy_with_logits( + pred_class_logits[:, :-1], target, reduction="none" + ) # B x C loss = torch.sum(cls_loss * weight) / B return loss @@ -104,7 +110,9 @@ def softmax_cross_entropy_loss(self, pred_class_logits, gt_classes): appeared_mask = appeared.new_zeros(C + 1).float() appeared_mask[appeared] = 1.0 # C + 1 appeared_mask[C] = 1.0 - loss = F.cross_entropy(pred_class_logits, gt_classes, weight=appeared_mask, reduction="mean") + loss = F.cross_entropy( + pred_class_logits, gt_classes, weight=appeared_mask, reduction="mean" + ) else: loss = F.cross_entropy(pred_class_logits, gt_classes, reduction="mean") return loss diff --git a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/roi_heads/custom_roi_heads.py b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/roi_heads/custom_roi_heads.py index 1a85c20231..aefd1d164e 100644 --- a/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/roi_heads/custom_roi_heads.py +++ b/dimos/models/Detic/third_party/CenterNet2/centernet/modeling/roi_heads/custom_roi_heads.py @@ -22,8 +22,12 @@ def _init_box_head(self, cfg, input_shape): self.debug_show_name = cfg.DEBUG_SHOW_NAME self.save_debug = cfg.SAVE_DEBUG self.vis_thresh = cfg.VIS_THRESH - self.pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) - self.pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + self.pixel_mean = ( + torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + ) + self.pixel_std = ( + torch.Tensor(cfg.MODEL.PIXEL_STD).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + ) return ret def forward(self, images, features, proposals, targets=None): @@ -70,7 +74,9 @@ def _init_box_head(self, cfg, input_shape): for box_head, bbox_reg_weights in zip(ret["box_heads"], cascade_bbox_reg_weights): box_predictors.append( CustomFastRCNNOutputLayers( - cfg, box_head.output_shape, box2box_transform=Box2BoxTransform(weights=bbox_reg_weights) + cfg, + box_head.output_shape, + box2box_transform=Box2BoxTransform(weights=bbox_reg_weights), ) ) ret["box_predictors"] = box_predictors @@ -79,8 +85,12 @@ def _init_box_head(self, cfg, input_shape): self.debug_show_name = cfg.DEBUG_SHOW_NAME self.save_debug = cfg.SAVE_DEBUG self.vis_thresh = cfg.VIS_THRESH - self.pixel_mean = torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) - self.pixel_std = torch.Tensor(cfg.MODEL.PIXEL_STD).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + self.pixel_mean = ( + torch.Tensor(cfg.MODEL.PIXEL_MEAN).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + ) + self.pixel_std = ( + torch.Tensor(cfg.MODEL.PIXEL_STD).to(torch.device(cfg.MODEL.DEVICE)).view(3, 1, 1) + ) return ret def _forward_box(self, features, proposals, targets=None): diff --git a/dimos/models/Detic/third_party/CenterNet2/predictor.py b/dimos/models/Detic/third_party/CenterNet2/predictor.py index a51f5b2cc7..990040fc03 100644 --- a/dimos/models/Detic/third_party/CenterNet2/predictor.py +++ b/dimos/models/Detic/third_party/CenterNet2/predictor.py @@ -21,7 +21,9 @@ def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): parallel (bool): whether to run the model in different processes from visualization. Useful since the visualization logic can be slow. """ - self.metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0] if len(cfg.DATASETS.TRAIN) else "__unused") + self.metadata = MetadataCatalog.get( + cfg.DATASETS.TRAIN[0] if len(cfg.DATASETS.TRAIN) else "__unused" + ) self.cpu_device = torch.device("cpu") self.instance_mode = instance_mode @@ -52,10 +54,14 @@ def run_on_image(self, image, visualizer=None): visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) if "panoptic_seg" in predictions: panoptic_seg, segments_info = predictions["panoptic_seg"] - vis_output = visualizer.draw_panoptic_seg_predictions(panoptic_seg.to(self.cpu_device), segments_info) + vis_output = visualizer.draw_panoptic_seg_predictions( + panoptic_seg.to(self.cpu_device), segments_info + ) else: if "sem_seg" in predictions: - vis_output = visualizer.draw_sem_seg(predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)) + vis_output = visualizer.draw_sem_seg( + predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) + ) if "instances" in predictions: instances = predictions["instances"].to(self.cpu_device) if use_video_vis: @@ -186,7 +192,9 @@ def __init__(self, cfg, num_gpus: int = 1): cfg = cfg.clone() cfg.defrost() cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" - self.procs.append(AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)) + self.procs.append( + AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) + ) self.put_idx = 0 self.get_idx = 0 diff --git a/dimos/models/Detic/third_party/CenterNet2/tools/analyze_model.py b/dimos/models/Detic/third_party/CenterNet2/tools/analyze_model.py index 150a03663f..75a4a794df 100755 --- a/dimos/models/Detic/third_party/CenterNet2/tools/analyze_model.py +++ b/dimos/models/Detic/third_party/CenterNet2/tools/analyze_model.py @@ -60,9 +60,12 @@ def do_flop(cfg): logger.info("Flops table computed from only one input sample:\n" + flop_count_table(flops)) logger.info( - "Average GFlops for each type of operators:\n" + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) + "Average GFlops for each type of operators:\n" + + str([(k, v / (idx + 1) / 1e9) for k, v in counts.items()]) + ) + logger.info( + "Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9) ) - logger.info("Total GFlops: {:.1f}±{:.1f}".format(np.mean(total_flops) / 1e9, np.std(total_flops) / 1e9)) def do_activation(cfg): @@ -83,8 +86,15 @@ def do_activation(cfg): count = activation_count_operators(model, data) counts += count total_activations.append(sum(count.values())) - logger.info("(Million) Activations for Each Type of Operators:\n" + str([(k, v / idx) for k, v in counts.items()])) - logger.info("Total (Million) Activations: {}±{}".format(np.mean(total_activations), np.std(total_activations))) + logger.info( + "(Million) Activations for Each Type of Operators:\n" + + str([(k, v / idx) for k, v in counts.items()]) + ) + logger.info( + "Total (Million) Activations: {}±{}".format( + np.mean(total_activations), np.std(total_activations) + ) + ) def do_parameter(cfg): diff --git a/dimos/models/Detic/third_party/CenterNet2/tools/benchmark.py b/dimos/models/Detic/third_party/CenterNet2/tools/benchmark.py index 06b8f3b91a..c2d673fab1 100755 --- a/dimos/models/Detic/third_party/CenterNet2/tools/benchmark.py +++ b/dimos/models/Detic/third_party/CenterNet2/tools/benchmark.py @@ -61,7 +61,9 @@ def create_data_benchmark(cfg, args): def RAM_msg(): vram = psutil.virtual_memory() - return "RAM Usage: {:.2f}/{:.2f} GB".format((vram.total - vram.available) / 1024**3, vram.total / 1024**3) + return "RAM Usage: {:.2f}/{:.2f} GB".format( + (vram.total - vram.available) / 1024**3, vram.total / 1024**3 + ) def benchmark_data(args): @@ -97,7 +99,9 @@ def benchmark_train(args): model = build_model(cfg) logger.info("Model:\n{}".format(model)) if comm.get_world_size() > 1: - model = DistributedDataParallel(model, device_ids=[comm.get_local_rank()], broadcast_buffers=False) + model = DistributedDataParallel( + model, device_ids=[comm.get_local_rank()], broadcast_buffers=False + ) optimizer = build_optimizer(cfg, model) checkpointer = DetectionCheckpointer(model, optimizer=optimizer) checkpointer.load(cfg.MODEL.WEIGHTS) @@ -118,7 +122,9 @@ def f(): [ hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(max_iter)]), - hooks.TorchProfiler(lambda trainer: trainer.iter == max_iter - 1, cfg.OUTPUT_DIR, save_tensorboard=True), + hooks.TorchProfiler( + lambda trainer: trainer.iter == max_iter - 1, cfg.OUTPUT_DIR, save_tensorboard=True + ), ] ) trainer.train(1, max_iter) diff --git a/dimos/models/Detic/third_party/CenterNet2/tools/deploy/export_model.py b/dimos/models/Detic/third_party/CenterNet2/tools/deploy/export_model.py index 0660399b35..067309f241 100755 --- a/dimos/models/Detic/third_party/CenterNet2/tools/deploy/export_model.py +++ b/dimos/models/Detic/third_party/CenterNet2/tools/deploy/export_model.py @@ -156,7 +156,9 @@ def get_sample_inputs(args): # get a sample data original_image = detection_utils.read_image(args.sample_image, format=cfg.INPUT.FORMAT) # Do same preprocessing as DefaultPredictor - aug = T.ResizeShortestEdge([cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST) + aug = T.ResizeShortestEdge( + [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST + ) height, width = original_image.shape[:2] image = aug.get_transform(original_image).apply_image(original_image) image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) diff --git a/dimos/models/Detic/third_party/CenterNet2/tools/lazyconfig_train_net.py b/dimos/models/Detic/third_party/CenterNet2/tools/lazyconfig_train_net.py index 32a4952a60..506e8baff6 100755 --- a/dimos/models/Detic/third_party/CenterNet2/tools/lazyconfig_train_net.py +++ b/dimos/models/Detic/third_party/CenterNet2/tools/lazyconfig_train_net.py @@ -35,7 +35,9 @@ def do_test(cfg, model): if "evaluator" in cfg.dataloader: - ret = inference_on_dataset(model, instantiate(cfg.dataloader.test), instantiate(cfg.dataloader.evaluator)) + ret = inference_on_dataset( + model, instantiate(cfg.dataloader.test), instantiate(cfg.dataloader.evaluator) + ) print_csv_format(ret) return ret @@ -80,7 +82,9 @@ def do_train(args, cfg): [ hooks.IterationTimer(), hooks.LRScheduler(scheduler=instantiate(cfg.lr_multiplier)), - hooks.PeriodicCheckpointer(checkpointer, **cfg.train.checkpointer) if comm.is_main_process() else None, + hooks.PeriodicCheckpointer(checkpointer, **cfg.train.checkpointer) + if comm.is_main_process() + else None, hooks.EvalHook(cfg.train.eval_period, lambda: do_test(cfg, model)), hooks.PeriodicWriter( default_writers(cfg.train.output_dir, cfg.train.max_iter), diff --git a/dimos/models/Detic/third_party/CenterNet2/tools/lightning_train_net.py b/dimos/models/Detic/third_party/CenterNet2/tools/lightning_train_net.py index 46807c1b41..037957bac6 100644 --- a/dimos/models/Detic/third_party/CenterNet2/tools/lightning_train_net.py +++ b/dimos/models/Detic/third_party/CenterNet2/tools/lightning_train_net.py @@ -83,13 +83,19 @@ def training_step(self, batch, batch_idx): self.storage.__enter__() self.iteration_timer.trainer = weakref.proxy(self) self.iteration_timer.before_step() - self.writers = default_writers(self.cfg.OUTPUT_DIR, self.max_iter) if comm.is_main_process() else {} + self.writers = ( + default_writers(self.cfg.OUTPUT_DIR, self.max_iter) + if comm.is_main_process() + else {} + ) loss_dict = self.model(batch) SimpleTrainer.write_metrics(loss_dict, data_time) opt = self.optimizers() - self.storage.put_scalar("lr", opt.param_groups[self._best_param_group_id]["lr"], smoothing_hint=False) + self.storage.put_scalar( + "lr", opt.param_groups[self._best_param_group_id]["lr"], smoothing_hint=False + ) self.iteration_timer.after_step() self.storage.step() # A little odd to put before step here, but it's the best way to get a proper timing @@ -143,7 +149,9 @@ def validation_epoch_end(self, _outputs): v = float(v) except Exception as e: raise ValueError( - "[EvalHook] eval_function should return a nested dict of float. Got '{}: {}' instead.".format(k, v) + "[EvalHook] eval_function should return a nested dict of float. Got '{}: {}' instead.".format( + k, v + ) ) from e self.storage.put_scalars(**flattened_results, smoothing_hint=False) diff --git a/dimos/models/Detic/third_party/CenterNet2/tools/plain_train_net.py b/dimos/models/Detic/third_party/CenterNet2/tools/plain_train_net.py index 999fd48c88..2ff9080f7f 100755 --- a/dimos/models/Detic/third_party/CenterNet2/tools/plain_train_net.py +++ b/dimos/models/Detic/third_party/CenterNet2/tools/plain_train_net.py @@ -103,7 +103,9 @@ def do_test(cfg, model): results = OrderedDict() for dataset_name in cfg.DATASETS.TEST: data_loader = build_detection_test_loader(cfg, dataset_name) - evaluator = get_evaluator(cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name)) + evaluator = get_evaluator( + cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) + ) results_i = inference_on_dataset(model, data_loader, evaluator) results[dataset_name] = results_i if comm.is_main_process(): @@ -119,11 +121,17 @@ def do_train(cfg, model, resume=False): optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) - checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) - start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + checkpointer = DetectionCheckpointer( + model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler + ) + start_iter = ( + checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + ) max_iter = cfg.SOLVER.MAX_ITER - periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) + periodic_checkpointer = PeriodicCheckpointer( + checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter + ) writers = default_writers(cfg.OUTPUT_DIR, max_iter) if comm.is_main_process() else [] @@ -150,12 +158,18 @@ def do_train(cfg, model, resume=False): storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) scheduler.step() - if cfg.TEST.EVAL_PERIOD > 0 and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter - 1: + if ( + cfg.TEST.EVAL_PERIOD > 0 + and (iteration + 1) % cfg.TEST.EVAL_PERIOD == 0 + and iteration != max_iter - 1 + ): do_test(cfg, model) # Compared to "train_net.py", the test results are not dumped to EventStorage comm.synchronize() - if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0 or iteration == max_iter - 1): + if iteration - start_iter > 5 and ( + (iteration + 1) % 20 == 0 or iteration == max_iter - 1 + ): for writer in writers: writer.write() periodic_checkpointer.step(iteration) @@ -169,7 +183,9 @@ def setup(args): cfg.merge_from_file(args.config_file) cfg.merge_from_list(args.opts) cfg.freeze() - default_setup(cfg, args) # if you don't like any of the default setup, write your own setup code + default_setup( + cfg, args + ) # if you don't like any of the default setup, write your own setup code return cfg @@ -179,12 +195,16 @@ def main(args): model = build_model(cfg) logger.info("Model:\n{}".format(model)) if args.eval_only: - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) return do_test(cfg, model) distributed = comm.get_world_size() > 1 if distributed: - model = DistributedDataParallel(model, device_ids=[comm.get_local_rank()], broadcast_buffers=False) + model = DistributedDataParallel( + model, device_ids=[comm.get_local_rank()], broadcast_buffers=False + ) do_train(cfg, model, resume=args.resume) return do_test(cfg, model) diff --git a/dimos/models/Detic/third_party/CenterNet2/tools/train_net.py b/dimos/models/Detic/third_party/CenterNet2/tools/train_net.py index 5d08a62f68..10334aa1d8 100755 --- a/dimos/models/Detic/third_party/CenterNet2/tools/train_net.py +++ b/dimos/models/Detic/third_party/CenterNet2/tools/train_net.py @@ -106,7 +106,9 @@ def test_with_TTA(cls, cfg, model): logger.info("Running inference with test-time augmentation ...") model = GeneralizedRCNNWithTTA(cfg, model) evaluators = [ - cls.build_evaluator(cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")) + cls.build_evaluator( + cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") + ) for name in cfg.DATASETS.TEST ] res = cls.test(cfg, model, evaluators) @@ -131,7 +133,9 @@ def main(args): if args.eval_only: model = Trainer.build_model(cfg) - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) res = Trainer.test(cfg, model) if cfg.TEST.AUG.ENABLED: res.update(Trainer.test_with_TTA(cfg, model)) @@ -147,7 +151,9 @@ def main(args): trainer = Trainer(cfg) trainer.resume_or_load(resume=args.resume) if cfg.TEST.AUG.ENABLED: - trainer.register_hooks([hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]) + trainer.register_hooks( + [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] + ) return trainer.train() diff --git a/dimos/models/Detic/third_party/CenterNet2/train_net.py b/dimos/models/Detic/third_party/CenterNet2/train_net.py index c410fda67b..1ca9f4cdd7 100644 --- a/dimos/models/Detic/third_party/CenterNet2/train_net.py +++ b/dimos/models/Detic/third_party/CenterNet2/train_net.py @@ -73,7 +73,9 @@ def do_train(cfg, model, resume=False): optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) - checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) + checkpointer = DetectionCheckpointer( + model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler + ) start_iter = ( checkpointer.resume_or_load( @@ -87,7 +89,9 @@ def do_train(cfg, model, resume=False): start_iter = 0 max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.TRAIN_ITER < 0 else cfg.SOLVER.TRAIN_ITER - periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) + periodic_checkpointer = PeriodicCheckpointer( + checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter + ) writers = ( [ @@ -143,7 +147,11 @@ def do_train(cfg, model, resume=False): data_timer.reset() scheduler.step() - if cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter: + if ( + cfg.TEST.EVAL_PERIOD > 0 + and iteration % cfg.TEST.EVAL_PERIOD == 0 + and iteration != max_iter + ): do_test(cfg, model) comm.synchronize() @@ -153,7 +161,9 @@ def do_train(cfg, model, resume=False): periodic_checkpointer.step(iteration) total_time = time.perf_counter() - start_time - logger.info("Total training time: {}".format(str(datetime.timedelta(seconds=int(total_time))))) + logger.info( + "Total training time: {}".format(str(datetime.timedelta(seconds=int(total_time)))) + ) def setup(args): @@ -179,7 +189,9 @@ def main(args): model = build_model(cfg) logger.info("Model:\n{}".format(model)) if args.eval_only: - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) if cfg.TEST.AUG.ENABLED: logger.info("Running inference with test-time augmentation ...") model = GeneralizedRCNNWithTTA(cfg, model, batch_size=1) @@ -189,7 +201,10 @@ def main(args): distributed = comm.get_world_size() > 1 if distributed: model = DistributedDataParallel( - model, device_ids=[comm.get_local_rank()], broadcast_buffers=False, find_unused_parameters=True + model, + device_ids=[comm.get_local_rank()], + broadcast_buffers=False, + find_unused_parameters=True, ) do_train(cfg, model, resume=args.resume) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/benchmark.py b/dimos/models/Detic/third_party/Deformable-DETR/benchmark.py index 5fea6e539f..9830274aa6 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/benchmark.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/benchmark.py @@ -23,7 +23,9 @@ def get_benckmark_arg_parser(): parser = argparse.ArgumentParser("Benchmark inference speed of Deformable DETR.") parser.add_argument("--num_iters", type=int, default=300, help="total iters to benchmark speed") - parser.add_argument("--warm_iters", type=int, default=5, help="ignore first several iters that are very slow") + parser.add_argument( + "--warm_iters", type=int, default=5, help="ignore first several iters that are very slow" + ) parser.add_argument("--batch_size", type=int, default=1, help="batch size in inference") parser.add_argument("--resume", type=str, help="load the pre-trained checkpoint") return parser @@ -57,7 +59,9 @@ def benchmark(): if args.resume is not None: ckpt = torch.load(args.resume, map_location=lambda storage, loc: storage) model.load_state_dict(ckpt["model"]) - inputs = nested_tensor_from_tensor_list([dataset.__getitem__(0)[0].cuda() for _ in range(args.batch_size)]) + inputs = nested_tensor_from_tensor_list( + [dataset.__getitem__(0)[0].cuda() for _ in range(args.batch_size)] + ) t = measure_average_inference_time(model, inputs, args.num_iters, args.warm_iters) return 1.0 / t * args.batch_size diff --git a/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco.py b/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco.py index ced6a2cd53..00e3d431ba 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco.py @@ -25,9 +25,22 @@ class CocoDetection(TvCocoDetection): - def __init__(self, img_folder, ann_file, transforms, return_masks, cache_mode=False, local_rank=0, local_size=1): + def __init__( + self, + img_folder, + ann_file, + transforms, + return_masks, + cache_mode=False, + local_rank=0, + local_size=1, + ): super(CocoDetection, self).__init__( - img_folder, ann_file, cache_mode=cache_mode, local_rank=local_rank, local_size=local_size + img_folder, + ann_file, + cache_mode=cache_mode, + local_rank=local_rank, + local_size=local_size, ) self._transforms = transforms self.prepare = ConvertCocoPolysToMask(return_masks) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco_eval.py b/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco_eval.py index 94a0922a82..b0b9a76d39 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco_eval.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco_eval.py @@ -64,7 +64,9 @@ def update(self, predictions): def synchronize_between_processes(self): for iou_type in self.iou_types: self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) - create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + create_common_coco_eval( + self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type] + ) def accumulate(self): for coco_eval in self.coco_eval.values(): @@ -125,7 +127,8 @@ def prepare_for_coco_segmentation(self, predictions): labels = prediction["labels"].tolist() rles = [ - mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] for mask in masks + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + for mask in masks ] for rle in rles: rle["counts"] = rle["counts"].decode("utf-8") @@ -245,7 +248,10 @@ def evaluate(self): evaluateImg = self.evaluateImg maxDet = p.maxDets[-1] evalImgs = [ - evaluateImg(imgId, catId, areaRng, maxDet) for catId in catIds for areaRng in p.areaRng for imgId in p.imgIds + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds ] # this is NOT in the pycocotools code, but could be done outside evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco_panoptic.py b/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco_panoptic.py index f33e52fadb..f0697b63b2 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco_panoptic.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/datasets/coco_panoptic.py @@ -40,7 +40,11 @@ def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_mas self.return_masks = return_masks def __getitem__(self, idx): - ann_info = self.coco["annotations"][idx] if "annotations" in self.coco else self.coco["images"][idx] + ann_info = ( + self.coco["annotations"][idx] + if "annotations" in self.coco + else self.coco["images"][idx] + ) img_path = Path(self.img_folder) / ann_info["file_name"].replace(".png", ".jpg") ann_path = Path(self.ann_folder) / ann_info["file_name"] @@ -54,10 +58,14 @@ def __getitem__(self, idx): masks = masks == ids[:, None, None] masks = torch.as_tensor(masks, dtype=torch.uint8) - labels = torch.tensor([ann["category_id"] for ann in ann_info["segments_info"]], dtype=torch.int64) + labels = torch.tensor( + [ann["category_id"] for ann in ann_info["segments_info"]], dtype=torch.int64 + ) target = {} - target["image_id"] = torch.tensor([ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]]) + target["image_id"] = torch.tensor( + [ann_info["image_id"] if "image_id" in ann_info else ann_info["id"]] + ) if self.return_masks: target["masks"] = masks target["labels"] = labels @@ -102,7 +110,11 @@ def build(image_set, args): ann_file = ann_folder_root / ann_file dataset = CocoPanoptic( - img_folder_path, ann_folder, ann_file, transforms=make_coco_transforms(image_set), return_masks=args.masks + img_folder_path, + ann_folder, + ann_file, + transforms=make_coco_transforms(image_set), + return_masks=args.masks, ) return dataset diff --git a/dimos/models/Detic/third_party/Deformable-DETR/datasets/data_prefetcher.py b/dimos/models/Detic/third_party/Deformable-DETR/datasets/data_prefetcher.py index f97cdd9c2f..731ebc19d4 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/datasets/data_prefetcher.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/datasets/data_prefetcher.py @@ -37,7 +37,9 @@ def preload(self): # at the time we start copying to next_*: # self.stream.wait_stream(torch.cuda.current_stream()) with torch.cuda.stream(self.stream): - self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) + self.next_samples, self.next_targets = to_cuda( + self.next_samples, self.next_targets, self.device + ) # more code for the alternative if record_stream() doesn't work: # copy_ will record the use of the pinned source tensor in this side stream. # self.next_input_gpu.copy_(self.next_input, non_blocking=True) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/datasets/panoptic_eval.py b/dimos/models/Detic/third_party/Deformable-DETR/datasets/panoptic_eval.py index 0dabffdb58..ad606603a9 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/datasets/panoptic_eval.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/datasets/panoptic_eval.py @@ -48,5 +48,10 @@ def summarize(self): predictions_json = os.path.join(self.output_dir, "predictions.json") with open(predictions_json, "w") as f: f.write(json.dumps(json_data)) - return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir) + return pq_compute( + self.gt_json, + predictions_json, + gt_folder=self.gt_folder, + pred_folder=self.output_dir, + ) return None diff --git a/dimos/models/Detic/third_party/Deformable-DETR/datasets/samplers.py b/dimos/models/Detic/third_party/Deformable-DETR/datasets/samplers.py index fe8cd673c1..a8892f7561 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/datasets/samplers.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/datasets/samplers.py @@ -28,7 +28,9 @@ class DistributedSampler(Sampler): rank (optional): Rank of the current process within num_replicas. """ - def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): + def __init__( + self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True + ): if num_replicas is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") @@ -87,7 +89,9 @@ class NodeDistributedSampler(Sampler): rank (optional): Rank of the current process within num_replicas. """ - def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): + def __init__( + self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True + ): if num_replicas is None: if not dist.is_available(): raise RuntimeError("Requires distributed package to be available") @@ -127,7 +131,10 @@ def __iter__(self): assert len(indices) == self.total_size_parts # subsample - indices = indices[self.rank // self.num_parts : self.total_size_parts : self.num_replicas // self.num_parts] + indices = indices[ + self.rank // self.num_parts : self.total_size_parts : self.num_replicas + // self.num_parts + ] assert len(indices) == self.num_samples return iter(indices) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/datasets/transforms.py b/dimos/models/Detic/third_party/Deformable-DETR/datasets/transforms.py index 542c4d968e..08a771d475 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/datasets/transforms.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/datasets/transforms.py @@ -73,7 +73,9 @@ def hflip(image, target): target = target.copy() if "boxes" in target: boxes = target["boxes"] - boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor( + [w, 0, w, 0] + ) target["boxes"] = boxes if "masks" in target: @@ -123,7 +125,9 @@ def get_size(image_size, size, max_size=None): target = target.copy() if "boxes" in target: boxes = target["boxes"] - scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height] + ) target["boxes"] = scaled_boxes if "area" in target: @@ -135,7 +139,9 @@ def get_size(image_size, size, max_size=None): target["size"] = torch.tensor([h, w]) if "masks" in target: - target["masks"] = interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + target["masks"] = ( + interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + ) return rescaled_image, target diff --git a/dimos/models/Detic/third_party/Deformable-DETR/engine.py b/dimos/models/Detic/third_party/Deformable-DETR/engine.py index 9717687083..f47471648c 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/engine.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/engine.py @@ -54,7 +54,9 @@ def train_one_epoch( # reduce losses over all GPUs for logging purposes loss_dict_reduced = utils.reduce_dict(loss_dict) loss_dict_reduced_unscaled = {f"{k}_unscaled": v for k, v in loss_dict_reduced.items()} - loss_dict_reduced_scaled = {k: v * weight_dict[k] for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_scaled = { + k: v * weight_dict[k] for k, v in loss_dict_reduced.items() if k in weight_dict + } losses_reduced_scaled = sum(loss_dict_reduced_scaled.values()) loss_value = losses_reduced_scaled.item() @@ -72,7 +74,9 @@ def train_one_epoch( grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm) optimizer.step() - metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) + metric_logger.update( + loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled + ) metric_logger.update(class_error=loss_dict_reduced["class_error"]) metric_logger.update(lr=optimizer.param_groups[0]["lr"]) metric_logger.update(grad_norm=grad_total_norm) @@ -115,10 +119,14 @@ def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, out # reduce losses over all GPUs for logging purposes loss_dict_reduced = utils.reduce_dict(loss_dict) - loss_dict_reduced_scaled = {k: v * weight_dict[k] for k, v in loss_dict_reduced.items() if k in weight_dict} + loss_dict_reduced_scaled = { + k: v * weight_dict[k] for k, v in loss_dict_reduced.items() if k in weight_dict + } loss_dict_reduced_unscaled = {f"{k}_unscaled": v for k, v in loss_dict_reduced.items()} metric_logger.update( - loss=sum(loss_dict_reduced_scaled.values()), **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled + loss=sum(loss_dict_reduced_scaled.values()), + **loss_dict_reduced_scaled, + **loss_dict_reduced_unscaled, ) metric_logger.update(class_error=loss_dict_reduced["class_error"]) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/main.py b/dimos/models/Detic/third_party/Deformable-DETR/main.py index 1652fbbbc5..ff91fd52a5 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/main.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/main.py @@ -31,14 +31,21 @@ def get_args_parser(): parser.add_argument("--lr", default=2e-4, type=float) parser.add_argument("--lr_backbone_names", default=["backbone.0"], type=str, nargs="+") parser.add_argument("--lr_backbone", default=2e-5, type=float) - parser.add_argument("--lr_linear_proj_names", default=["reference_points", "sampling_offsets"], type=str, nargs="+") + parser.add_argument( + "--lr_linear_proj_names", + default=["reference_points", "sampling_offsets"], + type=str, + nargs="+", + ) parser.add_argument("--lr_linear_proj_mult", default=0.1, type=float) parser.add_argument("--batch_size", default=2, type=int) parser.add_argument("--weight_decay", default=1e-4, type=float) parser.add_argument("--epochs", default=50, type=int) parser.add_argument("--lr_drop", default=40, type=int) parser.add_argument("--lr_drop_epochs", default=None, type=int, nargs="+") - parser.add_argument("--clip_max_norm", default=0.1, type=float, help="gradient clipping max norm") + parser.add_argument( + "--clip_max_norm", default=0.1, type=float, help="gradient clipping max norm" + ) parser.add_argument("--sgd", action="store_true") @@ -55,7 +62,9 @@ def get_args_parser(): ) # * Backbone - parser.add_argument("--backbone", default="resnet50", type=str, help="Name of the convolutional backbone to use") + parser.add_argument( + "--backbone", default="resnet50", type=str, help="Name of the convolutional backbone to use" + ) parser.add_argument( "--dilation", action="store_true", @@ -68,12 +77,20 @@ def get_args_parser(): choices=("sine", "learned"), help="Type of positional embedding to use on top of the image features", ) - parser.add_argument("--position_embedding_scale", default=2 * np.pi, type=float, help="position / size * scale") - parser.add_argument("--num_feature_levels", default=4, type=int, help="number of feature levels") + parser.add_argument( + "--position_embedding_scale", default=2 * np.pi, type=float, help="position / size * scale" + ) + parser.add_argument( + "--num_feature_levels", default=4, type=int, help="number of feature levels" + ) # * Transformer - parser.add_argument("--enc_layers", default=6, type=int, help="Number of encoding layers in the transformer") - parser.add_argument("--dec_layers", default=6, type=int, help="Number of decoding layers in the transformer") + parser.add_argument( + "--enc_layers", default=6, type=int, help="Number of encoding layers in the transformer" + ) + parser.add_argument( + "--dec_layers", default=6, type=int, help="Number of decoding layers in the transformer" + ) parser.add_argument( "--dim_feedforward", default=1024, @@ -81,18 +98,28 @@ def get_args_parser(): help="Intermediate size of the feedforward layers in the transformer blocks", ) parser.add_argument( - "--hidden_dim", default=256, type=int, help="Size of the embeddings (dimension of the transformer)" + "--hidden_dim", + default=256, + type=int, + help="Size of the embeddings (dimension of the transformer)", + ) + parser.add_argument( + "--dropout", default=0.1, type=float, help="Dropout applied in the transformer" ) - parser.add_argument("--dropout", default=0.1, type=float, help="Dropout applied in the transformer") parser.add_argument( - "--nheads", default=8, type=int, help="Number of attention heads inside the transformer's attentions" + "--nheads", + default=8, + type=int, + help="Number of attention heads inside the transformer's attentions", ) parser.add_argument("--num_queries", default=300, type=int, help="Number of query slots") parser.add_argument("--dec_n_points", default=4, type=int) parser.add_argument("--enc_n_points", default=4, type=int) # * Segmentation - parser.add_argument("--masks", action="store_true", help="Train segmentation head if the flag is provided") + parser.add_argument( + "--masks", action="store_true", help="Train segmentation head if the flag is provided" + ) # Loss parser.add_argument( @@ -103,9 +130,15 @@ def get_args_parser(): ) # * Matcher - parser.add_argument("--set_cost_class", default=2, type=float, help="Class coefficient in the matching cost") - parser.add_argument("--set_cost_bbox", default=5, type=float, help="L1 box coefficient in the matching cost") - parser.add_argument("--set_cost_giou", default=2, type=float, help="giou box coefficient in the matching cost") + parser.add_argument( + "--set_cost_class", default=2, type=float, help="Class coefficient in the matching cost" + ) + parser.add_argument( + "--set_cost_bbox", default=5, type=float, help="L1 box coefficient in the matching cost" + ) + parser.add_argument( + "--set_cost_giou", default=2, type=float, help="giou box coefficient in the matching cost" + ) # * Loss coefficients parser.add_argument("--mask_loss_coef", default=1, type=float) @@ -128,7 +161,9 @@ def get_args_parser(): parser.add_argument("--start_epoch", default=0, type=int, metavar="N", help="start epoch") parser.add_argument("--eval", action="store_true") parser.add_argument("--num_workers", default=2, type=int) - parser.add_argument("--cache_mode", default=False, action="store_true", help="whether to cache images on memory") + parser.add_argument( + "--cache_mode", default=False, action="store_true", help="whether to cache images on memory" + ) return parser @@ -170,7 +205,9 @@ def main(args): sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) - batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) + batch_sampler_train = torch.utils.data.BatchSampler( + sampler_train, args.batch_size, drop_last=True + ) data_loader_train = DataLoader( dataset_train, @@ -230,7 +267,9 @@ def match_name_keywords(n, name_keywords): }, ] if args.sgd: - optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) + optimizer = torch.optim.SGD( + param_dicts, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay + ) else: optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) @@ -253,16 +292,29 @@ def match_name_keywords(n, name_keywords): output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith("https"): - checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location="cpu", check_hash=True) + checkpoint = torch.hub.load_state_dict_from_url( + args.resume, map_location="cpu", check_hash=True + ) else: checkpoint = torch.load(args.resume, map_location="cpu") - missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint["model"], strict=False) - unexpected_keys = [k for k in unexpected_keys if not (k.endswith("total_params") or k.endswith("total_ops"))] + missing_keys, unexpected_keys = model_without_ddp.load_state_dict( + checkpoint["model"], strict=False + ) + unexpected_keys = [ + k + for k in unexpected_keys + if not (k.endswith("total_params") or k.endswith("total_ops")) + ] if len(missing_keys) > 0: print("Missing Keys: {}".format(missing_keys)) if len(unexpected_keys) > 0: print("Unexpected Keys: {}".format(unexpected_keys)) - if not args.eval and "optimizer" in checkpoint and "lr_scheduler" in checkpoint and "epoch" in checkpoint: + if ( + not args.eval + and "optimizer" in checkpoint + and "lr_scheduler" in checkpoint + and "epoch" in checkpoint + ): import copy p_groups = copy.deepcopy(optimizer.param_groups) @@ -279,7 +331,9 @@ def match_name_keywords(n, name_keywords): "Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler." ) lr_scheduler.step_size = args.lr_drop - lr_scheduler.base_lrs = list(map(lambda group: group["initial_lr"], optimizer.param_groups)) + lr_scheduler.base_lrs = list( + map(lambda group: group["initial_lr"], optimizer.param_groups) + ) lr_scheduler.step(lr_scheduler.last_epoch) args.start_epoch = checkpoint["epoch"] + 1 # check the resumed model @@ -301,7 +355,9 @@ def match_name_keywords(n, name_keywords): for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) - train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) + train_stats = train_one_epoch( + model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm + ) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / "checkpoint.pth"] @@ -343,7 +399,9 @@ def match_name_keywords(n, name_keywords): if epoch % 50 == 0: filenames.append(f"{epoch:03}.pth") for name in filenames: - torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) + torch.save( + coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name + ) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) @@ -351,7 +409,9 @@ def match_name_keywords(n, name_keywords): if __name__ == "__main__": - parser = argparse.ArgumentParser("Deformable DETR training and evaluation script", parents=[get_args_parser()]) + parser = argparse.ArgumentParser( + "Deformable DETR training and evaluation script", parents=[get_args_parser()] + ) args = parser.parse_args() if args.output_dir: Path(args.output_dir).mkdir(parents=True, exist_ok=True) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/backbone.py b/dimos/models/Detic/third_party/Deformable-DETR/models/backbone.py index 4b9e52e103..341dac2bde 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/backbone.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/backbone.py @@ -11,7 +11,6 @@ Backbone modules. """ - import torch import torch.nn.functional as F import torchvision @@ -69,7 +68,12 @@ class BackboneBase(nn.Module): def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): super().__init__() for name, parameter in backbone.named_parameters(): - if not train_backbone or "layer2" not in name and "layer3" not in name and "layer4" not in name: + if ( + not train_backbone + or "layer2" not in name + and "layer3" not in name + and "layer4" not in name + ): parameter.requires_grad_(False) if return_interm_layers: # return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} @@ -99,7 +103,9 @@ class Backbone(BackboneBase): def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, dilation: bool): norm_layer = FrozenBatchNorm2d backbone = getattr(torchvision.models, name)( - replace_stride_with_dilation=[False, False, dilation], pretrained=is_main_process(), norm_layer=norm_layer + replace_stride_with_dilation=[False, False, dilation], + pretrained=is_main_process(), + norm_layer=norm_layer, ) assert name not in ("resnet18", "resnet34"), "number of channels are hard coded" super().__init__(backbone, train_backbone, return_interm_layers) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/deformable_detr.py b/dimos/models/Detic/third_party/Deformable-DETR/models/deformable_detr.py index 8926adb47f..cce6571795 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/deformable_detr.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/deformable_detr.py @@ -29,7 +29,13 @@ from .backbone import build_backbone from .matcher import build_matcher -from .segmentation import DETRsegm, PostProcessPanoptic, PostProcessSegm, dice_loss, sigmoid_focal_loss +from .segmentation import ( + DETRsegm, + PostProcessPanoptic, + PostProcessSegm, + dice_loss, + sigmoid_focal_loss, +) from .deformable_transformer import build_deforamble_transformer import copy @@ -116,7 +122,9 @@ def __init__( nn.init.constant_(proj[0].bias, 0) # if two-stage, the last class_embed and bbox_embed is for region proposal generation - num_pred = (transformer.decoder.num_layers + 1) if two_stage else transformer.decoder.num_layers + num_pred = ( + (transformer.decoder.num_layers + 1) if two_stage else transformer.decoder.num_layers + ) if with_box_refine: self.class_embed = _get_clones(self.class_embed, num_pred) self.bbox_embed = _get_clones(self.bbox_embed, num_pred) @@ -177,8 +185,8 @@ def forward(self, samples: NestedTensor): query_embeds = None if not self.two_stage: query_embeds = self.query_embed.weight - hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = self.transformer( - srcs, masks, pos, query_embeds + hs, init_reference, inter_references, enc_outputs_class, enc_outputs_coord_unact = ( + self.transformer(srcs, masks, pos, query_embeds) ) outputs_classes = [] @@ -216,7 +224,10 @@ def _set_aux_loss(self, outputs_class, outputs_coord): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. - return [{"pred_logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])] + return [ + {"pred_logits": a, "pred_boxes": b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1]) + ] class SetCriterion(nn.Module): @@ -251,7 +262,9 @@ def loss_labels(self, outputs, targets, indices, num_boxes, log=True): idx = self._get_src_permutation_idx(indices) target_classes_o = torch.cat([t["labels"][J] for t, (_, J) in zip(targets, indices)]) - target_classes = torch.full(src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device) + target_classes = torch.full( + src_logits.shape[:2], self.num_classes, dtype=torch.int64, device=src_logits.device + ) target_classes[idx] = target_classes_o target_classes_onehot = torch.zeros( @@ -264,7 +277,9 @@ def loss_labels(self, outputs, targets, indices, num_boxes, log=True): target_classes_onehot = target_classes_onehot[:, :, :-1] loss_ce = ( - sigmoid_focal_loss(src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + sigmoid_focal_loss( + src_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2 + ) * src_logits.shape[1] ) losses = {"loss_ce": loss_ce} @@ -304,7 +319,9 @@ def loss_boxes(self, outputs, targets, indices, num_boxes): losses["loss_bbox"] = loss_bbox.sum() / num_boxes loss_giou = 1 - torch.diag( - box_ops.generalized_box_iou(box_ops.box_cxcywh_to_xyxy(src_boxes), box_ops.box_cxcywh_to_xyxy(target_boxes)) + box_ops.generalized_box_iou( + box_ops.box_cxcywh_to_xyxy(src_boxes), box_ops.box_cxcywh_to_xyxy(target_boxes) + ) ) losses["loss_giou"] = loss_giou.sum() / num_boxes return losses @@ -321,12 +338,16 @@ def loss_masks(self, outputs, targets, indices, num_boxes): src_masks = outputs["pred_masks"] # TODO use valid to mask invalid areas due to padding in loss - target_masks, valid = nested_tensor_from_tensor_list([t["masks"] for t in targets]).decompose() + target_masks, valid = nested_tensor_from_tensor_list( + [t["masks"] for t in targets] + ).decompose() target_masks = target_masks.to(src_masks) src_masks = src_masks[src_idx] # upsample predictions to the target size - src_masks = interpolate(src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False) + src_masks = interpolate( + src_masks[:, None], size=target_masks.shape[-2:], mode="bilinear", align_corners=False + ) src_masks = src_masks[:, 0].flatten(1) target_masks = target_masks[tgt_idx].flatten(1) @@ -366,14 +387,18 @@ def forward(self, outputs, targets): targets: list of dicts, such that len(targets) == batch_size. The expected keys in each dict depends on the losses applied, see each loss' doc """ - outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs" and k != "enc_outputs"} + outputs_without_aux = { + k: v for k, v in outputs.items() if k != "aux_outputs" and k != "enc_outputs" + } # Retrieve the matching between the outputs of the last layer and the targets indices = self.matcher(outputs_without_aux, targets) # Compute the average number of target boxes accross all nodes, for normalization purposes num_boxes = sum(len(t["labels"]) for t in targets) - num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device) + num_boxes = torch.as_tensor( + [num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device + ) if is_dist_avail_and_initialized(): torch.distributed.all_reduce(num_boxes) num_boxes = torch.clamp(num_boxes / get_world_size(), min=1).item() @@ -463,7 +488,9 @@ def __init__(self, input_dim, hidden_dim, output_dim, num_layers): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) def forward(self, x): for i, layer in enumerate(self.layers): @@ -510,7 +537,9 @@ def build(args): if args.masks: losses += ["masks"] # num_classes, matcher, weight_dict, losses, focal_alpha=0.25 - criterion = SetCriterion(num_classes, matcher, weight_dict, losses, focal_alpha=args.focal_alpha) + criterion = SetCriterion( + num_classes, matcher, weight_dict, losses, focal_alpha=args.focal_alpha + ) criterion.to(device) postprocessors = {"bbox": PostProcess()} if args.masks: diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/deformable_transformer.py b/dimos/models/Detic/third_party/Deformable-DETR/models/deformable_transformer.py index 5314b5ce16..6e75127833 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/deformable_transformer.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/deformable_transformer.py @@ -51,7 +51,9 @@ def __init__( decoder_layer = DeformableTransformerDecoderLayer( d_model, dim_feedforward, dropout, activation, num_feature_levels, nhead, dec_n_points ) - self.decoder = DeformableTransformerDecoder(decoder_layer, num_decoder_layers, return_intermediate_dec) + self.decoder = DeformableTransformerDecoder( + decoder_layer, num_decoder_layers, return_intermediate_dec + ) self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) @@ -115,9 +117,13 @@ def gen_encoder_output_proposals(self, memory, memory_padding_mask, spatial_shap proposals.append(proposal) _cur += H_ * W_ output_proposals = torch.cat(proposals, 1) - output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( + -1, keepdim=True + ) output_proposals = torch.log(output_proposals / (1 - output_proposals)) - output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill( + memory_padding_mask.unsqueeze(-1), float("inf") + ) output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) output_memory = memory @@ -157,31 +163,48 @@ def forward(self, srcs, masks, pos_embeds, query_embed=None): src_flatten = torch.cat(src_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) - spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) - level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=src_flatten.device + ) + level_start_index = torch.cat( + (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]) + ) valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) # encoder memory = self.encoder( - src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten + src_flatten, + spatial_shapes, + level_start_index, + valid_ratios, + lvl_pos_embed_flatten, + mask_flatten, ) # prepare input for decoder bs, _, c = memory.shape if self.two_stage: - output_memory, output_proposals = self.gen_encoder_output_proposals(memory, mask_flatten, spatial_shapes) + output_memory, output_proposals = self.gen_encoder_output_proposals( + memory, mask_flatten, spatial_shapes + ) # hack implementation for two-stage Deformable DETR enc_outputs_class = self.decoder.class_embed[self.decoder.num_layers](output_memory) - enc_outputs_coord_unact = self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals + enc_outputs_coord_unact = ( + self.decoder.bbox_embed[self.decoder.num_layers](output_memory) + output_proposals + ) topk = self.two_stage_num_proposals topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1] - topk_coords_unact = torch.gather(enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)) + topk_coords_unact = torch.gather( + enc_outputs_coord_unact, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4) + ) topk_coords_unact = topk_coords_unact.detach() reference_points = topk_coords_unact.sigmoid() init_reference_out = reference_points - pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact))) + pos_trans_out = self.pos_trans_norm( + self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)) + ) query_embed, tgt = torch.split(pos_trans_out, c, dim=2) else: query_embed, tgt = torch.split(query_embed, c, dim=1) @@ -192,17 +215,39 @@ def forward(self, srcs, masks, pos_embeds, query_embed=None): # decoder hs, inter_references = self.decoder( - tgt, reference_points, memory, spatial_shapes, level_start_index, valid_ratios, query_embed, mask_flatten + tgt, + reference_points, + memory, + spatial_shapes, + level_start_index, + valid_ratios, + query_embed, + mask_flatten, ) inter_references_out = inter_references if self.two_stage: - return hs, init_reference_out, inter_references_out, enc_outputs_class, enc_outputs_coord_unact + return ( + hs, + init_reference_out, + inter_references_out, + enc_outputs_class, + enc_outputs_coord_unact, + ) return hs, init_reference_out, inter_references_out, None, None class DeformableTransformerEncoderLayer(nn.Module): - def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + ): super().__init__() # self attention @@ -228,10 +273,17 @@ def forward_ffn(self, src): src = self.norm2(src) return src - def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): + def forward( + self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None + ): # self attention src2 = self.self_attn( - self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask + self.with_pos_embed(src, pos), + reference_points, + src, + spatial_shapes, + level_start_index, + padding_mask, ) src = src + self.dropout1(src2) src = self.norm1(src) @@ -264,17 +316,32 @@ def get_reference_points(spatial_shapes, valid_ratios, device): reference_points = reference_points[:, :, None] * valid_ratios[:, None] return reference_points - def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None): + def forward( + self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None + ): output = src - reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios, device=src.device + ) for _, layer in enumerate(self.layers): - output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) + output = layer( + output, pos, reference_points, spatial_shapes, level_start_index, padding_mask + ) return output class DeformableTransformerDecoderLayer(nn.Module): - def __init__(self, d_model=256, d_ffn=1024, dropout=0.1, activation="relu", n_levels=4, n_heads=8, n_points=4): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + ): super().__init__() # cross attention @@ -306,11 +373,20 @@ def forward_ffn(self, tgt): return tgt def forward( - self, tgt, query_pos, reference_points, src, src_spatial_shapes, level_start_index, src_padding_mask=None + self, + tgt, + query_pos, + reference_points, + src, + src_spatial_shapes, + level_start_index, + src_padding_mask=None, ): # self attention q = k = self.with_pos_embed(tgt, query_pos) - tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[0].transpose(0, 1) + tgt2 = self.self_attn(q.transpose(0, 1), k.transpose(0, 1), tgt.transpose(0, 1))[ + 0 + ].transpose(0, 1) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) @@ -360,7 +436,8 @@ def forward( for lid, layer in enumerate(self.layers): if reference_points.shape[-1] == 4: reference_points_input = ( - reference_points[:, :, None] * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None] + reference_points[:, :, None] + * torch.cat([src_valid_ratios, src_valid_ratios], -1)[:, None] ) else: assert reference_points.shape[-1] == 2 diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/matcher.py b/dimos/models/Detic/third_party/Deformable-DETR/models/matcher.py index 5603646561..29838972ab 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/matcher.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/matcher.py @@ -82,16 +82,27 @@ def forward(self, outputs, targets): cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) # Compute the giou cost betwen boxes - cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox)) + cost_giou = -generalized_box_iou( + box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox) + ) # Final cost matrix - C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou + C = ( + self.cost_bbox * cost_bbox + + self.cost_class * cost_class + + self.cost_giou * cost_giou + ) C = C.view(bs, num_queries, -1).cpu() sizes = [len(v["boxes"]) for v in targets] indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] - return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + return [ + (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) + for i, j in indices + ] def build_matcher(args): - return HungarianMatcher(cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou) + return HungarianMatcher( + cost_class=args.set_cost_class, cost_bbox=args.set_cost_bbox, cost_giou=args.set_cost_giou + ) diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/ops/functions/ms_deform_attn_func.py b/dimos/models/Detic/third_party/Deformable-DETR/models/ops/functions/ms_deform_attn_func.py index ed75727364..c18582590e 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/ops/functions/ms_deform_attn_func.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/ops/functions/ms_deform_attn_func.py @@ -21,21 +21,42 @@ class MSDeformAttnFunction(Function): @staticmethod def forward( - ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step + ctx, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, ): ctx.im2col_step = im2col_step output = MSDA.ms_deform_attn_forward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ctx.im2col_step, ) ctx.save_for_backward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, ) return output @staticmethod @once_differentiable def backward(ctx, grad_output): - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = ctx.saved_tensors grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward( value, value_spatial_shapes, @@ -69,5 +90,9 @@ def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, sampling_value_list.append(sampling_value_l_) # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) attention_weights = attention_weights.transpose(1, 2).reshape(N_ * M_, 1, Lq_, L_ * P_) - output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_ * D_, Lq_) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(N_, M_ * D_, Lq_) + ) return output.transpose(1, 2).contiguous() diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/ops/modules/ms_deform_attn.py b/dimos/models/Detic/third_party/Deformable-DETR/models/ops/modules/ms_deform_attn.py index 7a0f863d97..bc02668b96 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/ops/modules/ms_deform_attn.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/ops/modules/ms_deform_attn.py @@ -38,7 +38,9 @@ def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): """ super().__init__() if d_model % n_heads != 0: - raise ValueError("d_model must be divisible by n_heads, but got {} and {}".format(d_model, n_heads)) + raise ValueError( + "d_model must be divisible by n_heads, but got {} and {}".format(d_model, n_heads) + ) _d_per_head = d_model // n_heads # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_head): @@ -109,12 +111,20 @@ def forward( if input_padding_mask is not None: value = value.masked_fill(input_padding_mask[..., None], float(0)) value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) - sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) - attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) - attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) + sampling_offsets = self.sampling_offsets(query).view( + N, Len_q, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(query).view( + N, Len_q, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + N, Len_q, self.n_heads, self.n_levels, self.n_points + ) # N, Len_q, n_heads, n_levels, n_points, 2 if reference_points.shape[-1] == 2: - offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) + offset_normalizer = torch.stack( + [input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1 + ) sampling_locations = ( reference_points[:, :, None, :, None, :] + sampling_offsets / offset_normalizer[None, None, None, :, None, :] @@ -126,7 +136,9 @@ def forward( ) else: raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.".format(reference_points.shape[-1]) + "Last dim of reference_points must be 2 or 4, but get {} instead.".format( + reference_points.shape[-1] + ) ) output = MSDeformAttnFunction.apply( value, diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/ops/test.py b/dimos/models/Detic/third_party/Deformable-DETR/models/ops/test.py index e49ce3a324..3fa3c7da6d 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/ops/test.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/ops/test.py @@ -34,7 +34,9 @@ def check_forward_equal_with_pytorch_double(): attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 output_pytorch = ( - ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()) + ms_deform_attn_core_pytorch( + value.double(), shapes, sampling_locations.double(), attention_weights.double() + ) .detach() .cpu() ) @@ -66,9 +68,15 @@ def check_forward_equal_with_pytorch_float(): attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) im2col_step = 2 - output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() + output_pytorch = ( + ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights) + .detach() + .cpu() + ) output_cuda = ( - MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step) + MSDeformAttnFunction.apply( + value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step + ) .detach() .cpu() ) @@ -81,7 +89,9 @@ def check_forward_equal_with_pytorch_float(): ) -def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): +def check_gradient_numerical( + channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True +): value = torch.rand(N, S, M, channels).cuda() * 0.01 sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/position_encoding.py b/dimos/models/Detic/third_party/Deformable-DETR/models/position_encoding.py index a05865b7f3..c0ab1b34c3 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/position_encoding.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/position_encoding.py @@ -52,8 +52,12 @@ def forward(self, tensor_list: NestedTensor): pos_x = x_embed[:, :, :, None] / dim_t pos_y = y_embed[:, :, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) return pos diff --git a/dimos/models/Detic/third_party/Deformable-DETR/models/segmentation.py b/dimos/models/Detic/third_party/Deformable-DETR/models/segmentation.py index 05b998bf01..edb3f0a3c4 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/models/segmentation.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/models/segmentation.py @@ -57,14 +57,19 @@ def forward(self, samples: NestedTensor): out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord[-1]} if self.detr.aux_loss: out["aux_outputs"] = [ - {"pred_logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1]) + {"pred_logits": a, "pred_boxes": b} + for a, b in zip(outputs_class[:-1], outputs_coord[:-1]) ] # FIXME h_boxes takes the last one computed, keep this in mind bbox_mask = self.bbox_attention(hs[-1], memory, mask=mask) - seg_masks = self.mask_head(src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors]) - outputs_seg_masks = seg_masks.view(bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1]) + seg_masks = self.mask_head( + src_proj, bbox_mask, [features[2].tensors, features[1].tensors, features[0].tensors] + ) + outputs_seg_masks = seg_masks.view( + bs, self.detr.num_queries, seg_masks.shape[-2], seg_masks.shape[-1] + ) out["pred_masks"] = outputs_seg_masks return out @@ -79,7 +84,14 @@ class MaskHeadSmallConv(nn.Module): def __init__(self, dim, fpn_dims, context_dim): super().__init__() - inter_dims = [dim, context_dim // 2, context_dim // 4, context_dim // 8, context_dim // 16, context_dim // 64] + inter_dims = [ + dim, + context_dim // 2, + context_dim // 4, + context_dim // 8, + context_dim // 16, + context_dim // 64, + ] self.lay1 = torch.nn.Conv2d(dim, dim, 3, padding=1) self.gn1 = torch.nn.GroupNorm(8, dim) self.lay2 = torch.nn.Conv2d(dim, inter_dims[1], 3, padding=1) @@ -166,7 +178,9 @@ def forward(self, q, k, mask=None): q = self.q_linear(q) k = F.conv2d(k, self.k_linear.weight.unsqueeze(-1).unsqueeze(-1), self.k_linear.bias) qh = q.view(q.shape[0], q.shape[1], self.num_heads, self.hidden_dim // self.num_heads) - kh = k.view(k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1]) + kh = k.view( + k.shape[0], self.num_heads, self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1] + ) weights = torch.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) if mask is not None: @@ -232,10 +246,14 @@ def forward(self, results, outputs, orig_target_sizes, max_target_sizes): assert len(orig_target_sizes) == len(max_target_sizes) max_h, max_w = max_target_sizes.max(0)[0].tolist() outputs_masks = outputs["pred_masks"].squeeze(2) - outputs_masks = F.interpolate(outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False) + outputs_masks = F.interpolate( + outputs_masks, size=(max_h, max_w), mode="bilinear", align_corners=False + ) outputs_masks = (outputs_masks.sigmoid() > self.threshold).cpu() - for i, (cur_mask, t, tt) in enumerate(zip(outputs_masks, max_target_sizes, orig_target_sizes)): + for i, (cur_mask, t, tt) in enumerate( + zip(outputs_masks, max_target_sizes, orig_target_sizes) + ): img_h, img_w = t[0], t[1] results[i]["masks"] = cur_mask[:, :img_h, :img_w].unsqueeze(1) results[i]["masks"] = F.interpolate( @@ -272,7 +290,11 @@ def forward(self, outputs, processed_sizes, target_sizes=None): if target_sizes is None: target_sizes = processed_sizes assert len(processed_sizes) == len(target_sizes) - out_logits, raw_masks, raw_boxes = outputs["pred_logits"], outputs["pred_masks"], outputs["pred_boxes"] + out_logits, raw_masks, raw_boxes = ( + outputs["pred_logits"], + outputs["pred_masks"], + outputs["pred_boxes"], + ) assert len(out_logits) == len(raw_masks) == len(target_sizes) preds = [] @@ -330,7 +352,9 @@ def get_ids_area(masks, scores, dedup=False): seg_img = seg_img.resize(size=(final_w, final_h), resample=Image.NEAREST) np_seg_img = ( - torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())).view(final_h, final_w, 3).numpy() + torch.ByteTensor(torch.ByteStorage.from_buffer(seg_img.tobytes())) + .view(final_h, final_w, 3) + .numpy() ) m_id = torch.from_numpy(rgb2id(np_seg_img)) @@ -344,7 +368,9 @@ def get_ids_area(masks, scores, dedup=False): # We know filter empty masks as long as we find some while True: filtered_small = torch.as_tensor( - [area[i] <= 4 for i, c in enumerate(cur_classes)], dtype=torch.bool, device=keep.device + [area[i] <= 4 for i, c in enumerate(cur_classes)], + dtype=torch.bool, + device=keep.device, ) if filtered_small.any().item(): cur_scores = cur_scores[~filtered_small] @@ -360,7 +386,9 @@ def get_ids_area(masks, scores, dedup=False): segments_info = [] for i, a in enumerate(area): cat = cur_classes[i].item() - segments_info.append({"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a}) + segments_info.append( + {"id": i, "isthing": self.is_thing_map[cat], "category_id": cat, "area": a} + ) del cur_classes with io.BytesIO() as out: diff --git a/dimos/models/Detic/third_party/Deformable-DETR/tools/launch.py b/dimos/models/Detic/third_party/Deformable-DETR/tools/launch.py index 6ef7faef94..9e9fdfea2c 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/tools/launch.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/tools/launch.py @@ -108,7 +108,6 @@ from argparse import ArgumentParser, REMAINDER - def parse_args(): """ Helper function parsing the command line options @@ -121,9 +120,14 @@ def parse_args(): ) # Optional arguments for the launch helper - parser.add_argument("--nnodes", type=int, default=1, help="The number of nodes to use for distributed training") parser.add_argument( - "--node_rank", type=int, default=0, help="The rank of the node for multi-node distributed training" + "--nnodes", type=int, default=1, help="The number of nodes to use for distributed training" + ) + parser.add_argument( + "--node_rank", + type=int, + default=0, + help="The rank of the node for multi-node distributed training", ) parser.add_argument( "--nproc_per_node", diff --git a/dimos/models/Detic/third_party/Deformable-DETR/util/misc.py b/dimos/models/Detic/third_party/Deformable-DETR/util/misc.py index 52079399dd..661807da15 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/util/misc.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/util/misc.py @@ -116,7 +116,11 @@ def value(self): def __str__(self): return self.fmt.format( - median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value, ) @@ -245,7 +249,14 @@ def log_every(self, iterable, print_freq, header=None): ) else: log_msg = self.delimiter.join( - [header, "[{0" + space_fmt + "}/{1}]", "eta: {eta}", "{meters}", "time: {time}", "data: {data}"] + [ + header, + "[{0" + space_fmt + "}/{1}]", + "eta: {eta}", + "{meters}", + "time: {time}", + "data: {data}", + ] ) MB = 1024.0 * 1024.0 for obj in iterable: @@ -270,14 +281,23 @@ def log_every(self, iterable, print_freq, header=None): else: print( log_msg.format( - i, len(iterable), eta=eta_string, meters=str(self), time=str(iter_time), data=str(data_time) + i, + len(iterable), + eta=eta_string, + meters=str(self), + time=str(iter_time), + data=str(data_time), ) ) i += 1 end = time.time() total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print("{} Total time: {} ({:.4f} s / it)".format(header, total_time_str, total_time / len(iterable))) + print( + "{} Total time: {} ({:.4f} s / it)".format( + header, total_time_str, total_time / len(iterable) + ) + ) def get_sha(): @@ -455,7 +475,10 @@ def init_distributed_mode(args): args.dist_backend = "nccl" print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True) torch.distributed.init_process_group( - backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank + backend=args.dist_backend, + init_method=args.dist_url, + world_size=args.world_size, + rank=args.rank, ) torch.distributed.barrier() setup_for_distributed(args.rank == 0) @@ -505,7 +528,8 @@ def get_total_grad_norm(parameters, norm_type=2): norm_type = float(norm_type) device = parameters[0].grad.device total_norm = torch.norm( - torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type + torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), + norm_type, ) return total_norm diff --git a/dimos/models/Detic/third_party/Deformable-DETR/util/plot_utils.py b/dimos/models/Detic/third_party/Deformable-DETR/util/plot_utils.py index d4568bfb87..3bbb97b3d1 100644 --- a/dimos/models/Detic/third_party/Deformable-DETR/util/plot_utils.py +++ b/dimos/models/Detic/third_party/Deformable-DETR/util/plot_utils.py @@ -19,7 +19,9 @@ from pathlib import Path, PurePath -def plot_logs(logs, fields=("class_error", "loss_bbox_unscaled", "mAP"), ewm_col=0, log_name="log.txt"): +def plot_logs( + logs, fields=("class_error", "loss_bbox_unscaled", "mAP"), ewm_col=0, log_name="log.txt" +): """ Function to plot specific fields from training log(s). Plots both training and test results. @@ -50,7 +52,9 @@ def plot_logs(logs, fields=("class_error", "loss_bbox_unscaled", "mAP"), ewm_col # verify valid dir(s) and that every item in list is Path object for i, dir in enumerate(logs): if not isinstance(dir, PurePath): - raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") + raise ValueError( + f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}" + ) if dir.exists(): continue raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") @@ -63,11 +67,18 @@ def plot_logs(logs, fields=("class_error", "loss_bbox_unscaled", "mAP"), ewm_col for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): for j, field in enumerate(fields): if field == "mAP": - coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() + coco_eval = ( + pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]) + .ewm(com=ewm_col) + .mean() + ) axs[j].plot(coco_eval, c=color) else: df.interpolate().ewm(com=ewm_col).mean().plot( - y=[f"train_{field}", f"test_{field}"], ax=axs[j], color=[color] * 2, style=["-", "--"] + y=[f"train_{field}", f"test_{field}"], + ax=axs[j], + color=[color] * 2, + style=["-", "--"], ) for ax, field in zip(axs, fields): ax.legend([Path(p).name for p in logs]) diff --git a/dimos/models/Detic/tools/create_imagenetlvis_json.py b/dimos/models/Detic/tools/create_imagenetlvis_json.py index 4733b857a2..54883d7337 100644 --- a/dimos/models/Detic/tools/create_imagenetlvis_json.py +++ b/dimos/models/Detic/tools/create_imagenetlvis_json.py @@ -9,7 +9,9 @@ parser = argparse.ArgumentParser() parser.add_argument("--imagenet_path", default="datasets/imagenet/ImageNet-LVIS") parser.add_argument("--lvis_meta_path", default="datasets/lvis/lvis_v1_val.json") - parser.add_argument("--out_path", default="datasets/imagenet/annotations/imagenet_lvis_image_info.json") + parser.add_argument( + "--out_path", default="datasets/imagenet/annotations/imagenet_lvis_image_info.json" + ) args = parser.parse_args() print("Loading LVIS meta") @@ -34,7 +36,13 @@ # img = cv2.imread('{}/{}'.format(args.imagenet_path, file_name)) img = read_image("{}/{}".format(args.imagenet_path, file_name)) h, w = img.shape[:2] - image = {"id": count, "file_name": file_name, "pos_category_ids": [cat_id], "width": w, "height": h} + image = { + "id": count, + "file_name": file_name, + "pos_category_ids": [cat_id], + "width": w, + "height": h, + } cat_images.append(image) images.extend(cat_images) image_counts[cat_id] = len(cat_images) diff --git a/dimos/models/Detic/tools/create_lvis_21k.py b/dimos/models/Detic/tools/create_lvis_21k.py index 4526f7f2fd..05e9530181 100644 --- a/dimos/models/Detic/tools/create_lvis_21k.py +++ b/dimos/models/Detic/tools/create_lvis_21k.py @@ -5,7 +5,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--imagenet_path", default="datasets/imagenet/annotations/imagenet-21k_image_info.json") + parser.add_argument( + "--imagenet_path", default="datasets/imagenet/annotations/imagenet-21k_image_info.json" + ) parser.add_argument("--lvis_path", default="datasets/lvis/lvis_v1_train.json") parser.add_argument("--save_categories", default="") parser.add_argument("--not_save_imagenet", action="store_true") diff --git a/dimos/models/Detic/tools/dump_clip_features.py b/dimos/models/Detic/tools/dump_clip_features.py index 803b19d365..941fe221ed 100644 --- a/dimos/models/Detic/tools/dump_clip_features.py +++ b/dimos/models/Detic/tools/dump_clip_features.py @@ -52,7 +52,9 @@ sentences_synonyms = [["a photo of a {}".format(xx) for xx in x] for x in synonyms] elif args.prompt == "scene": sentences = ["a photo of a {} in the scene".format(x) for x in cat_names] - sentences_synonyms = [["a photo of a {} in the scene".format(xx) for xx in x] for x in synonyms] + sentences_synonyms = [ + ["a photo of a {} in the scene".format(xx) for xx in x] for x in synonyms + ] print("sentences_synonyms", len(sentences_synonyms), sum(len(x) for x in sentences_synonyms)) if args.model == "clip": @@ -67,7 +69,11 @@ with torch.no_grad(): if len(text) > 10000: text_features = torch.cat( - [model.encode_text(text[: len(text) // 2]), model.encode_text(text[len(text) // 2 :])], dim=0 + [ + model.encode_text(text[: len(text) // 2]), + model.encode_text(text[len(text) // 2 :]), + ], + dim=0, ) else: text_features = model.encode_text(text) diff --git a/dimos/models/Detic/tools/fix_o365_path.py b/dimos/models/Detic/tools/fix_o365_path.py index 3f0b8c306e..8e0b476323 100644 --- a/dimos/models/Detic/tools/fix_o365_path.py +++ b/dimos/models/Detic/tools/fix_o365_path.py @@ -6,7 +6,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--ann", default="datasets/objects365/annotations/zhiyuan_objv2_train_fixname.json") + parser.add_argument( + "--ann", default="datasets/objects365/annotations/zhiyuan_objv2_train_fixname.json" + ) parser.add_argument("--img_dir", default="datasets/objects365/train/") args = parser.parse_args() diff --git a/dimos/models/Detic/tools/get_cc_tags.py b/dimos/models/Detic/tools/get_cc_tags.py index f9e7a71c49..52aa05445c 100644 --- a/dimos/models/Detic/tools/get_cc_tags.py +++ b/dimos/models/Detic/tools/get_cc_tags.py @@ -135,7 +135,9 @@ def map_name(x): id2cat = {x["id"]: x for x in cc_data["categories"]} class_count = {x["id"]: 0 for x in cc_data["categories"]} - class_data = {x["id"]: [" " + map_name(xx) + " " for xx in x["synonyms"]] for x in cc_data["categories"]} + class_data = { + x["id"]: [" " + map_name(xx) + " " for xx in x["synonyms"]] for x in cc_data["categories"] + } num_examples = 5 examples = {x["id"]: [] for x in cc_data["categories"]} @@ -177,7 +179,11 @@ def map_name(x): # if x['frequency'] == freq] and class_count[x['id']] > 0)) for freq in ["r", "c", "f"]: - print("#Images", freq, sum([v for k, v in class_count.items() if id2cat[k]["frequency"] == freq])) + print( + "#Images", + freq, + sum([v for k, v in class_count.items() if id2cat[k]["frequency"] == freq]), + ) try: out_data = {"images": images, "categories": cc_data["categories"], "annotations": []} diff --git a/dimos/models/Detic/tools/get_coco_zeroshot_oriorder.py b/dimos/models/Detic/tools/get_coco_zeroshot_oriorder.py index 843f2ab56e..874d378d48 100644 --- a/dimos/models/Detic/tools/get_coco_zeroshot_oriorder.py +++ b/dimos/models/Detic/tools/get_coco_zeroshot_oriorder.py @@ -4,7 +4,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--data_path", default="datasets/coco/annotations/instances_val2017_unseen_2.json") + parser.add_argument( + "--data_path", default="datasets/coco/annotations/instances_val2017_unseen_2.json" + ) parser.add_argument("--cat_path", default="datasets/coco/annotations/instances_val2017.json") args = parser.parse_args() print("Loading", args.cat_path) diff --git a/dimos/models/Detic/tools/get_imagenet_21k_full_tar_json.py b/dimos/models/Detic/tools/get_imagenet_21k_full_tar_json.py index 5509ae7335..2f19a6cf91 100644 --- a/dimos/models/Detic/tools/get_imagenet_21k_full_tar_json.py +++ b/dimos/models/Detic/tools/get_imagenet_21k_full_tar_json.py @@ -18,7 +18,9 @@ parser.add_argument("--imagenet_dir", default="datasets/imagenet/ImageNet-21k/") parser.add_argument("--tarfile_path", default="datasets/imagenet/metadata-22k/tar_files.npy") parser.add_argument("--tar_index_dir", default="datasets/imagenet/metadata-22k/tarindex_npy") - parser.add_argument("--out_path", default="datasets/imagenet/annotations/imagenet-22k_image_info.json") + parser.add_argument( + "--out_path", default="datasets/imagenet/annotations/imagenet-22k_image_info.json" + ) parser.add_argument("--workers", default=16, type=int) args = parser.parse_args() diff --git a/dimos/models/Detic/tools/merge_lvis_coco.py b/dimos/models/Detic/tools/merge_lvis_coco.py index 93fc232f8f..5ef480d28e 100644 --- a/dimos/models/Detic/tools/merge_lvis_coco.py +++ b/dimos/models/Detic/tools/merge_lvis_coco.py @@ -123,7 +123,9 @@ def get_bbox(ann): synset2lvisid = {x["synset"]: x["id"] for x in lvis_cats} # cocoid2synset = {x['coco_cat_id']: x['synset'] for x in COCO_SYNSET_CATEGORIES} coco2lviscats = { - x["coco_cat_id"]: synset2lvisid[x["synset"]] for x in COCO_SYNSET_CATEGORIES if x["synset"] in synset2lvisid + x["coco_cat_id"]: synset2lvisid[x["synset"]] + for x in COCO_SYNSET_CATEGORIES + if x["synset"] in synset2lvisid } print(len(coco2lviscats)) @@ -168,7 +170,8 @@ def get_bbox(ann): coco_anns = coco_img2anns[file_name] lvis_anns = lvis_img2anns[file_name] ious = pairwise_iou( - Boxes(torch.tensor([get_bbox(x) for x in coco_anns])), Boxes(torch.tensor([get_bbox(x) for x in lvis_anns])) + Boxes(torch.tensor([get_bbox(x) for x in coco_anns])), + Boxes(torch.tensor([get_bbox(x) for x in lvis_anns])), ) for ann in lvis_anns: @@ -184,7 +187,10 @@ def get_bbox(ann): else: duplicated = False for j in range(len(ious[i])): - if ious[i, j] >= THRESH and coco_anns[i]["category_id"] == lvis_anns[j]["category_id"]: + if ( + ious[i, j] >= THRESH + and coco_anns[i]["category_id"] == lvis_anns[j]["category_id"] + ): duplicated = True if not duplicated: ann_id_count = ann_id_count + 1 diff --git a/dimos/models/Detic/tools/preprocess_imagenet22k.py b/dimos/models/Detic/tools/preprocess_imagenet22k.py index 1c30bb3a95..f4ea6fcbfe 100644 --- a/dimos/models/Detic/tools/preprocess_imagenet22k.py +++ b/dimos/models/Detic/tools/preprocess_imagenet22k.py @@ -89,7 +89,9 @@ def preprocess(): for log_file in log_files: syn = log_file.replace(".tarlog", "") dataset = _RawTarDataset( - os.path.join(i22kdir, syn + ".tar"), os.path.join(i22ktarlogs, syn + ".tarlog"), preload=False + os.path.join(i22kdir, syn + ".tar"), + os.path.join(i22ktarlogs, syn + ".tarlog"), + preload=False, ) names = np.array(dataset.names) offsets = np.array(dataset.offsets, dtype=np.int64) diff --git a/dimos/models/Detic/tools/remove_lvis_rare.py b/dimos/models/Detic/tools/remove_lvis_rare.py index 6083121422..2e1705d50c 100644 --- a/dimos/models/Detic/tools/remove_lvis_rare.py +++ b/dimos/models/Detic/tools/remove_lvis_rare.py @@ -12,7 +12,9 @@ catid2freq = {x["id"]: x["frequency"] for x in data["categories"]} print("ori #anns", len(data["annotations"])) exclude = ["r"] - data["annotations"] = [x for x in data["annotations"] if catid2freq[x["category_id"]] not in exclude] + data["annotations"] = [ + x for x in data["annotations"] if catid2freq[x["category_id"]] not in exclude + ] print("filtered #anns", len(data["annotations"])) out_path = args.ann[:-5] + "_norare.json" print("Saving to", out_path) diff --git a/dimos/models/Detic/train_net.py b/dimos/models/Detic/train_net.py index e18f4de9a9..53699045bd 100644 --- a/dimos/models/Detic/train_net.py +++ b/dimos/models/Detic/train_net.py @@ -101,14 +101,20 @@ def do_train(cfg, model, resume=False): optimizer = build_optimizer(cfg, model) scheduler = build_lr_scheduler(cfg, optimizer) - checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) + checkpointer = DetectionCheckpointer( + model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler + ) - start_iter = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + start_iter = ( + checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + ) if not resume: start_iter = 0 max_iter = cfg.SOLVER.MAX_ITER if cfg.SOLVER.TRAIN_ITER < 0 else cfg.SOLVER.TRAIN_ITER - periodic_checkpointer = PeriodicCheckpointer(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) + periodic_checkpointer = PeriodicCheckpointer( + checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter + ) writers = ( [ @@ -174,7 +180,11 @@ def do_train(cfg, model, resume=False): data_timer.reset() scheduler.step() - if cfg.TEST.EVAL_PERIOD > 0 and iteration % cfg.TEST.EVAL_PERIOD == 0 and iteration != max_iter: + if ( + cfg.TEST.EVAL_PERIOD > 0 + and iteration % cfg.TEST.EVAL_PERIOD == 0 + and iteration != max_iter + ): do_test(cfg, model) comm.synchronize() @@ -184,7 +194,9 @@ def do_train(cfg, model, resume=False): periodic_checkpointer.step(iteration) total_time = time.perf_counter() - start_time - logger.info("Total training time: {}".format(str(datetime.timedelta(seconds=int(total_time))))) + logger.info( + "Total training time: {}".format(str(datetime.timedelta(seconds=int(total_time)))) + ) def setup(args): @@ -212,7 +224,9 @@ def main(args): model = build_model(cfg) logger.info("Model:\n{}".format(model)) if args.eval_only: - DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(cfg.MODEL.WEIGHTS, resume=args.resume) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) return do_test(cfg, model) @@ -238,7 +252,9 @@ def main(args): if args.dist_url == "host": args.dist_url = "tcp://{}:12345".format(os.environ["SLURM_JOB_NODELIST"]) elif not args.dist_url.startswith("tcp"): - tmp = os.popen("echo $(scontrol show job {} | grep BatchHost)".format(args.dist_url)).read() + tmp = os.popen( + "echo $(scontrol show job {} | grep BatchHost)".format(args.dist_url) + ).read() tmp = tmp[tmp.find("=") + 1 : -1] args.dist_url = "tcp://{}:12345".format(tmp) print("Command Line Args:", args) diff --git a/dimos/models/depth/metric3d.py b/dimos/models/depth/metric3d.py index e120a412a6..58cb63f640 100644 --- a/dimos/models/depth/metric3d.py +++ b/dimos/models/depth/metric3d.py @@ -27,7 +27,9 @@ class Metric3D: def __init__(self, gt_depth_scale=256.0): # self.conf = get_config("zoedepth", "infer") # self.depth_model = build_model(self.conf) - self.depth_model = torch.hub.load("yvanyin/metric3d", "metric3d_vit_small", pretrain=True).cuda() + self.depth_model = torch.hub.load( + "yvanyin/metric3d", "metric3d_vit_small", pretrain=True + ).cuda() if torch.cuda.device_count() > 1: print(f"Using {torch.cuda.device_count()} GPUs!") # self.depth_model = torch.nn.DataParallel(self.depth_model) @@ -74,7 +76,9 @@ def infer_depth(self, img, debug=False): # Convert to PIL format depth_image = self.unpad_transform_depth(pred_depth) - out_16bit_numpy = (depth_image.squeeze().cpu().numpy() * self.gt_depth_scale).astype(np.uint16) + out_16bit_numpy = (depth_image.squeeze().cpu().numpy() * self.gt_depth_scale).astype( + np.uint16 + ) depth_map_pil = Image.fromarray(out_16bit_numpy) return depth_map_pil @@ -94,7 +98,9 @@ def rescale_input(self, rgb, rgb_origin): # input_size = (544, 1216) # for convnext model h, w = rgb_origin.shape[:2] scale = min(input_size[0] / h, input_size[1] / w) - rgb = cv2.resize(rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR) + rgb = cv2.resize( + rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR + ) # remember to scale intrinsic, hold depth self.intrinsic_scaled = [ self.intrinsic[0] * scale, @@ -110,7 +116,13 @@ def rescale_input(self, rgb, rgb_origin): pad_h_half = pad_h // 2 pad_w_half = pad_w // 2 rgb = cv2.copyMakeBorder( - rgb, pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half, cv2.BORDER_CONSTANT, value=padding + rgb, + pad_h_half, + pad_h - pad_h_half, + pad_w_half, + pad_w - pad_w_half, + cv2.BORDER_CONSTANT, + value=padding, ) self.pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half] @@ -137,7 +149,9 @@ def unpad_transform_depth(self, pred_depth): ###################### canonical camera space ###################### #### de-canonical transform - canonical_to_real_scale = self.intrinsic_scaled[0] / 1000.0 # 1000.0 is the focal length of canonical camera + canonical_to_real_scale = ( + self.intrinsic_scaled[0] / 1000.0 + ) # 1000.0 is the focal length of canonical camera pred_depth = pred_depth * canonical_to_real_scale # now the depth is metric pred_depth = torch.clamp(pred_depth, 0, 1000) return pred_depth diff --git a/dimos/models/labels/llava-34b.py b/dimos/models/labels/llava-34b.py index 4c330f3345..c59a5c8aa9 100644 --- a/dimos/models/labels/llava-34b.py +++ b/dimos/models/labels/llava-34b.py @@ -34,14 +34,21 @@ def __init__( if gpu: n_gpu_layers = -1 self.llm = Llama( - model_path=model_path, chat_handler=chat_handler, n_ctx=2048, logits_all=True, n_gpu_layers=n_gpu_layers + model_path=model_path, + chat_handler=chat_handler, + n_ctx=2048, + logits_all=True, + n_gpu_layers=n_gpu_layers, ) def run_inference(self, image, prompt, return_json=True): data_uri = image_to_base64_data_uri(image) res = self.llm.create_chat_completion( messages=[ - {"role": "system", "content": "You are an assistant who perfectly describes images."}, + { + "role": "system", + "content": "You are an assistant who perfectly describes images.", + }, { "role": "user", "content": [ @@ -52,7 +59,13 @@ def run_inference(self, image, prompt, return_json=True): ] ) if return_json: - return list(set(self.extract_descriptions_from_incomplete_json(res["choices"][0]["message"]["content"]))) + return list( + set( + self.extract_descriptions_from_incomplete_json( + res["choices"][0]["message"]["content"] + ) + ) + ) return res["choices"][0]["message"]["content"] @@ -69,7 +82,9 @@ def extract_descriptions_from_incomplete_json(self, json_like_str): try: json_obj = json.loads(json_str) descriptions = [ - details["description"].replace(".", "") for key, details in json_obj.items() if "description" in details + details["description"].replace(".", "") + for key, details in json_obj.items() + if "description" in details ] return descriptions diff --git a/dimos/models/pointcloud/pointcloud_utils.py b/dimos/models/pointcloud/pointcloud_utils.py index 569cdd0577..c0951f44f2 100644 --- a/dimos/models/pointcloud/pointcloud_utils.py +++ b/dimos/models/pointcloud/pointcloud_utils.py @@ -54,7 +54,9 @@ def create_point_cloud_from_rgbd(rgb_image, depth_image, intrinsic_parameters): def canonicalize_point_cloud(pcd, canonicalize_threshold=0.3): # Segment the largest plane, assumed to be the floor - plane_model, inliers = pcd.segment_plane(distance_threshold=0.01, ransac_n=3, num_iterations=1000) + plane_model, inliers = pcd.segment_plane( + distance_threshold=0.01, ransac_n=3, num_iterations=1000 + ) canonicalized = False if len(inliers) / len(pcd.points) > canonicalize_threshold: @@ -82,7 +84,9 @@ def canonicalize_point_cloud(pcd, canonicalize_threshold=0.3): pcd.transform(transformation) # Additional 180-degree rotation around the Z-axis - rotation_z_180 = np.array([[np.cos(np.pi), -np.sin(np.pi), 0], [np.sin(np.pi), np.cos(np.pi), 0], [0, 0, 1]]) + rotation_z_180 = np.array( + [[np.cos(np.pi), -np.sin(np.pi), 0], [np.sin(np.pi), np.cos(np.pi), 0], [0, 0, 1]] + ) pcd.rotate(rotation_z_180, center=(0, 0, 0)) return pcd, canonicalized, transformation diff --git a/dimos/models/qwen/video_query.py b/dimos/models/qwen/video_query.py index 40df45cd91..3471e52268 100644 --- a/dimos/models/qwen/video_query.py +++ b/dimos/models/qwen/video_query.py @@ -14,7 +14,10 @@ def query_single_frame_observable( - video_observable: Observable, query: str, api_key: Optional[str] = None, model_name: str = "qwen2.5-vl-72b-instruct" + video_observable: Observable, + query: str, + api_key: Optional[str] = None, + model_name: str = "qwen2.5-vl-72b-instruct", ) -> Observable: """Process a single frame from a video observable with Qwen model. @@ -38,7 +41,9 @@ def query_single_frame_observable( # Get API key from env if not provided api_key = api_key or os.getenv("ALIBABA_API_KEY") if not api_key: - raise ValueError("Alibaba API key must be provided or set in ALIBABA_API_KEY environment variable") + raise ValueError( + "Alibaba API key must be provided or set in ALIBABA_API_KEY environment variable" + ) # Create Qwen client qwen_client = OpenAI( @@ -107,7 +112,9 @@ def query_single_frame( # Get API key from env if not provided api_key = api_key or os.getenv("ALIBABA_API_KEY") if not api_key: - raise ValueError("Alibaba API key must be provided or set in ALIBABA_API_KEY environment variable") + raise ValueError( + "Alibaba API key must be provided or set in ALIBABA_API_KEY environment variable" + ) # Create Qwen client qwen_client = OpenAI( @@ -151,7 +158,9 @@ def query_single_frame( return response -def get_bbox_from_qwen(video_stream: Observable, object_name: Optional[str] = None) -> Optional[list]: +def get_bbox_from_qwen( + video_stream: Observable, object_name: Optional[str] = None +) -> Optional[list]: """Get bounding box coordinates from Qwen for a specific object or any object. Args: diff --git a/dimos/models/segmentation/clipseg.py b/dimos/models/segmentation/clipseg.py index f401aa4ead..043cd194b0 100644 --- a/dimos/models/segmentation/clipseg.py +++ b/dimos/models/segmentation/clipseg.py @@ -22,7 +22,10 @@ def __init__(self, model_name="CIDAS/clipseg-rd64-refined"): def run_inference(self, image, text_descriptions): inputs = self.clipseg_processor( - text=text_descriptions, images=[image] * len(text_descriptions), padding=True, return_tensors="pt" + text=text_descriptions, + images=[image] * len(text_descriptions), + padding=True, + return_tensors="pt", ) outputs = self.clipseg_model(**inputs) logits = outputs.logits diff --git a/dimos/models/segmentation/sam.py b/dimos/models/segmentation/sam.py index 0b416e11e0..1efb07c484 100644 --- a/dimos/models/segmentation/sam.py +++ b/dimos/models/segmentation/sam.py @@ -23,9 +23,13 @@ def __init__(self, model_name="facebook/sam-vit-huge", device="cuda"): self.sam_processor = SamProcessor.from_pretrained(model_name) def run_inference_from_points(self, image, points): - sam_inputs = self.sam_processor(image, input_points=points, return_tensors="pt").to(self.device) + sam_inputs = self.sam_processor(image, input_points=points, return_tensors="pt").to( + self.device + ) with torch.no_grad(): sam_outputs = self.sam_model(**sam_inputs) return self.sam_processor.image_processor.post_process_masks( - sam_outputs.pred_masks.cpu(), sam_inputs["original_sizes"].cpu(), sam_inputs["reshaped_input_sizes"].cpu() + sam_outputs.pred_masks.cpu(), + sam_inputs["original_sizes"].cpu(), + sam_inputs["reshaped_input_sizes"].cpu(), ) diff --git a/dimos/models/segmentation/segment_utils.py b/dimos/models/segmentation/segment_utils.py index 4238a5d075..9808f5d4e4 100644 --- a/dimos/models/segmentation/segment_utils.py +++ b/dimos/models/segmentation/segment_utils.py @@ -48,7 +48,9 @@ def sample_points_from_heatmap(heatmap, original_size, num_points=5, percentile= attn = torch.sigmoid(heatmap) w = attn.shape[0] - sampled_indices = torch.multinomial(torch.tensor(probabilities.ravel()), num_points, replacement=True) + sampled_indices = torch.multinomial( + torch.tensor(probabilities.ravel()), num_points, replacement=True + ) sampled_coords = np.array(np.unravel_index(sampled_indices, attn.shape)).T medoid, sampled_coords = find_medoid_and_closest_points(sampled_coords) diff --git a/dimos/perception/common/cuboid_fit.py b/dimos/perception/common/cuboid_fit.py index e1ea6251ef..cbfed972af 100644 --- a/dimos/perception/common/cuboid_fit.py +++ b/dimos/perception/common/cuboid_fit.py @@ -25,7 +25,9 @@ def depth_to_point_cloud(depth_image, camera_matrix, subsample_factor=4): # Create pixel coordinate grid rows, cols = depth_image.shape - x_grid, y_grid = np.meshgrid(np.arange(0, cols, subsample_factor), np.arange(0, rows, subsample_factor)) + x_grid, y_grid = np.meshgrid( + np.arange(0, cols, subsample_factor), np.arange(0, rows, subsample_factor) + ) # Flatten grid and depth x = x_grid.flatten() @@ -124,14 +126,21 @@ def fit_cuboid(points, n_iterations=5, inlier_thresh=2.0): dy = np.abs(local_points[:, 1]) - half_dims[1] dz = np.abs(local_points[:, 2]) - half_dims[2] - outside_dist = np.sqrt(np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2) + outside_dist = np.sqrt( + np.maximum(dx, 0) ** 2 + np.maximum(dy, 0) ** 2 + np.maximum(dz, 0) ** 2 + ) inside_dist = np.minimum(np.maximum(np.maximum(dx, dy), dz), 0) distances = outside_dist + inside_dist error = np.mean(distances**2) if error < best_error: best_error = error - best_params = {"center": center, "rotation": rotation, "dimensions": dimensions, "error": error} + best_params = { + "center": center, + "rotation": rotation, + "dimensions": dimensions, + "error": error, + } # Update points for next iteration current_points = current_points[inlier_mask] @@ -180,7 +189,9 @@ def visualize_fit(image, cuboid_params, camera_matrix, R=None, t=None): Draw the fitted cuboid on the image. """ # Get corners in world coordinates - corners = get_cuboid_corners(cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"]) + corners = get_cuboid_corners( + cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] + ) # Transform corners if R and t are provided if R is not None and t is not None: @@ -237,10 +248,14 @@ def plot_3d_fit(points, cuboid_params, title="3D Cuboid Fit"): ax = fig.add_subplot(111, projection="3d") # Plot points - ax.scatter(points[:, 0], points[:, 1], points[:, 2], c="b", marker=".", alpha=0.1, label="Points") + ax.scatter( + points[:, 0], points[:, 1], points[:, 2], c="b", marker=".", alpha=0.1, label="Points" + ) # Plot fitted cuboid - corners = get_cuboid_corners(cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"]) + corners = get_cuboid_corners( + cuboid_params["center"], cuboid_params["dimensions"], cuboid_params["rotation"] + ) # Define edges edges = [ @@ -263,7 +278,12 @@ def plot_3d_fit(points, cuboid_params, title="3D Cuboid Fit"): # Plot edges for i, j in edges: - ax.plot3D([corners[i, 0], corners[j, 0]], [corners[i, 1], corners[j, 1]], [corners[i, 2], corners[j, 2]], "r-") + ax.plot3D( + [corners[i, 0], corners[j, 0]], + [corners[i, 1], corners[j, 1]], + [corners[i, 2], corners[j, 2]], + "r-", + ) # Set labels and title ax.set_xlabel("X") diff --git a/dimos/perception/common/detection2d_tracker.py b/dimos/perception/common/detection2d_tracker.py index 3fc12ede92..90ddbe5c7f 100644 --- a/dimos/perception/common/detection2d_tracker.py +++ b/dimos/perception/common/detection2d_tracker.py @@ -61,7 +61,17 @@ class target2d: detection probabilities, and computed texture values. """ - def __init__(self, initial_mask, initial_bbox, track_id, prob, name, texture_value, target_id, history_size=10): + def __init__( + self, + initial_mask, + initial_bbox, + track_id, + prob, + name, + texture_value, + target_id, + history_size=10, + ): """ Args: initial_mask (torch.Tensor): Latest segmentation mask. @@ -288,7 +298,9 @@ def update(self, frame, masks, bboxes, track_ids, probs, names, texture_values): frame_shape = frame.shape[:2] # (height, width) # For each detection, try to match with an existing target. - for mask, bbox, det_tid, prob, name, texture in zip(masks, bboxes, track_ids, probs, names, texture_values): + for mask, bbox, det_tid, prob, name, texture in zip( + masks, bboxes, track_ids, probs, names, texture_values + ): matched_target = None # First, try matching by detection track ID if valid. @@ -312,7 +324,9 @@ def update(self, frame, masks, bboxes, track_ids, probs, names, texture_values): matched_target.update(mask, bbox, det_tid, prob, name, texture) updated_target_ids.add(matched_target.target_id) else: - new_target = target2d(mask, bbox, det_tid, prob, name, texture, self.next_target_id, self.history_size) + new_target = target2d( + mask, bbox, det_tid, prob, name, texture, self.next_target_id, self.history_size + ) self.targets[self.next_target_id] = new_target updated_target_ids.add(self.next_target_id) self.next_target_id += 1 diff --git a/dimos/perception/common/export_tensorrt.py b/dimos/perception/common/export_tensorrt.py index a42dcfedec..b7b48e0367 100644 --- a/dimos/perception/common/export_tensorrt.py +++ b/dimos/perception/common/export_tensorrt.py @@ -6,12 +6,22 @@ def parse_args(): parser = argparse.ArgumentParser(description="Export YOLO/FastSAM models to different formats") parser.add_argument("--model_path", type=str, required=True, help="Path to the model weights") parser.add_argument( - "--model_type", type=str, choices=["yolo", "fastsam"], required=True, help="Type of model to export" + "--model_type", + type=str, + choices=["yolo", "fastsam"], + required=True, + help="Type of model to export", ) parser.add_argument( - "--precision", type=str, choices=["fp32", "fp16", "int8"], default="fp32", help="Precision for export" + "--precision", + type=str, + choices=["fp32", "fp16", "int8"], + default="fp32", + help="Precision for export", + ) + parser.add_argument( + "--format", type=str, choices=["onnx", "engine"], default="onnx", help="Export format" ) - parser.add_argument("--format", type=str, choices=["onnx", "engine"], default="onnx", help="Export format") return parser.parse_args() diff --git a/dimos/perception/common/ibvs.py b/dimos/perception/common/ibvs.py index 0a84c5a220..2c08752a0c 100644 --- a/dimos/perception/common/ibvs.py +++ b/dimos/perception/common/ibvs.py @@ -24,7 +24,9 @@ def __init__(self, K, camera_pitch, camera_height): # Pitch rotation matrix (positive is upward) theta = -camera_pitch # Negative since positive pitch is negative rotation about robot Y - self.R_pitch = np.array([[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]]) + self.R_pitch = np.array( + [[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]] + ) # Combined transform from camera to robot frame self.A = self.R_pitch @ self.T @@ -129,7 +131,9 @@ def __init__(self, K, camera_pitch, camera_height): # Pitch rotation matrix (positive is upward) theta = -camera_pitch # Negative since positive pitch is negative rotation about robot Y - self.R_pitch = np.array([[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]]) + self.R_pitch = np.array( + [[np.cos(theta), 0, np.sin(theta)], [0, 1, 0], [-np.sin(theta), 0, np.cos(theta)]] + ) # Combined transform from camera to robot frame self.A = self.R_pitch @ self.T diff --git a/dimos/perception/detection2d/detic_2d_det.py b/dimos/perception/detection2d/detic_2d_det.py index b1e1feeed7..bed5700521 100644 --- a/dimos/perception/detection2d/detic_2d_det.py +++ b/dimos/perception/detection2d/detic_2d_det.py @@ -105,7 +105,12 @@ def update(self, detections): # Add to results result.append( - [track_id, detections[best_idx][:4], detections[best_idx][4], int(detections[best_idx][5])] + [ + track_id, + detections[best_idx][:4], + detections[best_idx][4], + int(detections[best_idx][5]), + ] ) # Create new tracks for unmatched detections @@ -117,7 +122,12 @@ def update(self, detections): new_id = self.next_id self.next_id += 1 - self.tracks[new_id] = {"bbox": det[:4], "score": det[4], "class_id": int(det[5]), "age": 0} + self.tracks[new_id] = { + "bbox": det[:4], + "score": det[4], + "class_id": int(det[5]), + "age": 0, + } # Add to results result.append([new_id, det[:4], det[4], int(det[5])]) @@ -158,14 +168,14 @@ def __init__(self, model_path=None, device="cuda", vocabulary=None, threshold=0. # Use default Detic config self.cfg.merge_from_file( - os.path.join(detic_path, "configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml") + os.path.join( + detic_path, "configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml" + ) ) # Set default weights if not provided if model_path is None: - self.cfg.MODEL.WEIGHTS = ( - "https://dl.fbaipublicfiles.com/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth" - ) + self.cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth" else: self.cfg.MODEL.WEIGHTS = model_path @@ -182,11 +192,15 @@ def __init__(self, model_path=None, device="cuda", vocabulary=None, threshold=0. self.builtin_datasets = { "lvis": { "metadata": "lvis_v1_val", - "classifier": os.path.join(detic_path, "datasets/metadata/lvis_v1_clip_a+cname.npy"), + "classifier": os.path.join( + detic_path, "datasets/metadata/lvis_v1_clip_a+cname.npy" + ), }, "objects365": { "metadata": "objects365_v2_val", - "classifier": os.path.join(detic_path, "datasets/metadata/o365_clip_a+cnamefix.npy"), + "classifier": os.path.join( + detic_path, "datasets/metadata/o365_clip_a+cnamefix.npy" + ), }, "openimages": { "metadata": "oid_val_expanded", diff --git a/dimos/perception/detection2d/utils.py b/dimos/perception/detection2d/utils.py index 9698509e91..1eb26db76c 100644 --- a/dimos/perception/detection2d/utils.py +++ b/dimos/perception/detection2d/utils.py @@ -5,7 +5,14 @@ def filter_detections( - bboxes, track_ids, class_ids, confidences, names, class_filter=None, name_filter=None, track_id_filter=None + bboxes, + track_ids, + class_ids, + confidences, + names, + class_filter=None, + name_filter=None, + track_id_filter=None, ): """ Filter detection results based on class IDs, names, and/or tracking IDs. @@ -40,7 +47,9 @@ def filter_detections( filtered_names = [] # Filter detections - for bbox, track_id, class_id, conf, name in zip(bboxes, track_ids, class_ids, confidences, names): + for bbox, track_id, class_id, conf, name in zip( + bboxes, track_ids, class_ids, confidences, names + ): # Check if detection passes all specified filters keep = True @@ -61,7 +70,13 @@ def filter_detections( filtered_confidences.append(conf) filtered_names.append(name) - return (filtered_bboxes, filtered_track_ids, filtered_class_ids, filtered_confidences, filtered_names) + return ( + filtered_bboxes, + filtered_track_ids, + filtered_class_ids, + filtered_confidences, + filtered_names, + ) def extract_detection_results(result, class_filter=None, name_filter=None, track_id_filter=None): @@ -170,7 +185,9 @@ def plot_results(image, bboxes, track_ids, class_ids, confidences, names, alpha= cv2.rectangle(vis_img, (x1, y1 - text_h - 8), (x1 + text_w + 4, y1), color.tolist(), -1) # Draw text with white color for better visibility - cv2.putText(vis_img, label, (x1 + 2, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + cv2.putText( + vis_img, label, (x1 + 2, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1 + ) return vis_img @@ -306,6 +323,10 @@ def calculate_position_rotation_from_bbox(bbox, depth, camera_intrinsics): # We can use the negative of the angle as an estimate of the object's yaw # assuming objects tend to face the camera position = {"x": x, "y": y, "z": 0.0} # z=0 assuming objects are on the ground - rotation = {"roll": 0.0, "pitch": 0.0, "yaw": -angle} # Only yaw is meaningful with monocular camera + rotation = { + "roll": 0.0, + "pitch": 0.0, + "yaw": -angle, + } # Only yaw is meaningful with monocular camera return position, rotation diff --git a/dimos/perception/detection2d/yolo_2d_det.py b/dimos/perception/detection2d/yolo_2d_det.py index f6b76358a8..ad47b58eda 100644 --- a/dimos/perception/detection2d/yolo_2d_det.py +++ b/dimos/perception/detection2d/yolo_2d_det.py @@ -1,6 +1,10 @@ import cv2 from ultralytics import YOLO -from dimos.perception.detection2d.utils import extract_detection_results, plot_results, filter_detections +from dimos.perception.detection2d.utils import ( + extract_detection_results, + plot_results, + filter_detections, +) import os @@ -103,7 +107,9 @@ def main(): # Visualize results if len(bboxes) > 0: - frame = detector.visualize_results(frame, bboxes, track_ids, class_ids, confidences, names) + frame = detector.visualize_results( + frame, bboxes, track_ids, class_ids, confidences, names + ) # Display results cv2.imshow("YOLO Detection", frame) diff --git a/dimos/perception/object_detection_stream.py b/dimos/perception/object_detection_stream.py index e939fed2b0..daa9fa8273 100644 --- a/dimos/perception/object_detection_stream.py +++ b/dimos/perception/object_detection_stream.py @@ -114,7 +114,9 @@ def process_frame(frame): continue # Calculate object position and rotation - position, rotation = calculate_position_rotation_from_bbox(bbox, depth, self.camera_intrinsics) + position, rotation = calculate_position_rotation_from_bbox( + bbox, depth, self.camera_intrinsics + ) # Get object dimensions width, height = calculate_object_size_from_bbox(bbox, depth, self.camera_intrinsics) @@ -124,7 +126,9 @@ def process_frame(frame): if self.transform_to_map: position = Vector([position["x"], position["y"], position["z"]]) rotation = Vector([rotation["roll"], rotation["pitch"], rotation["yaw"]]) - position, rotation = self.transform_to_map(position, rotation, source_frame="base_link") + position, rotation = self.transform_to_map( + position, rotation, source_frame="base_link" + ) position = dict(x=position.x, y=position.y, z=position.z) rotation = dict(roll=rotation.x, pitch=rotation.y, yaw=rotation.z) except Exception as e: @@ -159,13 +163,25 @@ def process_frame(frame): # Draw text background text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0] - cv2.rectangle(viz_frame, (x1, y1 - text_size[1] - 5), (x1 + text_size[0], y1), (0, 0, 0), -1) + cv2.rectangle( + viz_frame, (x1, y1 - text_size[1] - 5), (x1 + text_size[0], y1), (0, 0, 0), -1 + ) # Draw text - cv2.putText(viz_frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2) + cv2.putText( + viz_frame, text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2 + ) # Position text below - cv2.putText(viz_frame, pos_text, (x1, y1 + 15), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2) + cv2.putText( + viz_frame, + pos_text, + (x1, y1 + 15), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 2, + ) return {"frame": frame, "viz_frame": viz_frame, "objects": objects} diff --git a/dimos/perception/object_tracker.py b/dimos/perception/object_tracker.py index 8f1e81c987..8b77434fa9 100644 --- a/dimos/perception/object_tracker.py +++ b/dimos/perception/object_tracker.py @@ -218,7 +218,9 @@ def process_frame(frame): if roi.size > 0: _, self.original_des = self.orb.detectAndCompute(roi, None) if self.original_des is None: - print("Warning: No ORB features found in initial ROI during stream processing.") + print( + "Warning: No ORB features found in initial ROI during stream processing." + ) else: print(f"Initial ORB features extracted: {len(self.original_des)}") @@ -228,7 +230,12 @@ def process_frame(frame): self.tracking_initialized = True tracker_succeeded = True reid_confirmed_this_frame = True # Assume re-id true on init - current_bbox_x1y1x2y2 = [x_init, y_init, x_init + w_init, y_init + h_init] + current_bbox_x1y1x2y2 = [ + x_init, + y_init, + x_init + w_init, + y_init + h_init, + ] print("Tracker initialized successfully.") else: print("Error: Tracker initialization failed in stream.") @@ -286,8 +293,13 @@ def process_frame(frame): if not reid_confirmed_this_frame: dist_text += " (Re-ID Failed - Tolerated)" - if self.distance_estimator is not None and self.distance_estimator.estimated_object_size is not None: - distance, angle = self.distance_estimator.estimate_distance_angle(current_bbox_x1y1x2y2) + if ( + self.distance_estimator is not None + and self.distance_estimator.estimated_object_size is not None + ): + distance, angle = self.distance_estimator.estimate_distance_angle( + current_bbox_x1y1x2y2 + ) if distance is not None: target_data["distance"] = distance target_data["angle"] = angle @@ -298,7 +310,15 @@ def process_frame(frame): text_size = cv2.getTextSize(dist_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)[0] label_bg_y = max(y1 - text_size[1] - 5, 0) cv2.rectangle(viz_frame, (x1, label_bg_y), (x1 + text_size[0], y1), (0, 0, 0), -1) - cv2.putText(viz_frame, dist_text, (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + cv2.putText( + viz_frame, + dist_text, + (x1, y1 - 5), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) elif ( self.tracking_initialized @@ -308,7 +328,11 @@ def process_frame(frame): # else: # Not tracking or initialization failed, do nothing, return empty result # pass - return {"frame": frame, "viz_frame": viz_frame, "targets": [target_data] if target_data else []} + return { + "frame": frame, + "viz_frame": viz_frame, + "targets": [target_data] if target_data else [], + } return video_stream.pipe(ops.map(process_frame)) diff --git a/dimos/perception/person_tracker.py b/dimos/perception/person_tracker.py index 3537f235c7..50302d1382 100644 --- a/dimos/perception/person_tracker.py +++ b/dimos/perception/person_tracker.py @@ -9,7 +9,12 @@ class PersonTrackingStream: def __init__( - self, model_path="yolo11n.pt", device="cuda", camera_intrinsics=None, camera_pitch=0.0, camera_height=1.0 + self, + model_path="yolo11n.pt", + device="cuda", + camera_intrinsics=None, + camera_pitch=0.0, + camera_height=1.0, ): """ Initialize a person tracking stream using Yolo2DDetector and PersonDistanceEstimator. @@ -32,14 +37,19 @@ def __init__( raise ValueError("Camera intrinsics are required for distance estimation") # Validate camera intrinsics format [fx, fy, cx, cy] - if not isinstance(camera_intrinsics, (list, tuple, np.ndarray)) or len(camera_intrinsics) != 4: + if ( + not isinstance(camera_intrinsics, (list, tuple, np.ndarray)) + or len(camera_intrinsics) != 4 + ): raise ValueError("Camera intrinsics must be provided as [fx, fy, cx, cy]") # Convert [fx, fy, cx, cy] to 3x3 camera matrix fx, fy, cx, cy = camera_intrinsics K = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32) - self.distance_estimator = PersonDistanceEstimator(K=K, camera_pitch=camera_pitch, camera_height=camera_height) + self.distance_estimator = PersonDistanceEstimator( + K=K, camera_pitch=camera_pitch, camera_height=camera_height + ) def create_stream(self, video_stream: Observable) -> Observable: """ @@ -57,21 +67,30 @@ def process_frame(frame): bboxes, track_ids, class_ids, confidences, names = self.detector.process_image(frame) # Filter to keep only person detections using filter_detections - filtered_bboxes, filtered_track_ids, filtered_class_ids, filtered_confidences, filtered_names = ( - filter_detections( - bboxes, - track_ids, - class_ids, - confidences, - names, - class_filter=[0], # 0 is the class_id for person - name_filter=["person"], - ) + ( + filtered_bboxes, + filtered_track_ids, + filtered_class_ids, + filtered_confidences, + filtered_names, + ) = filter_detections( + bboxes, + track_ids, + class_ids, + confidences, + names, + class_filter=[0], # 0 is the class_id for person + name_filter=["person"], ) # Create visualization viz_frame = self.detector.visualize_results( - frame, filtered_bboxes, filtered_track_ids, filtered_class_ids, filtered_confidences, filtered_names + frame, + filtered_bboxes, + filtered_track_ids, + filtered_class_ids, + filtered_confidences, + filtered_names, ) # Calculate distance and angle for each person @@ -80,7 +99,9 @@ def process_frame(frame): target_data = { "target_id": filtered_track_ids[i] if i < len(filtered_track_ids) else -1, "bbox": bbox, - "confidence": filtered_confidences[i] if i < len(filtered_confidences) else None, + "confidence": filtered_confidences[i] + if i < len(filtered_confidences) + else None, } distance, angle = self.distance_estimator.estimate_distance_angle(bbox) @@ -94,11 +115,19 @@ def process_frame(frame): # Add black background for better visibility text_size = cv2.getTextSize(dist_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0] # Position at top-right corner - cv2.rectangle(viz_frame, (x2 - text_size[0], y1 - text_size[1] - 5), (x2, y1), (0, 0, 0), -1) + cv2.rectangle( + viz_frame, (x2 - text_size[0], y1 - text_size[1] - 5), (x2, y1), (0, 0, 0), -1 + ) # Draw text in white at top-right cv2.putText( - viz_frame, dist_text, (x2 - text_size[0], y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2 + viz_frame, + dist_text, + (x2 - text_size[0], y1 - 5), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 2, ) targets.append(target_data) diff --git a/dimos/perception/segmentation/image_analyzer.py b/dimos/perception/segmentation/image_analyzer.py index b6bcbe25fa..8e6676116f 100644 --- a/dimos/perception/segmentation/image_analyzer.py +++ b/dimos/perception/segmentation/image_analyzer.py @@ -8,12 +8,14 @@ if does not look like an object, say 'unknown'. Export objects as a list of strings \ in this exact format '['object 1', 'object 2', '...']'." -RICH_PROMPT = "What are in these images? Give a detailed description of each item, the first n images will be \ +RICH_PROMPT = ( + "What are in these images? Give a detailed description of each item, the first n images will be \ cropped patches of the original image detected by the object detection model. \ The last image will be the original image. Use the last image only for context, \ do not describe objects in the last image. \ Export the objects as a list of strings in this exact format, '['description of object 1', '...', '...']', \ don't include anything else. " +) class ImageAnalyzer: @@ -51,7 +53,10 @@ def analyze_images(self, images, detail="auto", prompt_type="normal"): image_data = [ { "type": "image_url", - "image_url": {"url": f"data:image/jpeg;base64,{self.encode_image(img)}", "detail": detail}, + "image_url": { + "url": f"data:image/jpeg;base64,{self.encode_image(img)}", + "detail": detail, + }, } for img in images ] @@ -127,7 +132,9 @@ def main(): y = text_height + 10 # Add white background for text - cv2.rectangle(img, (x - 5, y - text_height - 5), (x + text_width + 5, y + 5), (255, 255, 255), -1) + cv2.rectangle( + img, (x - 5, y - text_height - 5), (x + text_width + 5, y + 5), (255, 255, 255), -1 + ) # Add text cv2.putText(img, text, (x, y), font, font_scale, (0, 0, 0), thickness) diff --git a/dimos/perception/segmentation/sam_2d_seg.py b/dimos/perception/segmentation/sam_2d_seg.py index 7cfaaf7727..97976dd007 100644 --- a/dimos/perception/segmentation/sam_2d_seg.py +++ b/dimos/perception/segmentation/sam_2d_seg.py @@ -75,7 +75,9 @@ def process_image(self, image): if len(results) > 0: # Get initial segmentation results - masks, bboxes, track_ids, probs, names, areas = extract_masks_bboxes_probs_names(results[0]) + masks, bboxes, track_ids, probs, names, areas = extract_masks_bboxes_probs_names( + results[0] + ) # Filter results ( @@ -100,8 +102,8 @@ def process_image(self, image): ) # Get tracked results - tracked_masks, tracked_bboxes, tracked_target_ids, tracked_probs, tracked_names = get_tracked_results( - tracked_targets + tracked_masks, tracked_bboxes, tracked_target_ids, tracked_probs, tracked_names = ( + get_tracked_results(tracked_targets) ) if self.use_analyzer: @@ -111,7 +113,9 @@ def process_image(self, image): # Remove untracked objects from object_names all_target_ids = list(self.tracker.targets.keys()) self.object_names = { - track_id: name for track_id, name in self.object_names.items() if track_id in all_target_ids + track_id: name + for track_id, name in self.object_names.items() + if track_id in all_target_ids } # Remove untracked objects from queue and results @@ -122,18 +126,37 @@ def process_image(self, image): # Filter out any IDs being analyzed from the to_be_analyzed queue if self.current_queue_ids: self.to_be_analyzed = deque( - [tid for tid in self.to_be_analyzed if tid not in self.current_queue_ids] + [ + tid + for tid in self.to_be_analyzed + if tid not in self.current_queue_ids + ] ) # Add new track_ids to analysis queue for track_id in tracked_target_ids: - if track_id not in self.object_names and track_id not in self.to_be_analyzed: + if ( + track_id not in self.object_names + and track_id not in self.to_be_analyzed + ): self.to_be_analyzed.append(track_id) - return tracked_masks, tracked_bboxes, tracked_target_ids, tracked_probs, tracked_names + return ( + tracked_masks, + tracked_bboxes, + tracked_target_ids, + tracked_probs, + tracked_names, + ) else: # Return filtered results directly if tracker is disabled - return filtered_masks, filtered_bboxes, filtered_track_ids, filtered_probs, filtered_names + return ( + filtered_masks, + filtered_bboxes, + filtered_track_ids, + filtered_probs, + filtered_names, + ) return [], [], [], [], [] def check_analysis_status(self, tracked_target_ids): @@ -209,7 +232,8 @@ def get_object_names(self, track_ids, tracked_names): return tracked_names return [ - self.object_names.get(track_id, tracked_name) for track_id, tracked_name in zip(track_ids, tracked_names) + self.object_names.get(track_id, tracked_name) + for track_id, tracked_name in zip(track_ids, tracked_names) ] def visualize_results(self, image, masks, bboxes, track_ids, probs, names): diff --git a/dimos/perception/segmentation/utils.py b/dimos/perception/segmentation/utils.py index 9fb5cb03ac..cdc265303c 100644 --- a/dimos/perception/segmentation/utils.py +++ b/dimos/perception/segmentation/utils.py @@ -169,7 +169,9 @@ def filter_segmentation_results( # Create mask_sum tensor where each pixel stores the index of the mask that claims it mask_sum = torch.zeros_like(masks[0], dtype=torch.int32) - texture_map = torch.from_numpy(texture_map).to(device) # Convert texture_map to tensor and move to device + texture_map = torch.from_numpy(texture_map).to( + device + ) # Convert texture_map to tensor and move to device filtered_texture_values = [] # List to store texture values of filtered masks @@ -181,7 +183,9 @@ def filter_segmentation_results( # Only claim pixels if mask passes texture threshold if texture_value >= texture_threshold: mask_sum[mask > 0] = i - filtered_texture_values.append(texture_value.item()) # Store the texture value as a Python float + filtered_texture_values.append( + texture_value.item() + ) # Store the texture value as a Python float # Get indices that appear in mask_sum (these are the masks we want to keep) keep_indices, counts = torch.unique(mask_sum[mask_sum > 0], return_counts=True) @@ -200,7 +204,14 @@ def filter_segmentation_results( filtered_probs = [probs[i] for i in final_indices] filtered_names = [names[i] for i in final_indices] - return filtered_masks, filtered_bboxes, filtered_track_ids, filtered_probs, filtered_names, filtered_texture_values + return ( + filtered_masks, + filtered_bboxes, + filtered_track_ids, + filtered_probs, + filtered_names, + filtered_texture_values, + ) def plot_results(image, masks, bboxes, track_ids, probs, names, alpha=0.5): diff --git a/dimos/perception/semantic_seg.py b/dimos/perception/semantic_seg.py index 2efdcd69a8..3ef2eb7399 100644 --- a/dimos/perception/semantic_seg.py +++ b/dimos/perception/semantic_seg.py @@ -103,7 +103,9 @@ def process_frame(frame): self.segmenter.run_analysis(frame, bboxes, target_ids) names = self.segmenter.get_object_names(target_ids, names) - viz_frame = self.segmenter.visualize_results(frame, masks, bboxes, target_ids, probs, names) + viz_frame = self.segmenter.visualize_results( + frame, masks, bboxes, target_ids, probs, names + ) # Process depth if enabled depth_viz = None @@ -134,9 +136,23 @@ def process_frame(frame): depth_text = f"{depth:.2f}mm" # Add black background for better visibility text_size = cv2.getTextSize(depth_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 2)[0] - cv2.rectangle(viz_frame, (x1, y2 - text_size[1] - 5), (x1 + text_size[0], y2), (0, 0, 0), -1) + cv2.rectangle( + viz_frame, + (x1, y2 - text_size[1] - 5), + (x1 + text_size[0], y2), + (0, 0, 0), + -1, + ) # Draw text in white - cv2.putText(viz_frame, depth_text, (x1, y2 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2) + cv2.putText( + viz_frame, + depth_text, + (x1, y2 - 5), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 2, + ) # Create metadata in the new requested format objects = [] @@ -181,7 +197,9 @@ def _create_depth_visualization(self, depth_map): # Normalize depth map to 0-255 range for visualization depth_min = np.min(depth_map) depth_max = np.max(depth_map) - depth_normalized = ((depth_map - depth_min) / (depth_max - depth_min) * 255).astype(np.uint8) + depth_normalized = ((depth_map - depth_min) / (depth_max - depth_min) * 255).astype( + np.uint8 + ) # Apply colormap (using JET colormap for better depth perception) depth_colored = cv2.applyColorMap(depth_normalized, cv2.COLORMAP_JET) @@ -193,13 +211,29 @@ def _create_depth_visualization(self, depth_map): # Create gradient for scale bar for i in range(scale_width): - color = cv2.applyColorMap(np.array([[i * 255 // scale_width]], dtype=np.uint8), cv2.COLORMAP_JET) + color = cv2.applyColorMap( + np.array([[i * 255 // scale_width]], dtype=np.uint8), cv2.COLORMAP_JET + ) scale_bar[:, i] = color[0, 0] # Add depth values to scale bar - cv2.putText(scale_bar, f"{depth_min:.1f}mm", (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) cv2.putText( - scale_bar, f"{depth_max:.1f}mm", (scale_width - 60, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1 + scale_bar, + f"{depth_min:.1f}mm", + (5, 20), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + ) + cv2.putText( + scale_bar, + f"{depth_max:.1f}mm", + (scale_width - 60, 20), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, ) # Combine depth map and scale bar diff --git a/dimos/perception/spatial_perception.py b/dimos/perception/spatial_perception.py index 337ba4b61b..6a9ee553b3 100644 --- a/dimos/perception/spatial_perception.py +++ b/dimos/perception/spatial_perception.py @@ -57,9 +57,13 @@ def __init__( new_memory: bool = False, # Whether to create a new memory from scratch output_dir: Optional[str] = None, # Directory for storing visual memory data chroma_client: Any = None, # Optional ChromaDB client for persistence - visual_memory: Optional["VisualMemory"] = None, # Optional VisualMemory instance for storing images + visual_memory: Optional[ + "VisualMemory" + ] = None, # Optional VisualMemory instance for storing images video_stream: Optional[Observable] = None, # Video stream to process - transform_provider: Optional[callable] = None, # Function that returns position and rotation + transform_provider: Optional[ + callable + ] = None, # Function that returns position and rotation ): """ Initialize the spatial perception system. @@ -111,7 +115,9 @@ def __init__( from chromadb.config import Settings import chromadb - self._chroma_client = chromadb.PersistentClient(path=db_path, settings=Settings(anonymized_telemetry=False)) + self._chroma_client = chromadb.PersistentClient( + path=db_path, settings=Settings(anonymized_telemetry=False) + ) # Initialize or load visual memory self._visual_memory = visual_memory @@ -122,7 +128,9 @@ def __init__( else: try: logger.info(f"Loading existing visual memory from {visual_memory_path}...") - self._visual_memory = VisualMemory.load(visual_memory_path, output_dir=output_dir) + self._visual_memory = VisualMemory.load( + visual_memory_path, output_dir=output_dir + ) logger.info(f"Loaded {self._visual_memory.count()} images from previous runs") except Exception as e: logger.error(f"Error loading visual memory: {e}") @@ -130,7 +138,9 @@ def __init__( # Initialize vector database self.vector_db: SpatialVectorDB = SpatialVectorDB( - collection_name=collection_name, chroma_client=self._chroma_client, visual_memory=self._visual_memory + collection_name=collection_name, + chroma_client=self._chroma_client, + visual_memory=self._visual_memory, ) self.embedding_provider: ImageEmbeddingProvider = ImageEmbeddingProvider( @@ -155,7 +165,9 @@ def __init__( if video_stream is not None and transform_provider is not None: self.start_continuous_processing(video_stream, transform_provider) - def query_by_location(self, x: float, y: float, radius: float = 2.0, limit: int = 5) -> List[Dict]: + def query_by_location( + self, x: float, y: float, radius: float = 2.0, limit: int = 5 + ) -> List[Dict]: """ Query the vector database for images near the specified location. @@ -190,7 +202,9 @@ def start_continuous_processing( combined_stream = video_stream.pipe( ops.map(lambda video_frame: {"frame": video_frame, **transform_provider()}), # Filter out bad transforms - ops.filter(lambda data: data.get("position") is not None and data.get("rotation") is not None), + ops.filter( + lambda data: data.get("position") is not None and data.get("rotation") is not None + ), ) # Process with spatial memory @@ -285,7 +299,10 @@ def process_combined_data(data): logger.debug("Position has not moved, skipping frame") return None - if self.last_record_time is not None and (time.time() - self.last_record_time) < self.min_time_threshold: + if ( + self.last_record_time is not None + and (time.time() - self.last_record_time) < self.min_time_threshold + ): logger.debug("Time since last record too short, skipping frame") return None @@ -329,7 +346,9 @@ def process_combined_data(data): "timestamp": current_time, } - return combined_stream.pipe(ops.map(process_combined_data), ops.filter(lambda result: result is not None)) + return combined_stream.pipe( + ops.map(process_combined_data), ops.filter(lambda result: result is not None) + ) def query_by_image(self, image: np.ndarray, limit: int = 5) -> List[Dict]: """ diff --git a/dimos/perception/visual_servoing.py b/dimos/perception/visual_servoing.py index 2c9d6dcc10..40cee7c60c 100644 --- a/dimos/perception/visual_servoing.py +++ b/dimos/perception/visual_servoing.py @@ -155,7 +155,9 @@ def start_tracking( if result is not None: break - logger.warning(f"Attempt {attempt + 1}: No tracking result, retrying in 1 second...") + logger.warning( + f"Attempt {attempt + 1}: No tracking result, retrying in 1 second..." + ) time.sleep(3) # Wait 1 second between attempts if result is None: @@ -480,7 +482,9 @@ def is_goal_reached(self, distance_threshold=0.2, angle_threshold=0.1) -> bool: distance_error = abs(self.current_distance - self.desired_distance) angle_error = abs(self.current_angle) # Desired angle is always 0 (centered) - logger.debug(f"Goal check - Distance error: {distance_error:.2f}m, Angle error: {angle_error:.2f}rad") + logger.debug( + f"Goal check - Distance error: {distance_error:.2f}m, Angle error: {angle_error:.2f}rad" + ) return (distance_error <= distance_threshold) and (angle_error <= angle_threshold) diff --git a/dimos/robot/global_planner/algo.py b/dimos/robot/global_planner/algo.py index ad101b0fca..c710b2027e 100644 --- a/dimos/robot/global_planner/algo.py +++ b/dimos/robot/global_planner/algo.py @@ -53,13 +53,17 @@ def find_nearest_free_cell( # Check if we've reached the maximum search radius if dist > max_search_radius: - print(f"Could not find free cell within {max_search_radius} cells of ({start_x}, {start_y})") + print( + f"Could not find free cell within {max_search_radius} cells of ({start_x}, {start_y})" + ) return (start_x, start_y) # Return original position if no free cell found # Check if this cell is valid and free if 0 <= x < costmap.width and 0 <= y < costmap.height: if costmap.grid[y, x] < cost_threshold: - print(f"Found free cell at ({x}, {y}), {dist} cells away from ({start_x}, {start_y})") + print( + f"Found free cell at ({x}, {y}), {dist} cells away from ({start_x}, {start_y})" + ) return (x, y) # Add neighbors to the queue @@ -169,7 +173,9 @@ def heuristic(x1, y1, x2, y2): return math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2) # Start with the starting node - f_score = g_score[start_tuple] + heuristic(start_tuple[0], start_tuple[1], goal_tuple[0], goal_tuple[1]) + f_score = g_score[start_tuple] + heuristic( + start_tuple[0], start_tuple[1], goal_tuple[0], goal_tuple[1] + ) heapq.heappush(open_set, (f_score, start_tuple)) while open_set: @@ -228,7 +234,11 @@ def heuristic(x1, y1, x2, y2): continue obstacle_proximity_penalty = costmap.grid[neighbor_y, neighbor_x] / 25 - tentative_g_score = g_score[current] + movement_costs[i] + (obstacle_proximity_penalty * movement_costs[i]) + tentative_g_score = ( + g_score[current] + + movement_costs[i] + + (obstacle_proximity_penalty * movement_costs[i]) + ) # Get the current g_score for the neighbor or set to infinity if not yet explored neighbor_g_score = g_score.get(neighbor, float("inf")) @@ -238,7 +248,9 @@ def heuristic(x1, y1, x2, y2): # Update the neighbor's scores and parent parents[neighbor] = current g_score[neighbor] = tentative_g_score - f_score = tentative_g_score + heuristic(neighbor_x, neighbor_y, goal_tuple[0], goal_tuple[1]) + f_score = tentative_g_score + heuristic( + neighbor_x, neighbor_y, goal_tuple[0], goal_tuple[1] + ) # Add the neighbor to the open set with its f_score heapq.heappush(open_set, (f_score, neighbor)) diff --git a/dimos/robot/global_planner/planner.py b/dimos/robot/global_planner/planner.py index 0a59428917..f4b5d17bc6 100644 --- a/dimos/robot/global_planner/planner.py +++ b/dimos/robot/global_planner/planner.py @@ -35,7 +35,10 @@ class Planner(Visualizable): def plan(self, goal: VectorLike) -> Path: ... def set_goal( - self, goal: VectorLike, goal_theta: Optional[float] = None, stop_event: Optional[threading.Event] = None + self, + goal: VectorLike, + goal_theta: Optional[float] = None, + stop_event: Optional[threading.Event] = None, ): path = self.plan(goal) if not path: diff --git a/dimos/robot/local_planner/local_planner.py b/dimos/robot/local_planner/local_planner.py index 3036d43381..d7af8a80d3 100644 --- a/dimos/robot/local_planner/local_planner.py +++ b/dimos/robot/local_planner/local_planner.py @@ -98,7 +98,9 @@ def __init__( # Stuck detection self.stuck_detection_window_seconds = 8.0 # Time window for stuck detection (seconds) self.position_history_size = int(self.stuck_detection_window_seconds * control_frequency) - self.position_history = deque(maxlen=self.position_history_size) # History of recent positions + self.position_history = deque( + maxlen=self.position_history_size + ) # History of recent positions self.stuck_distance_threshold = 0.1 # Distance threshold for stuck detection (meters) self.unstuck_distance_threshold = 0.5 # Distance threshold for unstuck detection (meters) self.stuck_time_threshold = 4.0 # Time threshold for stuck detection (seconds) @@ -127,7 +129,9 @@ def reset(self): logger.info("Local planner state has been reset") - def set_goal(self, goal_xy: VectorLike, frame: str = "odom", goal_theta: Optional[float] = None): + def set_goal( + self, goal_xy: VectorLike, frame: str = "odom", goal_theta: Optional[float] = None + ): """Set a single goal position, converting to odom frame if necessary. This clears any existing waypoints being followed. @@ -147,13 +151,21 @@ def set_goal(self, goal_xy: VectorLike, frame: str = "odom", goal_theta: Optiona target_goal_xy: Optional[Tuple[float, float]] = None - target_goal_xy = self.transform.transform_point(goal_xy, source_frame=frame, target_frame="odom").to_tuple() + target_goal_xy = self.transform.transform_point( + goal_xy, source_frame=frame, target_frame="odom" + ).to_tuple() - logger.info(f"Goal set directly in odom frame: ({target_goal_xy[0]:.2f}, {target_goal_xy[1]:.2f})") + logger.info( + f"Goal set directly in odom frame: ({target_goal_xy[0]:.2f}, {target_goal_xy[1]:.2f})" + ) # Check if goal is valid (in bounds and not colliding) - if not self.is_goal_in_costmap_bounds(target_goal_xy) or self.check_goal_collision(target_goal_xy): - logger.warning("Goal is in collision or out of bounds. Adjusting goal to valid position.") + if not self.is_goal_in_costmap_bounds(target_goal_xy) or self.check_goal_collision( + target_goal_xy + ): + logger.warning( + "Goal is in collision or out of bounds. Adjusting goal to valid position." + ) self.goal_xy = self.adjust_goal_to_valid_position(target_goal_xy) else: self.goal_xy = target_goal_xy # Set the adjusted or original valid goal @@ -165,7 +177,9 @@ def set_goal(self, goal_xy: VectorLike, frame: str = "odom", goal_theta: Optiona ) self.goal_theta = transformed_rot[2] - def set_goal_waypoints(self, waypoints: Path, frame: str = "map", goal_theta: Optional[float] = None): + def set_goal_waypoints( + self, waypoints: Path, frame: str = "map", goal_theta: Optional[float] = None + ): """Sets a path of waypoints for the robot to follow. Args: @@ -191,11 +205,15 @@ def set_goal_waypoints(self, waypoints: Path, frame: str = "map", goal_theta: Op self.current_waypoint_index = 0 # Transform waypoints to odom frame - self.waypoints_in_odom = self.transform.transform_path(self.waypoints, source_frame=frame, target_frame="odom") + self.waypoints_in_odom = self.transform.transform_path( + self.waypoints, source_frame=frame, target_frame="odom" + ) # Set the initial target to the first waypoint, adjusting if necessary first_waypoint = self.waypoints_in_odom[0] - if not self.is_goal_in_costmap_bounds(first_waypoint) or self.check_goal_collision(first_waypoint): + if not self.is_goal_in_costmap_bounds(first_waypoint) or self.check_goal_collision( + first_waypoint + ): logger.warning("First waypoint is invalid. Adjusting...") self.goal_xy = self.adjust_goal_to_valid_position(first_waypoint) else: @@ -244,7 +262,9 @@ def _distance_to_position(self, target_position: Tuple[float, float]) -> float: Distance in meters """ robot_pos, _ = self._get_robot_pose() - return np.linalg.norm([target_position[0] - robot_pos[0], target_position[1] - robot_pos[1]]) + return np.linalg.norm( + [target_position[0] - robot_pos[0], target_position[1] - robot_pos[1]] + ) def plan(self) -> Dict[str, float]: """ @@ -256,7 +276,11 @@ def plan(self) -> Dict[str, float]: Dict[str, float]: Velocity commands with 'x_vel' and 'angular_vel' keys """ # If goal orientation is specified, rotate to match it - if self.position_reached and self.goal_theta is not None and not self._is_goal_orientation_reached(): + if ( + self.position_reached + and self.goal_theta is not None + and not self._is_goal_orientation_reached() + ): logger.info("Position goal reached. Rotating to target orientation.") return self._rotate_to_goal_orientation() @@ -400,7 +424,9 @@ def _is_goal_orientation_reached(self) -> bool: # Calculate the angle difference and normalize angle_diff = abs(normalize_angle(self.goal_theta - robot_theta)) - logger.debug(f"Orientation error: {angle_diff:.4f} rad, tolerance: {self.angle_tolerance:.4f} rad") + logger.debug( + f"Orientation error: {angle_diff:.4f} rad, tolerance: {self.angle_tolerance:.4f} rad" + ) return angle_diff <= self.angle_tolerance def _update_waypoint_target(self, robot_pos_np: np.ndarray) -> bool: @@ -453,7 +479,9 @@ def _update_waypoint_target(self, robot_pos_np: np.ndarray) -> bool: self.current_waypoint_index = len(self.waypoints_in_odom) - 1 # Set the lookahead point as the immediate target, adjusting if needed - if not self.is_goal_in_costmap_bounds(lookahead_point) or self.check_goal_collision(lookahead_point): + if not self.is_goal_in_costmap_bounds(lookahead_point) or self.check_goal_collision( + lookahead_point + ): logger.debug("Lookahead point is invalid. Adjusting...") adjusted_lookahead = self.adjust_goal_to_valid_position(lookahead_point) # Only update if adjustment didn't fail completely @@ -579,7 +607,9 @@ def is_goal_in_costmap_bounds(self, goal_xy: VectorLike) -> bool: return is_in_bounds - def adjust_goal_to_valid_position(self, goal_xy: VectorLike, clearance: float = 0.5) -> Tuple[float, float]: + def adjust_goal_to_valid_position( + self, goal_xy: VectorLike, clearance: float = 0.5 + ) -> Tuple[float, float]: """Find a valid (non-colliding) goal position by moving it towards the robot. Args: @@ -639,9 +669,9 @@ def adjust_goal_to_valid_position(self, goal_xy: VectorLike, clearance: float = break # Check if this position is valid - if not self.check_goal_collision((current_x, current_y)) and self.is_goal_in_costmap_bounds( + if not self.check_goal_collision( (current_x, current_y) - ): + ) and self.is_goal_in_costmap_bounds((current_x, current_y)): # Store the first valid position if not valid_found: valid_found = True @@ -657,13 +687,17 @@ def adjust_goal_to_valid_position(self, goal_xy: VectorLike, clearance: float = clearance_x = current_x + dx * clearance clearance_y = current_y + dy * clearance - logger.info(f"Checking clearance position at ({clearance_x:.2f}, {clearance_y:.2f})") + logger.info( + f"Checking clearance position at ({clearance_x:.2f}, {clearance_y:.2f})" + ) # Check if the clearance position is also valid - if not self.check_goal_collision((clearance_x, clearance_y)) and self.is_goal_in_costmap_bounds( + if not self.check_goal_collision( (clearance_x, clearance_y) - ): - logger.info(f"Found valid goal with clearance at ({clearance_x:.2f}, {clearance_y:.2f})") + ) and self.is_goal_in_costmap_bounds((clearance_x, clearance_y)): + logger.info( + f"Found valid goal with clearance at ({clearance_x:.2f}, {clearance_y:.2f})" + ) return (clearance_x, clearance_y) # Return the valid position without clearance @@ -675,7 +709,9 @@ def adjust_goal_to_valid_position(self, goal_xy: VectorLike, clearance: float = logger.info(f"Using valid goal found at ({valid_x:.2f}, {valid_y:.2f})") return (valid_x, valid_y) - logger.warning(f"Could not find valid goal after {steps} steps, using closest point to robot") + logger.warning( + f"Could not find valid goal after {steps} steps, using closest point to robot" + ) return (current_x, current_y) def check_if_stuck(self) -> bool: @@ -735,7 +771,9 @@ def check_if_stuck(self) -> bool: ) if is_currently_stuck: - logger.warning(f"Robot appears to be stuck! Displacement {displacement:.3f}m over {time_range:.1f}s") + logger.warning( + f"Robot appears to be stuck! Displacement {displacement:.3f}m over {time_range:.1f}s" + ) # Don't trigger recovery if it's already active if not self.is_recovery_active: @@ -745,7 +783,9 @@ def check_if_stuck(self) -> bool: # Check if we've been trying to recover for too long elif current_time - self.recovery_start_time > self.recovery_duration: - logger.error(f"Recovery behavior has been active for {self.recovery_duration}s without success") + logger.error( + f"Recovery behavior has been active for {self.recovery_duration}s without success" + ) # Reset recovery state - maybe a different behavior will work self.is_recovery_active = False self.recovery_start_time = current_time @@ -814,7 +854,9 @@ def navigate_to_goal_local( Returns: bool: True if the goal was reached within the timeout, False otherwise. """ - logger.info(f"Starting navigation to local goal {goal_xy_robot} with distance {distance}m and timeout {timeout}s.") + logger.info( + f"Starting navigation to local goal {goal_xy_robot} with distance {distance}m and timeout {timeout}s." + ) goal_x, goal_y = goal_xy_robot @@ -902,7 +944,9 @@ def navigate_path_local( Returns: bool: True if the entire path was successfully followed, False otherwise """ - logger.info(f"Starting navigation along path with {len(path)} waypoints and timeout {timeout}s.") + logger.info( + f"Starting navigation along path with {len(path)} waypoints and timeout {timeout}s." + ) # Set the path in the local planner robot.local_planner.set_goal_waypoints(path, goal_theta=goal_theta) @@ -939,7 +983,9 @@ def navigate_path_local( time.sleep(control_period) if not path_completed: - logger.warning(f"Path following timed out after {timeout} seconds before completing the path.") + logger.warning( + f"Path following timed out after {timeout} seconds before completing the path." + ) except KeyboardInterrupt: logger.info("Path navigation interrupted by user.") @@ -1010,8 +1056,12 @@ def visualize_local_planner_state( half_size_cells = int(map_size_meters / grid_resolution / 2) # Draw grid cells (using standard occupancy coloring) - for y in range(max(0, robot_cell_y - half_size_cells), min(grid_height, robot_cell_y + half_size_cells)): - for x in range(max(0, robot_cell_x - half_size_cells), min(grid_width, robot_cell_x + half_size_cells)): + for y in range( + max(0, robot_cell_y - half_size_cells), min(grid_height, robot_cell_y + half_size_cells) + ): + for x in range( + max(0, robot_cell_x - half_size_cells), min(grid_width, robot_cell_x + half_size_cells) + ): cell_rel_x_meters = (x - robot_cell_x) * grid_resolution cell_rel_y_meters = (y - robot_cell_y) * grid_resolution @@ -1064,7 +1114,9 @@ def visualize_local_planner_state( # Connect waypoints with lines to show the path if len(path_points) > 1: for i in range(len(path_points) - 1): - cv2.line(vis_img, path_points[i], path_points[i + 1], (0, 200, 0), 1) # Green line + cv2.line( + vis_img, path_points[i], path_points[i + 1], (0, 200, 0), 1 + ) # Green line except Exception as e: logger.error(f"Error drawing waypoints: {e}") @@ -1121,12 +1173,17 @@ def visualize_local_planner_state( dtype=np.float32, ) rotation_matrix = np.array( - [[math.cos(robot_theta), -math.sin(robot_theta)], [math.sin(robot_theta), math.cos(robot_theta)]] + [ + [math.cos(robot_theta), -math.sin(robot_theta)], + [math.sin(robot_theta), math.cos(robot_theta)], + ] ) robot_pts = np.dot(robot_pts, rotation_matrix.T) robot_pts[:, 0] += center_x robot_pts[:, 1] = center_y - robot_pts[:, 1] # Flip y-axis - cv2.fillPoly(vis_img, [robot_pts.reshape((-1, 1, 2)).astype(np.int32)], (0, 0, 255)) # Red robot + cv2.fillPoly( + vis_img, [robot_pts.reshape((-1, 1, 2)).astype(np.int32)], (0, 0, 255) + ) # Red robot # Draw robot direction line front_x = int(center_x + (robot_length_px / 2) * math.cos(robot_theta)) @@ -1145,7 +1202,9 @@ def visualize_local_planner_state( sel_end_x = int(center_x + sel_dir_line_length * math.cos(vis_angle_selected)) sel_end_y = int(center_y - sel_dir_line_length * math.sin(vis_angle_selected)) # Flipped Y - cv2.line(vis_img, (center_x, center_y), (sel_end_x, sel_end_y), (0, 165, 255), 2) # BGR for Orange + cv2.line( + vis_img, (center_x, center_y), (sel_end_x, sel_end_y), (0, 165, 255), 2 + ) # BGR for Orange # Draw goal if goal_xy is not None: @@ -1183,15 +1242,27 @@ def visualize_local_planner_state( # Draw goal orientation arrow if 0 <= goal_img_x < vis_size and 0 <= goal_img_y < vis_size: cv2.arrowedLine( - vis_img, (goal_img_x, goal_img_y), (goal_dir_end_x, goal_dir_end_y), (255, 0, 255), 4 + vis_img, + (goal_img_x, goal_img_y), + (goal_dir_end_x, goal_dir_end_y), + (255, 0, 255), + 4, ) # Magenta arrow # Add scale bar scale_bar_length_px = int(1.0 * scale) scale_bar_x = vis_size - scale_bar_length_px - 10 scale_bar_y = vis_size - 20 - cv2.line(vis_img, (scale_bar_x, scale_bar_y), (scale_bar_x + scale_bar_length_px, scale_bar_y), (0, 0, 0), 2) - cv2.putText(vis_img, "1m", (scale_bar_x, scale_bar_y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1) + cv2.line( + vis_img, + (scale_bar_x, scale_bar_y), + (scale_bar_x + scale_bar_length_px, scale_bar_y), + (0, 0, 0), + 2, + ) + cv2.putText( + vis_img, "1m", (scale_bar_x, scale_bar_y - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 0), 1 + ) # Add status info status_text = [] diff --git a/dimos/robot/local_planner/vfh_local_planner.py b/dimos/robot/local_planner/vfh_local_planner.py index 18c9bda1e8..d133c3ade8 100644 --- a/dimos/robot/local_planner/vfh_local_planner.py +++ b/dimos/robot/local_planner/vfh_local_planner.py @@ -145,7 +145,9 @@ def _compute_velocity_commands(self) -> Dict[str, float]: linear_vel *= turn_factor # Apply Collision Avoidance Stop - skip if ignoring obstacles - if not self.ignore_obstacles and self.check_collision(self.selected_direction, safety_threshold=0.5): + if not self.ignore_obstacles and self.check_collision( + self.selected_direction, safety_threshold=0.5 + ): # Re-select direction prioritizing obstacle avoidance if colliding self.selected_direction = self.select_direction( self.goal_weight * 0.2, @@ -154,7 +156,9 @@ def _compute_velocity_commands(self) -> Dict[str, float]: self.histogram, goal_direction, ) - linear_vel, angular_vel = self.compute_pure_pursuit(goal_distance, self.selected_direction) + linear_vel, angular_vel = self.compute_pure_pursuit( + goal_distance, self.selected_direction + ) if self.check_collision(0.0, safety_threshold=self.safety_threshold): logger.warning("Collision detected ahead. Stopping.") @@ -242,7 +246,9 @@ def build_polar_histogram(self, costmap: Costmap, robot_pose: Tuple[float, float angles_robot = normalize_angle(angles_grid - robot_theta) # Convert to bin indices - bin_indices = ((angles_robot + np.pi) / (2 * np.pi) * self.histogram_bins).astype(int) % self.histogram_bins + bin_indices = ((angles_robot + np.pi) / (2 * np.pi) * self.histogram_bins).astype( + int + ) % self.histogram_bins # Get obstacle values obstacle_values = occupancy_grid[y_indices, x_indices] / 100.0 @@ -256,7 +262,9 @@ def build_polar_histogram(self, costmap: Costmap, robot_pose: Tuple[float, float # Apply the enhanced smoothing return self._smooth_histogram(histogram) - def select_direction(self, goal_weight, obstacle_weight, prev_direction_weight, histogram, goal_direction): + def select_direction( + self, goal_weight, obstacle_weight, prev_direction_weight, histogram, goal_direction + ): """ Select best direction based on a simple weighted cost function. @@ -294,7 +302,9 @@ def select_direction(self, goal_weight, obstacle_weight, prev_direction_weight, return selected_angle - def compute_pure_pursuit(self, goal_distance: float, goal_direction: float) -> Tuple[float, float]: + def compute_pure_pursuit( + self, goal_distance: float, goal_direction: float + ) -> Tuple[float, float]: """Compute pure pursuit velocities.""" if goal_distance < self.goal_tolerance: return 0.0, 0.0 @@ -365,7 +375,9 @@ def update_visualization(self) -> np.ndarray: # Get waypoint data if in waypoint mode waypoints_to_draw = self.waypoints_in_odom - current_wp_index_to_draw = self.current_waypoint_index if self.waypoints_in_odom is not None else None + current_wp_index_to_draw = ( + self.current_waypoint_index if self.waypoints_in_odom is not None else None + ) # Ensure index is valid before passing if waypoints_to_draw is not None and current_wp_index_to_draw is not None: if not (0 <= current_wp_index_to_draw < len(waypoints_to_draw)): @@ -389,7 +401,9 @@ def update_visualization(self) -> np.ndarray: except Exception as e: logger.error(f"Error during visualization update: {e}") # Return a blank image with error text - blank = np.ones((self.visualization_size, self.visualization_size, 3), dtype=np.uint8) * 255 + blank = ( + np.ones((self.visualization_size, self.visualization_size, 3), dtype=np.uint8) * 255 + ) cv2.putText( blank, "Viz Error", diff --git a/dimos/robot/position_stream.py b/dimos/robot/position_stream.py index 68780da0f3..05d80b8bcf 100644 --- a/dimos/robot/position_stream.py +++ b/dimos/robot/position_stream.py @@ -41,7 +41,11 @@ class PositionStreamProvider: """ def __init__( - self, ros_node: Node, odometry_topic: str = "/odom", pose_topic: Optional[str] = None, use_odometry: bool = True + self, + ros_node: Node, + odometry_topic: str = "/odom", + pose_topic: Optional[str] = None, + use_odometry: bool = True, ): """ Initialize the position stream provider. @@ -81,7 +85,9 @@ def _create_subscription(self): if not self.pose_topic: raise ValueError("Pose topic must be specified when use_odometry is False") - self.subscription = self.ros_node.create_subscription(PoseStamped, self.pose_topic, self._pose_callback, 10) + self.subscription = self.ros_node.create_subscription( + PoseStamped, self.pose_topic, self._pose_callback, 10 + ) logger.info(f"Subscribed to pose topic: {self.pose_topic}") def _odometry_callback(self, msg: Odometry): diff --git a/dimos/robot/recorder.py b/dimos/robot/recorder.py index eb98e16d52..56b6cea888 100644 --- a/dimos/robot/recorder.py +++ b/dimos/robot/recorder.py @@ -125,7 +125,9 @@ def _process_queue(self) -> None: """Processes the recording queue asynchronously.""" while True: image, instruction, action, state = self.recording_queue.get() - self.recorder.record(observation={"image": image, "instruction": instruction}, action=action, state=state) + self.recorder.record( + observation={"image": image, "instruction": instruction}, action=action, state=state + ) self.recording_queue.task_done() def record_current_state(self) -> None: diff --git a/dimos/robot/robot.py b/dimos/robot/robot.py index e69734bbd2..aaf4b57083 100644 --- a/dimos/robot/robot.py +++ b/dimos/robot/robot.py @@ -95,7 +95,9 @@ def __init__( logger.info(f"Robot outputs will be saved to: {self.output_dir}") # Initialize spatial memory properties - self.spatial_memory_dir = spatial_memory_dir or os.path.join(self.output_dir, "spatial_memory") + self.spatial_memory_dir = spatial_memory_dir or os.path.join( + self.output_dir, "spatial_memory" + ) self.spatial_memory_collection = spatial_memory_collection self.db_path = os.path.join(self.spatial_memory_dir, "chromadb_data") self.visual_memory_path = os.path.join(self.spatial_memory_dir, "visual_memory.pkl") diff --git a/dimos/robot/ros_command_queue.py b/dimos/robot/ros_command_queue.py index bfd8ef3ba5..fc48ce5cde 100644 --- a/dimos/robot/ros_command_queue.py +++ b/dimos/robot/ros_command_queue.py @@ -110,7 +110,9 @@ def __init__( self._failure_count = 0 self._command_history = [] - self._max_queue_wait_time = 30.0 # Maximum time to wait for robot to be ready before forcing + self._max_queue_wait_time = ( + 30.0 # Maximum time to wait for robot to be ready before forcing + ) logger.info("ROSCommandQueue initialized") @@ -203,7 +205,9 @@ def execute_webrtc(): # Wait for the robot to complete the command (timeout check) while self._is_busy_func() and (time.time() - start_time) < timeout: - if self._debug and (time.time() - start_time) % 5 < 0.1: # Print every ~5 seconds + if ( + self._debug and (time.time() - start_time) % 5 < 0.1 + ): # Print every ~5 seconds logger.debug( f"[WebRTC Queue] Still waiting on API ID {api_id} - elapsed: {time.time() - start_time:.1f}s" ) @@ -216,7 +220,9 @@ def execute_webrtc(): wait_time = time.time() - start_time if self._debug: - logger.debug(f"[WebRTC Queue] Request API ID {api_id} completed after {wait_time:.1f}s") + logger.debug( + f"[WebRTC Queue] Request API ID {api_id} completed after {wait_time:.1f}s" + ) logger.info(f"WebRTC request completed: {api_id} (ID: {request_id})") return True @@ -248,7 +254,12 @@ def execute_webrtc(): return request_id def queue_action_client_request( - self, action_name: str, execute_func: Callable, priority: int = 0, timeout: float = 30.0, **kwargs + self, + action_name: str, + execute_func: Callable, + priority: int = 0, + timeout: float = 30.0, + **kwargs, ) -> str: """ Queue any action client request or function @@ -308,7 +319,9 @@ def _process_queue(self): # Track robot state changes if is_ready != self._last_ready_state: - logger.debug(f"Robot ready state changed: {self._last_ready_state} -> {is_ready}") + logger.debug( + f"Robot ready state changed: {self._last_ready_state} -> {is_ready}" + ) self._last_ready_state = is_ready if is_busy != self._last_busy_state: @@ -371,7 +384,9 @@ def _process_queue(self): self._success_count += 1 logger.info(f"Command succeeded: {cmd_info}") if self._debug: - logger.debug(f"[WebRTC Queue] Command {command.id} marked as COMPLETED") + logger.debug( + f"[WebRTC Queue] Command {command.id} marked as COMPLETED" + ) else: self._failure_count += 1 logger.warning(f"Command failed: {cmd_info}") @@ -398,7 +413,9 @@ def _process_queue(self): # Mark the command as complete self._current_command = None if self._debug: - logger.debug("[WebRTC Queue] Adding 0.5s stabilization delay before next command") + logger.debug( + "[WebRTC Queue] Adding 0.5s stabilization delay before next command" + ) time.sleep(0.5) except Empty: diff --git a/dimos/robot/ros_control.py b/dimos/robot/ros_control.py index 6e1fb4b50d..454f41c2b6 100644 --- a/dimos/robot/ros_control.py +++ b/dimos/robot/ros_control.py @@ -177,8 +177,12 @@ def __init__( topic = camera_config["topic"] msg_type = camera_config["type"] - logger.info(f"Subscribing to {topic} with BEST_EFFORT QoS using message type {msg_type.__name__}") - _camera_subscription = self._node.create_subscription(msg_type, topic, self._image_callback, sensor_qos) + logger.info( + f"Subscribing to {topic} with BEST_EFFORT QoS using message type {msg_type.__name__}" + ) + _camera_subscription = self._node.create_subscription( + msg_type, topic, self._image_callback, sensor_qos + ) self._subscriptions.append(_camera_subscription) # Subscribe to state topic if provided @@ -192,7 +196,9 @@ def __init__( ) self._subscriptions.append(self._state_sub) else: - logger.warning("No state topic andor message type provided - robot state tracking will be unavailable") + logger.warning( + "No state topic andor message type provided - robot state tracking will be unavailable" + ) if self._imu_topic and self._imu_msg_type: self._imu_sub = self._node.create_subscription( @@ -200,13 +206,19 @@ def __init__( ) self._subscriptions.append(self._imu_sub) else: - logger.warning("No IMU topic and/or message type provided - IMU data tracking will be unavailable") + logger.warning( + "No IMU topic and/or message type provided - IMU data tracking will be unavailable" + ) if self._odom_topic: - self._odom_sub = self._node.create_subscription(Odometry, self._odom_topic, self._odom_callback, sensor_qos) + self._odom_sub = self._node.create_subscription( + Odometry, self._odom_topic, self._odom_callback, sensor_qos + ) self._subscriptions.append(self._odom_sub) else: - logger.warning("No odometry topic provided - odometry data tracking will be unavailable") + logger.warning( + "No odometry topic provided - odometry data tracking will be unavailable" + ) if self._costmap_topic: self._costmap_sub = self._node.create_subscription( @@ -228,7 +240,9 @@ def __init__( self._pose_pub = self._node.create_publisher(Vector3, pose_topic, command_qos) if webrtc_msg_type: - self._webrtc_pub = self._node.create_publisher(webrtc_msg_type, webrtc_topic, qos_profile=command_qos) + self._webrtc_pub = self._node.create_publisher( + webrtc_msg_type, webrtc_topic, qos_profile=command_qos + ) # Initialize command queue self._command_queue = ROSCommandQueue( @@ -261,7 +275,9 @@ def get_global_costmap(self) -> Optional[OccupancyGrid]: Optional[OccupancyGrid]: Current global_costmap data or None if not available """ if not self._global_costmap_topic: - logger.warning("No global_costmap topic provided - global_costmap data tracking will be unavailable") + logger.warning( + "No global_costmap topic provided - global_costmap data tracking will be unavailable" + ) return None if self._global_costmap_data: @@ -357,7 +373,9 @@ def get_odometry(self) -> Optional[Odometry]: Optional[Odometry]: Current odometry data or None if not available """ if not self._odom_topic: - logger.warning("No odometry topic provided - odometry data tracking will be unavailable") + logger.warning( + "No odometry topic provided - odometry data tracking will be unavailable" + ) return None return self._odom_data @@ -426,7 +444,9 @@ def _send_action_client_goal(self, client, goal_msg, description=None, time_allo time.sleep(0.1) elapsed = time.time() - start_time - print(f"[ROSControl] Action completed in {elapsed:.2f}s with result: {self._action_success}") + print( + f"[ROSControl] Action completed in {elapsed:.2f}s with result: {self._action_success}" + ) # Check result if self._action_success is None: @@ -526,7 +546,9 @@ def execute_reverse(): goal.speed = speed # BackUp expects positive speed goal.time_allowance = Duration(sec=time_allowance) - print(f"[ROSControl] execute_reverse: Creating BackUp goal with distance={distance}m, speed={speed}m/s") + print( + f"[ROSControl] execute_reverse: Creating BackUp goal with distance={distance}m, speed={speed}m/s" + ) print( f"[ROSControl] execute_reverse: Goal details: x={goal.target.x}, y={goal.target.y}, z={goal.target.z}, speed={goal.speed}" ) @@ -552,7 +574,9 @@ def execute_reverse(): distance=distance, speed=speed, ) - logger.info(f"Queued reverse command: {cmd_id} - Distance: {distance}m, Speed: {speed}m/s") + logger.info( + f"Queued reverse command: {cmd_id} - Distance: {distance}m, Speed: {speed}m/s" + ) return True except Exception as e: diff --git a/dimos/robot/ros_observable_topic.py b/dimos/robot/ros_observable_topic.py index 8826a894f4..697ddff398 100644 --- a/dimos/robot/ros_observable_topic.py +++ b/dimos/robot/ros_observable_topic.py @@ -116,7 +116,10 @@ def topic( # upstream ROS callback def _on_subscribe(obs, _): ros_sub = self._node.create_subscription( - self._sub_msg_type(msg_type), topic_name, self._maybe_conversion(msg_type, obs.on_next), qos_profile + self._sub_msg_type(msg_type), + topic_name, + self._maybe_conversion(msg_type, obs.on_next), + qos_profile, ) return Disposable(lambda: self._node.destroy_subscription(ros_sub)) @@ -158,7 +161,9 @@ def _subscribe(observer, sch=None): # odom.dispose() # clean up the subscription # # see test_ros_observable_topic.py test_topic_latest for more details - def topic_latest(self, topic_name: str, msg_type: TopicType, timeout: float | None = 100.0, qos=QOS.SENSOR): + def topic_latest( + self, topic_name: str, msg_type: TopicType, timeout: float | None = 100.0, qos=QOS.SENSOR + ): """ Blocks the current thread until the first message is received, then returns `reader()` (sync) and keeps one ROS subscription alive @@ -173,7 +178,9 @@ def topic_latest(self, topic_name: str, msg_type: TopicType, timeout: float | No conn = core.connect() # starts the ROS subscription immediately try: - first_val = core.pipe(ops.first(), *([ops.timeout(timeout)] if timeout is not None else [])).run() + first_val = core.pipe( + ops.first(), *([ops.timeout(timeout)] if timeout is not None else []) + ).run() except Exception: conn.dispose() msg = f"{topic_name} message not received after {timeout} seconds. Is robot connected?" @@ -204,7 +211,9 @@ def reader(): # odom.dispose() # clean up the subscription # # see test_ros_observable_topic.py test_topic_latest for more details - async def topic_latest_async(self, topic_name: str, msg_type: TopicType, qos=QOS.SENSOR, timeout: float = 30.0): + async def topic_latest_async( + self, topic_name: str, msg_type: TopicType, qos=QOS.SENSOR, timeout: float = 30.0 + ): loop = asyncio.get_running_loop() first = loop.create_future() cache = {"val": None} diff --git a/dimos/robot/ros_transform.py b/dimos/robot/ros_transform.py index d60117442e..b0c46fd275 100644 --- a/dimos/robot/ros_transform.py +++ b/dimos/robot/ros_transform.py @@ -54,10 +54,14 @@ def tf_buffer(self) -> Buffer: return self._tf_buffer - def transform_euler_pos(self, source_frame: str, target_frame: str = "map", timeout: float = 1.0): + def transform_euler_pos( + self, source_frame: str, target_frame: str = "map", timeout: float = 1.0 + ): return to_euler_pos(self.transform(source_frame, target_frame, timeout)) - def transform_euler_rot(self, source_frame: str, target_frame: str = "map", timeout: float = 1.0): + def transform_euler_rot( + self, source_frame: str, target_frame: str = "map", timeout: float = 1.0 + ): return to_euler_rot(self.transform(source_frame, target_frame, timeout)) def transform_euler(self, source_frame: str, target_frame: str = "map", timeout: float = 1.0): @@ -83,7 +87,9 @@ def transform( logger.error(f"Transform lookup failed: {e}") return None - def transform_point(self, point: Vector, source_frame: str, target_frame: str = "map", timeout: float = 1.0): + def transform_point( + self, point: Vector, source_frame: str, target_frame: str = "map", timeout: float = 1.0 + ): """Transform a point from source_frame to target_frame. Args: @@ -98,7 +104,10 @@ def transform_point(self, point: Vector, source_frame: str, target_frame: str = try: # Wait for transform to become available self.tf_buffer.can_transform( - target_frame, source_frame, rclpy.time.Time(), rclpy.duration.Duration(seconds=timeout) + target_frame, + source_frame, + rclpy.time.Time(), + rclpy.duration.Duration(seconds=timeout), ) # Create a PointStamped message @@ -110,18 +119,28 @@ def transform_point(self, point: Vector, source_frame: str, target_frame: str = ps.point.z = point[2] if len(point) > 2 else 0.0 # Transform point - transformed_ps = self.tf_buffer.transform(ps, target_frame, rclpy.duration.Duration(seconds=timeout)) + transformed_ps = self.tf_buffer.transform( + ps, target_frame, rclpy.duration.Duration(seconds=timeout) + ) # Return as Vector type if len(point) > 2: - return Vector(transformed_ps.point.x, transformed_ps.point.y, transformed_ps.point.z) + return Vector( + transformed_ps.point.x, transformed_ps.point.y, transformed_ps.point.z + ) else: return Vector(transformed_ps.point.x, transformed_ps.point.y) - except (tf2_ros.LookupException, tf2_ros.ConnectivityException, tf2_ros.ExtrapolationException) as e: + except ( + tf2_ros.LookupException, + tf2_ros.ConnectivityException, + tf2_ros.ExtrapolationException, + ) as e: logger.error(f"Transform from {source_frame} to {target_frame} failed: {e}") return None - def transform_path(self, path: Path, source_frame: str, target_frame: str = "map", timeout: float = 1.0): + def transform_path( + self, path: Path, source_frame: str, target_frame: str = "map", timeout: float = 1.0 + ): """Transform a path from source_frame to target_frame. Args: @@ -140,7 +159,9 @@ def transform_path(self, path: Path, source_frame: str, target_frame: str = "map transformed_path.append(transformed_point) return transformed_path - def transform_rot(self, rotation: Vector, source_frame: str, target_frame: str = "map", timeout: float = 1.0): + def transform_rot( + self, rotation: Vector, source_frame: str, target_frame: str = "map", timeout: float = 1.0 + ): """Transform a rotation from source_frame to target_frame. Args: @@ -155,7 +176,10 @@ def transform_rot(self, rotation: Vector, source_frame: str, target_frame: str = try: # Wait for transform to become available self.tf_buffer.can_transform( - target_frame, source_frame, rclpy.time.Time(), rclpy.duration.Duration(seconds=timeout) + target_frame, + source_frame, + rclpy.time.Time(), + rclpy.duration.Duration(seconds=timeout), ) # Create a rotation matrix from the input Euler angles @@ -180,12 +204,21 @@ def transform_rot(self, rotation: Vector, source_frame: str, target_frame: str = # Return as Vector type return Vector(euler_angles) - except (tf2_ros.LookupException, tf2_ros.ConnectivityException, tf2_ros.ExtrapolationException) as e: + except ( + tf2_ros.LookupException, + tf2_ros.ConnectivityException, + tf2_ros.ExtrapolationException, + ) as e: logger.error(f"Transform rotation from {source_frame} to {target_frame} failed: {e}") return None def transform_pose( - self, position: Vector, rotation: Vector, source_frame: str, target_frame: str = "map", timeout: float = 1.0 + self, + position: Vector, + rotation: Vector, + source_frame: str, + target_frame: str = "map", + timeout: float = 1.0, ): """Transform a pose from source_frame to target_frame. diff --git a/dimos/robot/test_ros_observable_topic.py b/dimos/robot/test_ros_observable_topic.py index efff34ab65..e62af0c55d 100644 --- a/dimos/robot/test_ros_observable_topic.py +++ b/dimos/robot/test_ros_observable_topic.py @@ -95,7 +95,9 @@ def test_parallel_and_cleanup(): assert i in received_messages, f"Expected {i} in received messages, got {received_messages}" # ensure that ROS end has only a single subscription - assert len(robot._node.subs) == 1, f"Expected 1 subscription, got {len(robot._node.subs)}: {robot._node.subs}" + assert len(robot._node.subs) == 1, ( + f"Expected 1 subscription, got {len(robot._node.subs)}: {robot._node.subs}" + ) subscription1.dispose() subscription2.dispose() diff --git a/dimos/robot/unitree/unitree_go2.py b/dimos/robot/unitree/unitree_go2.py index 037eb696ab..58447ba9e0 100644 --- a/dimos/robot/unitree/unitree_go2.py +++ b/dimos/robot/unitree/unitree_go2.py @@ -21,7 +21,9 @@ from dimos.stream.video_providers.unitree import UnitreeVideoProvider from reactivex.disposable import CompositeDisposable import logging -from dimos.robot.unitree.external.go2_webrtc_connect.go2_webrtc_driver.webrtc_driver import WebRTCConnectionMethod +from dimos.robot.unitree.external.go2_webrtc_connect.go2_webrtc_driver.webrtc_driver import ( + WebRTCConnectionMethod, +) import os from dimos.robot.unitree.unitree_ros_control import UnitreeROSControl from reactivex.scheduler import ThreadPoolScheduler @@ -81,7 +83,9 @@ def __init__( # Initialize ros_control if it is not provided and use_ros is True if ros_control is None and use_ros: ros_control = UnitreeROSControl( - node_name="unitree_go2", disable_video_stream=disable_video_stream, mock_connection=mock_connection + node_name="unitree_go2", + disable_video_stream=disable_video_stream, + mock_connection=mock_connection, ) # Initialize skill library diff --git a/dimos/robot/unitree/unitree_skills.py b/dimos/robot/unitree/unitree_skills.py index ec8c43dd8c..197d7a14fd 100644 --- a/dimos/robot/unitree/unitree_skills.py +++ b/dimos/robot/unitree/unitree_skills.py @@ -30,18 +30,38 @@ # Module-level constant for Unitree ROS control definitions UNITREE_ROS_CONTROLS: List[Tuple[str, int, str]] = [ ("Damp", 1001, "Lowers the robot to the ground fully."), - ("BalanceStand", 1002, "Activates a mode that maintains the robot in a balanced standing position."), - ("StandUp", 1004, "Commands the robot to transition from a sitting or prone position to a standing posture."), - ("StandDown", 1005, "Instructs the robot to move from a standing position to a sitting or prone posture."), + ( + "BalanceStand", + 1002, + "Activates a mode that maintains the robot in a balanced standing position.", + ), + ( + "StandUp", + 1004, + "Commands the robot to transition from a sitting or prone position to a standing posture.", + ), + ( + "StandDown", + 1005, + "Instructs the robot to move from a standing position to a sitting or prone posture.", + ), ( "RecoveryStand", 1006, "Recovers the robot to a state from which it can take more commands. Useful to run after multiple dynamic commands like front flips.", ), - ("Euler", 1007, "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation."), + ( + "Euler", + 1007, + "Adjusts the robot's orientation using Euler angles, providing precise control over its rotation.", + ), # ("Move", 1008, "Move the robot using velocity commands."), # Intentionally omitted ("Sit", 1009, "Commands the robot to sit down from a standing or moving stance."), - ("RiseSit", 1010, "Commands the robot to rise back to a standing position from a sitting posture."), + ( + "RiseSit", + 1010, + "Commands the robot to rise back to a standing position from a sitting posture.", + ), ( "SwitchGait", 1011, @@ -63,17 +83,29 @@ 1015, "Sets or adjusts the speed at which the robot moves, with various levels available for different operational needs.", ), - ("Hello", 1016, "Performs a greeting action, which could involve a wave or other friendly gesture."), + ( + "Hello", + 1016, + "Performs a greeting action, which could involve a wave or other friendly gesture.", + ), ("Stretch", 1017, "Engages the robot in a stretching routine."), ( "TrajectoryFollow", 1018, "Directs the robot to follow a predefined trajectory, which could involve complex paths or maneuvers.", ), - ("ContinuousGait", 1019, "Enables a mode for continuous walking or running, ideal for long-distance travel."), + ( + "ContinuousGait", + 1019, + "Enables a mode for continuous walking or running, ideal for long-distance travel.", + ), ("Content", 1020, "To display or trigger when the robot is happy."), ("Wallow", 1021, "The robot falls onto its back and rolls around."), - ("Dance1", 1022, "Performs a predefined dance routine 1, programmed for entertainment or demonstration."), + ( + "Dance1", + 1022, + "Performs a predefined dance routine 1, programmed for entertainment or demonstration.", + ), ("Dance2", 1023, "Performs another variant of a predefined dance routine 2."), ("GetBodyHeight", 1024, "Retrieves the current height of the robot's body from the ground."), ( @@ -92,22 +124,50 @@ 1028, "Directs the robot to take a specific pose or stance, which could be used for tasks or performances.", ), - ("Scrape", 1029, "Robot falls to its hind legs and makes scraping motions with its front legs."), + ( + "Scrape", + 1029, + "Robot falls to its hind legs and makes scraping motions with its front legs.", + ), ("FrontFlip", 1030, "Executes a front flip, a complex and dynamic maneuver."), ("FrontJump", 1031, "Commands the robot to perform a forward jump."), - ("FrontPounce", 1032, "Initiates a pouncing movement forward, mimicking animal-like pouncing behavior."), + ( + "FrontPounce", + 1032, + "Initiates a pouncing movement forward, mimicking animal-like pouncing behavior.", + ), ("WiggleHips", 1033, "Causes the robot to wiggle its hips."), ( "GetState", 1034, "Retrieves the current operational state of the robot, including status reports or diagnostic information.", ), - ("EconomicGait", 1035, "Engages a more energy-efficient walking or running mode to conserve battery life."), + ( + "EconomicGait", + 1035, + "Engages a more energy-efficient walking or running mode to conserve battery life.", + ), ("FingerHeart", 1036, "Performs a finger heart gesture while on its hind legs."), - ("Handstand", 1301, "Commands the robot to perform a handstand, demonstrating balance and control."), - ("CrossStep", 1302, "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves."), - ("OnesidedStep", 1303, "Commands the robot to perform a stepping motion that predominantly uses one side."), - ("Bound", 1304, "Initiates a bounding motion, similar to a light, repetitive hopping or leaping."), + ( + "Handstand", + 1301, + "Commands the robot to perform a handstand, demonstrating balance and control.", + ), + ( + "CrossStep", + 1302, + "Engages the robot in a cross-stepping routine, useful for complex locomotion or dance moves.", + ), + ( + "OnesidedStep", + 1303, + "Commands the robot to perform a stepping motion that predominantly uses one side.", + ), + ( + "Bound", + 1304, + "Initiates a bounding motion, similar to a light, repetitive hopping or leaping.", + ), ( "LeadFollow", 1045, @@ -156,7 +216,9 @@ def initialize_skills(self): # Provide the robot instance to each skill for skill_class in self: - print(f"{Colors.GREEN_PRINT_COLOR}Creating instance for skill: {skill_class}{Colors.RESET_COLOR}") + print( + f"{Colors.GREEN_PRINT_COLOR}Creating instance for skill: {skill_class}{Colors.RESET_COLOR}" + ) self.create_instance(skill_class.__name__, robot=self._robot) # Refresh the class skills @@ -207,7 +269,9 @@ class Move(AbstractRobotSkill): x: float = Field(..., description="Forward velocity (m/s).") y: float = Field(default=0.0, description="Left/right velocity (m/s)") yaw: float = Field(default=0.0, description="Rotational velocity (rad/s)") - duration: float = Field(default=0.0, description="How long to move (seconds). If 0, command is continuous") + duration: float = Field( + default=0.0, description="How long to move (seconds). If 0, command is continuous" + ) def __call__(self): super().__call__() @@ -231,7 +295,9 @@ async def continuous_move(): asyncio.set_event_loop(loop) start_time = time.time() try: - while not stop_event.is_set() and (time.time() - start_time) < self.duration: + while ( + not stop_event.is_set() and (time.time() - start_time) < self.duration + ): self._robot.move(vector) await asyncio.sleep(0.001) # Send commands at 1000Hz # Always stop at the end @@ -262,7 +328,9 @@ class Reverse(AbstractRobotSkill): x: float = Field(..., description="Backward velocity (m/s). Positive values move backward.") y: float = Field(default=0.0, description="Left/right velocity (m/s)") yaw: float = Field(default=0.0, description="Rotational velocity (rad/s)") - duration: float = Field(default=0.0, description="How long to move (seconds). If 0, command is continuous") + duration: float = Field( + default=0.0, description="How long to move (seconds). If 0, command is continuous" + ) def __call__(self): super().__call__() @@ -286,7 +354,9 @@ async def continuous_move(): asyncio.set_event_loop(loop) start_time = time.time() try: - while not stop_event.is_set() and (time.time() - start_time) < self.duration: + while ( + not stop_event.is_set() and (time.time() - start_time) < self.duration + ): self._robot.move(vector) await asyncio.sleep(0.001) # Send commands at 1000Hz # Always stop at the end diff --git a/dimos/robot/unitree_webrtc/connection.py b/dimos/robot/unitree_webrtc/connection.py index 48ea276883..7b9595b696 100644 --- a/dimos/robot/unitree_webrtc/connection.py +++ b/dimos/robot/unitree_webrtc/connection.py @@ -90,7 +90,11 @@ def raw_odom_stream(self) -> Subject[Position]: @functools.cache def lidar_stream(self) -> Subject[LidarMessage]: - return backpressure(self.raw_lidar_stream().pipe(ops.map(lambda raw_frame: LidarMessage.from_msg(raw_frame)))) + return backpressure( + self.raw_lidar_stream().pipe( + ops.map(lambda raw_frame: LidarMessage.from_msg(raw_frame)) + ) + ) @functools.cache def odom_stream(self) -> Subject[Position]: diff --git a/dimos/robot/unitree_webrtc/testing/helpers.py b/dimos/robot/unitree_webrtc/testing/helpers.py index 6f815abd56..20a4dbedc2 100644 --- a/dimos/robot/unitree_webrtc/testing/helpers.py +++ b/dimos/robot/unitree_webrtc/testing/helpers.py @@ -25,7 +25,12 @@ def benchmark(calls: int, targetf: Callable[[], Union[int, None]]) -> float: return (end - start + timemod) * 1000 / calls -O3dDrawable = o3d.geometry.Geometry | o3d.geometry.LineSet | o3d.geometry.TriangleMesh | o3d.geometry.PointCloud +O3dDrawable = ( + o3d.geometry.Geometry + | o3d.geometry.LineSet + | o3d.geometry.TriangleMesh + | o3d.geometry.PointCloud +) class ReturnsDrawable(Protocol): diff --git a/dimos/robot/unitree_webrtc/testing/test_multimock.py b/dimos/robot/unitree_webrtc/testing/test_multimock.py index bde677df95..754a9cce44 100644 --- a/dimos/robot/unitree_webrtc/testing/test_multimock.py +++ b/dimos/robot/unitree_webrtc/testing/test_multimock.py @@ -16,7 +16,9 @@ @pytest.mark.needsdata @pytest.mark.vis def test_multimock_stream(): - backpressure(Multimock("athens_odom").stream().pipe(ops.map(Odometry.from_msg))).subscribe(lambda x: print(x)) + backpressure(Multimock("athens_odom").stream().pipe(ops.map(Odometry.from_msg))).subscribe( + lambda x: print(x) + ) map = Map() def lidarmsg(msg): @@ -78,12 +80,16 @@ def test_webui_multistream(): websocket_vis.start() odom_stream = Multimock("athens_odom").stream().pipe(ops.map(Odometry.from_msg)) - lidar_stream = backpressure(Multimock("athens_lidar").stream().pipe(ops.map(LidarMessage.from_msg))) + lidar_stream = backpressure( + Multimock("athens_lidar").stream().pipe(ops.map(LidarMessage.from_msg)) + ) map = Map() map_stream = map.consume(lidar_stream) - costmap_stream = map_stream.pipe(ops.map(lambda x: ["costmap", map.costmap.smudge(preserve_unknown=False)])) + costmap_stream = map_stream.pipe( + ops.map(lambda x: ["costmap", map.costmap.smudge(preserve_unknown=False)]) + ) websocket_vis.connect(costmap_stream) websocket_vis.connect(odom_stream.pipe(ops.map(lambda pos: ["robot_pos", pos.pos.to_2d()]))) diff --git a/dimos/robot/unitree_webrtc/type/costmap.py b/dimos/robot/unitree_webrtc/type/costmap.py index 49e600ab46..814184479e 100644 --- a/dimos/robot/unitree_webrtc/type/costmap.py +++ b/dimos/robot/unitree_webrtc/type/costmap.py @@ -65,7 +65,9 @@ def save_pickle(self, pickle_path: str): pickle.dump(self, f) @classmethod - def create_empty(cls, width: int = 100, height: int = 100, resolution: float = 0.1) -> "Costmap": + def create_empty( + cls, width: int = 100, height: int = 100, resolution: float = 0.1 + ) -> "Costmap": """Create an empty costmap with specified dimensions.""" return cls( grid=np.zeros((height, width), dtype=np.int8), @@ -185,7 +187,9 @@ def smudge( for i in range(iterations): # Dilate the binary map - dilated = ndimage.binary_dilation(dilated_map > 0, structure=kernel, iterations=1).astype(np.uint8) + dilated = ndimage.binary_dilation( + dilated_map > 0, structure=kernel, iterations=1 + ).astype(np.uint8) # Calculate the new layer (cells that were just added in this iteration) new_layer = (dilated - (dilated_map > 0).astype(np.uint8)) * 100 diff --git a/dimos/robot/unitree_webrtc/type/lidar.py b/dimos/robot/unitree_webrtc/type/lidar.py index 37a51b702a..a0e081d4f1 100644 --- a/dimos/robot/unitree_webrtc/type/lidar.py +++ b/dimos/robot/unitree_webrtc/type/lidar.py @@ -124,7 +124,9 @@ def icptransform(self, other): def estimate_normals(self) -> "LidarMessage": # Check if normals already exist by testing if the normals attribute has data if not self.pointcloud.has_normals() or len(self.pointcloud.normals) == 0: - self.pointcloud.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30)) + self.pointcloud.estimate_normals( + search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=30) + ) return self def color(self, color_choice) -> "LidarMessage": diff --git a/dimos/robot/unitree_webrtc/type/map.py b/dimos/robot/unitree_webrtc/type/map.py index eef15bdeef..cbfe4eb903 100644 --- a/dimos/robot/unitree_webrtc/type/map.py +++ b/dimos/robot/unitree_webrtc/type/map.py @@ -77,9 +77,11 @@ def splice_cylinder( map_pts = np.asarray(map_pcd.points) planar_dists_map = np.linalg.norm(map_pts[:, axes] - center[axes], axis=1) - victims = np.nonzero((planar_dists_map < radius) & (map_pts[:, axis] >= axis_min) & (map_pts[:, axis] <= axis_max))[ - 0 - ] + victims = np.nonzero( + (planar_dists_map < radius) + & (map_pts[:, axis] >= axis_min) + & (map_pts[:, axis] <= axis_max) + )[0] survivors = map_pcd.select_by_index(victims, invert=True) return survivors + patch_pcd diff --git a/dimos/robot/unitree_webrtc/type/test_lidar.py b/dimos/robot/unitree_webrtc/type/test_lidar.py index 65f102c580..0b860c9658 100644 --- a/dimos/robot/unitree_webrtc/type/test_lidar.py +++ b/dimos/robot/unitree_webrtc/type/test_lidar.py @@ -116,7 +116,10 @@ def test_downsample(): # framea_icp = framea.copy().icptransform(frameb) pcd = framea.copy().pointcloud newpcd, _, _ = pcd.voxel_down_sample_and_trace( - voxel_size=0.25, min_bound=pcd.get_min_bound(), max_bound=pcd.get_max_bound(), approximate_class=False + voxel_size=0.25, + min_bound=pcd.get_min_bound(), + max_bound=pcd.get_max_bound(), + approximate_class=False, ) multivis( diff --git a/dimos/robot/unitree_webrtc/type/vector.py b/dimos/robot/unitree_webrtc/type/vector.py index 8d85ca3355..e5fb446884 100644 --- a/dimos/robot/unitree_webrtc/type/vector.py +++ b/dimos/robot/unitree_webrtc/type/vector.py @@ -197,7 +197,8 @@ def angle(self, other: Union["Vector", Iterable[float]]) -> float: other_data = np.array(other, dtype=float) cos_angle = np.clip( - np.dot(self._data, other_data) / (np.linalg.norm(self._data) * np.linalg.norm(other_data)), + np.dot(self._data, other_data) + / (np.linalg.norm(self._data) * np.linalg.norm(other_data)), -1.0, 1.0, ) diff --git a/dimos/simulation/genesis/stream.py b/dimos/simulation/genesis/stream.py index 4dc48115c4..b553f29fa2 100644 --- a/dimos/simulation/genesis/stream.py +++ b/dimos/simulation/genesis/stream.py @@ -106,7 +106,9 @@ def stream(self): if frame_count % 100 == 0: elapsed_time = time.time() - start_time current_fps = frame_count / elapsed_time - print(f"[Stream] Processed {frame_count} frames | Current FPS: {current_fps:.2f}") + print( + f"[Stream] Processed {frame_count} frames | Current FPS: {current_fps:.2f}" + ) except KeyboardInterrupt: print("\n[Stream] Received keyboard interrupt, stopping stream...") diff --git a/dimos/simulation/isaac/stream.py b/dimos/simulation/isaac/stream.py index e77f7d1bd0..44560783bd 100644 --- a/dimos/simulation/isaac/stream.py +++ b/dimos/simulation/isaac/stream.py @@ -76,7 +76,9 @@ def _setup_camera(self): if not camera_prim: raise RuntimeError(f"Failed to find camera at path: {self.camera_path}") - self.render_product = self.rep.create.render_product(self.camera_path, resolution=(self.width, self.height)) + self.render_product = self.rep.create.render_product( + self.camera_path, resolution=(self.width, self.height) + ) def _setup_annotator(self): """Setup the specified annotator.""" @@ -114,7 +116,9 @@ def stream(self): if frame_count % 100 == 0: elapsed_time = time.time() - start_time current_fps = frame_count / elapsed_time - print(f"[Stream] Processed {frame_count} frames | Current FPS: {current_fps:.2f}") + print( + f"[Stream] Processed {frame_count} frames | Current FPS: {current_fps:.2f}" + ) except KeyboardInterrupt: print("\n[Stream] Received keyboard interrupt, stopping stream...") diff --git a/dimos/skills/navigation.py b/dimos/skills/navigation.py index e41727da96..adb4d0e980 100644 --- a/dimos/skills/navigation.py +++ b/dimos/skills/navigation.py @@ -74,7 +74,8 @@ class NavigateWithText(AbstractRobotSkill): distance: float = Field(1.0, description="Desired distance to maintain from object in meters") timeout: float = Field(40.0, description="Maximum time to spend navigating in seconds") similarity_threshold: float = Field( - 0.25, description="Minimum similarity score required for semantic map results to be considered valid" + 0.25, + description="Minimum similarity score required for semantic map results to be considered valid", ) def __init__(self, robot=None, **data): @@ -142,7 +143,11 @@ def _navigate_to_object(self): goal_y_robot = 0 goal_angle = 0 - while time.time() - start_time < 10.0 and not self._stop_event.is_set() and not target_acquired: + while ( + time.time() - start_time < 10.0 + and not self._stop_event.is_set() + and not target_acquired + ): # Get the latest tracking data tracking_data = self._robot.object_tracking_stream.pipe(ops.take(1)).run() @@ -151,11 +156,15 @@ def _navigate_to_object(self): if "distance" in target and "angle" in target: # Convert target distance and angle to xy coordinates in robot frame - goal_distance = target["distance"] - self.distance # Subtract desired distance to stop short + goal_distance = ( + target["distance"] - self.distance + ) # Subtract desired distance to stop short goal_angle = -target["angle"] logger.info(f"Target distance: {goal_distance}, Target angle: {goal_angle}") - goal_x_robot, goal_y_robot = distance_angle_to_goal_xy(goal_distance, goal_angle) + goal_x_robot, goal_y_robot = distance_angle_to_goal_xy( + goal_distance, goal_angle + ) target_acquired = True break @@ -163,13 +172,19 @@ def _navigate_to_object(self): logger.warning(f"No valid target tracking data found. target: {target}") else: - logger.warning(f"No valid target tracking data found. tracking_data: {tracking_data}") + logger.warning( + f"No valid target tracking data found. tracking_data: {tracking_data}" + ) time.sleep(0.1) if not target_acquired: logger.error("Failed to acquire valid target tracking data") - return {"success": False, "failure_reason": "Perception", "error": "Failed to track object"} + return { + "success": False, + "failure_reason": "Perception", + "error": "Failed to track object", + } logger.info( f"Navigating to target at local coordinates: ({goal_x_robot:.2f}, {goal_y_robot:.2f}), angle: {goal_angle:.2f}" @@ -194,7 +209,9 @@ def _navigate_to_object(self): "message": f"Successfully navigated to {self.query} in view", } else: - logger.warning(f"Failed to reach {self.query} within timeout or operation was stopped") + logger.warning( + f"Failed to reach {self.query} within timeout or operation was stopped" + ) return { "success": False, "failure_reason": "Navigation", @@ -226,7 +243,11 @@ def _navigate_using_semantic_map(self): if not results: logger.warning(f"No results found for query: '{self.query}'") - return {"success": False, "query": self.query, "error": "No matching location found in semantic map"} + return { + "success": False, + "query": self.query, + "error": "No matching location found in semantic map", + } # Get the best match best_match = results[0] @@ -236,13 +257,20 @@ def _navigate_using_semantic_map(self): metadata = metadata[0] # Extract coordinates from metadata - if isinstance(metadata, dict) and "pos_x" in metadata and "pos_y" in metadata and "rot_z" in metadata: + if ( + isinstance(metadata, dict) + and "pos_x" in metadata + and "pos_y" in metadata + and "rot_z" in metadata + ): pos_x = metadata.get("pos_x", 0) pos_y = metadata.get("pos_y", 0) theta = metadata.get("rot_z", 0) # Calculate similarity score (distance is inverse of similarity) - similarity = 1.0 - (best_match.get("distance", 0) if best_match.get("distance") is not None else 0) + similarity = 1.0 - ( + best_match.get("distance", 0) if best_match.get("distance") is not None else 0 + ) logger.info( f"Found match for '{self.query}' at ({pos_x:.2f}, {pos_y:.2f}, rotation {theta:.2f}) with similarity: {similarity:.4f}" @@ -272,7 +300,9 @@ def run_navigation(): self.register_as_running("Navigate", skill_library) try: - logger.info(f"Starting navigation to ({pos_x:.2f}, {pos_y:.2f}) with rotation {theta:.2f}") + logger.info( + f"Starting navigation to ({pos_x:.2f}, {pos_y:.2f}) with rotation {theta:.2f}" + ) # Pass our stop_event to allow cancellation result = False try: @@ -312,7 +342,11 @@ def run_navigation(): } else: logger.warning(f"No valid position data found for query: '{self.query}'") - return {"success": False, "query": self.query, "error": "No valid position data found in semantic map"} + return { + "success": False, + "query": self.query, + "error": "No valid position data found in semantic map", + } except Exception as e: logger.error(f"Error in semantic map navigation: {e}") return {"success": False, "error": f"Semantic map error: {e}"} @@ -340,11 +374,15 @@ def __call__(self): return object_result elif object_result and object_result["failure_reason"] == "Navigation": - logger.info(f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}") + logger.info( + f"Failed to navigate to {self.query} in view: {object_result.get('error', 'Unknown error')}" + ) return object_result # If object navigation failed, fall back to semantic map - logger.info(f"Object not found in view. Falling back to semantic map query for: '{self.query}'") + logger.info( + f"Object not found in view. Falling back to semantic map query for: '{self.query}'" + ) return self._navigate_using_semantic_map() @@ -397,7 +435,9 @@ class GetPose(AbstractRobotSkill): allowing you to navigate back to it later using the Navigate skill. """ - location_name: str = Field("", description="Optional name to assign to this location (e.g., 'kitchen', 'office')") + location_name: str = Field( + "", description="Optional name to assign to this location (e.g., 'kitchen', 'office')" + ) def __init__(self, robot=None, **data): """ @@ -430,7 +470,11 @@ def __call__(self): # Format the response result = { "success": True, - "position": {"x": position[0], "y": position[1], "z": position[2] if len(position) > 2 else 0.0}, + "position": { + "x": position[0], + "y": position[1], + "z": position[2] if len(position) > 2 else 0.0, + }, "rotation": {"roll": rotation[0], "pitch": rotation[1], "yaw": rotation[2]}, } @@ -440,7 +484,9 @@ def __call__(self): spatial_memory = self._robot.get_spatial_memory() # Create a RobotLocation object - location = RobotLocation(name=self.location_name, position=position, rotation=rotation) + location = RobotLocation( + name=self.location_name, position=position, rotation=rotation + ) # Add to spatial memory if spatial_memory.add_robot_location(location): @@ -467,7 +513,9 @@ class NavigateToGoal(AbstractRobotSkill): orientation at the goal position. """ - position: Tuple[float, float] = Field((0.0, 0.0), description="Target position (x, y) in map frame") + position: Tuple[float, float] = Field( + (0.0, 0.0), description="Target position (x, y) in map frame" + ) rotation: Optional[float] = Field(None, description="Target orientation (yaw) in radians") frame: str = Field("map", description="Reference frame for the position and rotation") timeout: float = Field(120.0, description="Maximum time (in seconds) allowed for navigation") @@ -535,7 +583,12 @@ def __call__(self): except Exception as e: error_msg = f"Error during navigation: {e}" logger.error(error_msg) - return {"success": False, "position": self.position, "rotation": self.rotation, "error": error_msg} + return { + "success": False, + "position": self.position, + "rotation": self.rotation, + "error": error_msg, + } finally: self.stop() diff --git a/dimos/skills/observe_stream.py b/dimos/skills/observe_stream.py index ee666b3b1e..dc38052508 100644 --- a/dimos/skills/observe_stream.py +++ b/dimos/skills/observe_stream.py @@ -45,12 +45,16 @@ class ObserveStream(AbstractRobotSkill): or to monitor changes in the environment. """ - timestep: float = Field(60.0, description="Time interval in seconds between observation queries") + timestep: float = Field( + 60.0, description="Time interval in seconds between observation queries" + ) query_text: str = Field( "What do you see in this image? Alert me if you see any people or important changes.", description="Query text to send to agent with each image", ) - max_duration: float = Field(0.0, description="Maximum duration to run the observer in seconds (0 for indefinite)") + max_duration: float = Field( + 0.0, description="Maximum duration to run the observer in seconds (0 for indefinite)" + ) def __init__(self, robot=None, agent: Optional[LLMAgent] = None, video_stream=None, **data): """ @@ -163,7 +167,8 @@ def _get_frame_from_stream(self): subscription = self._video_stream.pipe( ops.take(1) # Take just one frame ).subscribe( - on_next=lambda x: frame_subject.on_next(x), on_error=lambda e: logger.error(f"Error getting frame: {e}") + on_next=lambda x: frame_subject.on_next(x), + on_error=lambda e: logger.error(f"Error getting frame: {e}"), ) timeout = 5.0 # 5 seconds timeout diff --git a/dimos/skills/rest/rest.py b/dimos/skills/rest/rest.py index ed789d69d9..3e7c7426cc 100644 --- a/dimos/skills/rest/rest.py +++ b/dimos/skills/rest/rest.py @@ -82,10 +82,14 @@ def __call__(self) -> str: timeout=self.timeout, ) response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx) - logger.debug(f"Request successful. Status: {response.status_code}, Response: {response.text[:100]}...") + logger.debug( + f"Request successful. Status: {response.status_code}, Response: {response.text[:100]}..." + ) return response.text # Return text content directly except requests.exceptions.HTTPError as http_err: - logger.error(f"HTTP error occurred: {http_err} - Status Code: {http_err.response.status_code}") + logger.error( + f"HTTP error occurred: {http_err} - Status Code: {http_err.response.status_code}" + ) return f"HTTP error making {self.method.upper()} request to {self.url}: {http_err.response.status_code} {http_err.response.reason}" except requests.exceptions.RequestException as req_err: logger.error(f"Request exception occurred: {req_err}") diff --git a/dimos/skills/skills.py b/dimos/skills/skills.py index 9925046da6..559f9b0e36 100644 --- a/dimos/skills/skills.py +++ b/dimos/skills/skills.py @@ -62,7 +62,11 @@ def get_class_skills(self) -> list["AbstractSkill"]: attr = getattr(self.__class__, attr_name) # Check if it's a class and inherits from AbstractSkill - if isinstance(attr, type) and issubclass(attr, AbstractSkill) and attr is not AbstractSkill: + if ( + isinstance(attr, type) + and issubclass(attr, AbstractSkill) + and attr is not AbstractSkill + ): skills.append(attr) except (AttributeError, TypeError): # Skip attributes that can't be accessed or aren't classes @@ -212,7 +216,11 @@ def terminate_skill(self, name: str): logger.warning(f"Skill {name} does not have a stop method") # Also dispose the subscription if it exists - if subscription is not None and hasattr(subscription, "dispose") and callable(subscription.dispose): + if ( + subscription is not None + and hasattr(subscription, "dispose") + and callable(subscription.dispose) + ): subscription.dispose() logger.info(f"Disposed subscription for skill: {name}") elif subscription is not None: @@ -297,7 +305,9 @@ class AbstractRobotSkill(AbstractSkill): def __init__(self, *args, robot: Optional[Robot] = None, **kwargs): super().__init__(*args, **kwargs) self._robot = robot - print(f"{Colors.BLUE_PRINT_COLOR}Robot Skill Initialized with Robot: {robot}{Colors.RESET_COLOR}") + print( + f"{Colors.BLUE_PRINT_COLOR}Robot Skill Initialized with Robot: {robot}{Colors.RESET_COLOR}" + ) def set_robot(self, robot: Robot) -> None: """Set the robot reference for this skills instance. diff --git a/dimos/skills/speak.py b/dimos/skills/speak.py index a1bb7ba6ea..4abada72bb 100644 --- a/dimos/skills/speak.py +++ b/dimos/skills/speak.py @@ -47,7 +47,9 @@ def start_audio_queue_processor(): if _queue_processor_thread is None or not _queue_processor_thread.is_alive(): _queue_running = True - _queue_processor_thread = threading.Thread(target=_process_audio_queue, daemon=True, name="AudioQueueProcessor") + _queue_processor_thread = threading.Thread( + target=_process_audio_queue, daemon=True, name="AudioQueueProcessor" + ) _queue_processor_thread.start() logger.info("Started audio queue processor thread") diff --git a/dimos/skills/visual_navigation_skills.py b/dimos/skills/visual_navigation_skills.py index cf165ab89b..72696f0427 100644 --- a/dimos/skills/visual_navigation_skills.py +++ b/dimos/skills/visual_navigation_skills.py @@ -42,7 +42,9 @@ class FollowHuman(AbstractRobotSkill): if you want to navigate to a specific person, use NavigateTo instead. """ - distance: float = Field(1.5, description="Desired distance to maintain from the person in meters") + distance: float = Field( + 1.5, description="Desired distance to maintain from the person in meters" + ) timeout: float = Field(20.0, description="Maximum time to follow the person in seconds") point: Optional[Tuple[int, int]] = Field( None, description="Optional point to start tracking (x,y pixel coordinates)" @@ -62,7 +64,10 @@ def __call__(self): """ super().__call__() - if not hasattr(self._robot, "person_tracking_stream") or self._robot.person_tracking_stream is None: + if ( + not hasattr(self._robot, "person_tracking_stream") + or self._robot.person_tracking_stream is None + ): logger.error("Robot does not have a person tracking stream") return False @@ -74,13 +79,17 @@ def __call__(self): try: # Initialize visual servoing - self._visual_servoing = VisualServoing(tracking_stream=self._robot.person_tracking_stream) + self._visual_servoing = VisualServoing( + tracking_stream=self._robot.person_tracking_stream + ) logger.warning(f"Following human for {self.timeout} seconds...") start_time = time.time() # Start tracking - track_success = self._visual_servoing.start_tracking(point=self.point, desired_distance=self.distance) + track_success = self._visual_servoing.start_tracking( + point=self.point, desired_distance=self.distance + ) if not track_success: logger.error("Failed to start tracking") diff --git a/dimos/stream/audio/node_key_recorder.py b/dimos/stream/audio/node_key_recorder.py index 02084b5d6f..aaf9e24a38 100644 --- a/dimos/stream/audio/node_key_recorder.py +++ b/dimos/stream/audio/node_key_recorder.py @@ -208,7 +208,8 @@ def _combine_audio_events(self, audio_events: List[AudioEvent]) -> AudioEvent: valid_events = [ event for event in audio_events - if event is not None and (hasattr(event, "data") and event.data is not None and event.data.size > 0) + if event is not None + and (hasattr(event, "data") and event.data is not None and event.data.size > 0) ] if not valid_events: diff --git a/dimos/stream/audio/node_normalizer.py b/dimos/stream/audio/node_normalizer.py index 11ec78999b..caa17de042 100644 --- a/dimos/stream/audio/node_normalizer.py +++ b/dimos/stream/audio/node_normalizer.py @@ -88,7 +88,9 @@ def _normalize_audio(self, audio_event: AudioEvent) -> AudioEvent: ideal_gain = min(ideal_gain, self.max_gain) # Smoothly adapt current gain towards ideal gain - self.current_gain = (1 - self.adapt_speed) * self.current_gain + self.adapt_speed * ideal_gain + self.current_gain = ( + 1 - self.adapt_speed + ) * self.current_gain + self.adapt_speed * ideal_gain # Apply gain to audio data normalized_data = audio_event.data * self.current_gain @@ -135,7 +137,9 @@ def on_subscribe(observer, scheduler): on_completed=lambda: observer.on_completed(), ) - logger.info(f"Started audio normalizer with target level: {self.target_level}, max gain: {self.max_gain}") + logger.info( + f"Started audio normalizer with target level: {self.target_level}, max gain: {self.max_gain}" + ) # Return a disposable to clean up resources def dispose(): diff --git a/dimos/stream/audio/node_simulated.py b/dimos/stream/audio/node_simulated.py index e1d756a9ad..918e643242 100644 --- a/dimos/stream/audio/node_simulated.py +++ b/dimos/stream/audio/node_simulated.py @@ -77,7 +77,10 @@ def _generate_sine_wave(self, time_points: np.ndarray) -> np.ndarray: elif self.waveform == "square": wave = np.sign(np.sin(phase_arg)) elif self.waveform == "triangle": - wave = 2 * np.abs(2 * (phase_arg / (2 * np.pi) - np.floor(phase_arg / (2 * np.pi) + 0.5))) - 1 + wave = ( + 2 * np.abs(2 * (phase_arg / (2 * np.pi) - np.floor(phase_arg / (2 * np.pi) + 0.5))) + - 1 + ) elif self.waveform == "sawtooth": wave = 2 * (phase_arg / (2 * np.pi) - np.floor(0.5 + phase_arg / (2 * np.pi))) else: @@ -96,7 +99,9 @@ def _generate_sine_wave(self, time_points: np.ndarray) -> np.ndarray: wave *= volume_factor * 0.7 # Update volume phase for next frame - self.volume_phase += time_points[-1] - time_points[0] + (time_points[1] - time_points[0]) + self.volume_phase += ( + time_points[-1] - time_points[0] + (time_points[1] - time_points[0]) + ) # Update phase for next frame self.phase += time_points[-1] - time_points[0] + (time_points[1] - time_points[0]) @@ -121,7 +126,9 @@ def _audio_thread(self, observer, interval: float): while self._running: # Calculate time points for this frame - time_points = np.arange(sample_index, sample_index + self.frame_length) / self.sample_rate + time_points = ( + np.arange(sample_index, sample_index + self.frame_length) / self.sample_rate + ) # Generate audio data audio_data = self._generate_sine_wave(time_points) @@ -165,7 +172,9 @@ def on_subscribe(observer, scheduler): interval = 1.0 / fps # Start the audio generation thread - self._thread = threading.Thread(target=self._audio_thread, args=(observer, interval), daemon=True) + self._thread = threading.Thread( + target=self._audio_thread, args=(observer, interval), daemon=True + ) self._thread.start() logger.info( diff --git a/dimos/stream/audio/node_volume_monitor.py b/dimos/stream/audio/node_volume_monitor.py index 8c1911cf33..af1ddaf098 100644 --- a/dimos/stream/audio/node_volume_monitor.py +++ b/dimos/stream/audio/node_volume_monitor.py @@ -137,7 +137,9 @@ def monitor( The configured volume monitor node """ # Create the volume monitor node with specified parameters - volume_monitor = VolumeMonitorNode(threshold=threshold, bar_length=bar_length, volume_func=volume_func) + volume_monitor = VolumeMonitorNode( + threshold=threshold, bar_length=bar_length, volume_func=volume_func + ) # Connect the volume monitor to the audio source volume_monitor.consume_audio(audio_source) diff --git a/dimos/stream/audio/volume.py b/dimos/stream/audio/volume.py index b8837e3e32..e9748ef3a7 100644 --- a/dimos/stream/audio/volume.py +++ b/dimos/stream/audio/volume.py @@ -82,7 +82,9 @@ def process_frame_wrapper(frame): processed["done"] = True # Subscribe to get a single frame and process it - subscription = audio_observable.subscribe(on_next=process_frame_wrapper, on_completed=lambda: print("Completed")) + subscription = audio_observable.subscribe( + on_next=process_frame_wrapper, on_completed=lambda: print("Completed") + ) # Wait for frame processing to complete while not processed["done"]: diff --git a/dimos/stream/frame_processor.py b/dimos/stream/frame_processor.py index 02a72850af..b07a09118b 100644 --- a/dimos/stream/frame_processor.py +++ b/dimos/stream/frame_processor.py @@ -202,7 +202,8 @@ def process_stream_optical_flow(self, frame_stream: Observable) -> Observable: """ return frame_stream.pipe( ops.scan( - lambda acc, frame: self.compute_optical_flow(acc, frame, compute_relevancy=False), (None, None, None) + lambda acc, frame: self.compute_optical_flow(acc, frame, compute_relevancy=False), + (None, None, None), ), ops.map(lambda result: result[1]), # Extract flow component ops.filter(lambda flow: flow is not None), @@ -245,7 +246,8 @@ def process_stream_optical_flow_with_relevancy(self, frame_stream: Observable) - """ return frame_stream.pipe( ops.scan( - lambda acc, frame: self.compute_optical_flow(acc, frame, compute_relevancy=True), (None, None, None) + lambda acc, frame: self.compute_optical_flow(acc, frame, compute_relevancy=True), + (None, None, None), ), # Result is (current_frame, flow, relevancy) ops.filter(lambda result: result[1] is not None), # Filter out None flows diff --git a/dimos/stream/ros_video_provider.py b/dimos/stream/ros_video_provider.py index ffe6b5b10a..7ca6fa4aa7 100644 --- a/dimos/stream/ros_video_provider.py +++ b/dimos/stream/ros_video_provider.py @@ -43,7 +43,9 @@ class ROSVideoProvider(AbstractVideoProvider): _last_frame_time: Timestamp of the last received frame. """ - def __init__(self, dev_name: str = "ros_video", pool_scheduler: Optional[ThreadPoolScheduler] = None): + def __init__( + self, dev_name: str = "ros_video", pool_scheduler: Optional[ThreadPoolScheduler] = None + ): """Initialize the ROS video provider. Args: @@ -70,7 +72,9 @@ def push_data(self, frame: np.ndarray) -> None: current_time = time.time() if self._last_frame_time: frame_interval = current_time - self._last_frame_time - self.logger.debug(f"Frame interval: {frame_interval:.3f}s ({1 / frame_interval:.1f} FPS)") + self.logger.debug( + f"Frame interval: {frame_interval:.3f}s ({1 / frame_interval:.1f} FPS)" + ) self._last_frame_time = current_time self.logger.debug(f"Pushing frame type: {type(frame)}") diff --git a/dimos/stream/rtsp_video_provider.py b/dimos/stream/rtsp_video_provider.py index 0ed4063e33..5926c4f676 100644 --- a/dimos/stream/rtsp_video_provider.py +++ b/dimos/stream/rtsp_video_provider.py @@ -43,7 +43,9 @@ class RtspVideoProvider(AbstractVideoProvider): built-in VideoCapture for RTSP. """ - def __init__(self, dev_name: str, rtsp_url: str, pool_scheduler: Optional[ThreadPoolScheduler] = None) -> None: + def __init__( + self, dev_name: str, rtsp_url: str, pool_scheduler: Optional[ThreadPoolScheduler] = None + ) -> None: """Initializes the RTSP video provider. Args: @@ -75,7 +77,8 @@ def _get_stream_info(self) -> dict: raise VideoSourceError(msg) from e video_stream = next( - (stream for stream in probe.get("streams", []) if stream.get("codec_type") == "video"), None + (stream for stream in probe.get("streams", []) if stream.get("codec_type") == "video"), + None, ) if video_stream is None: @@ -99,10 +102,14 @@ def _get_stream_info(self) -> dict: else: fps = float(fps_str) if fps <= 0: - logger.warning(f"({self.dev_name}) Invalid avg_frame_rate '{fps_str}', defaulting FPS to 30.") + logger.warning( + f"({self.dev_name}) Invalid avg_frame_rate '{fps_str}', defaulting FPS to 30." + ) fps = 30.0 except ValueError: - logger.warning(f"({self.dev_name}) Could not parse FPS '{fps_str}', defaulting FPS to 30.") + logger.warning( + f"({self.dev_name}) Could not parse FPS '{fps_str}', defaulting FPS to 30." + ) fps = 30.0 logger.info(f"({self.dev_name}) Stream info: {width}x{height} @ {fps:.2f} FPS") @@ -175,7 +182,9 @@ def cleanup_process(): with self._lock: # Check if the process exists and is still running if process and process.poll() is None: - logger.info(f"({self.dev_name}) Terminating ffmpeg process (PID: {process.pid}).") + logger.info( + f"({self.dev_name}) Terminating ffmpeg process (PID: {process.pid})." + ) try: process.terminate() # Ask ffmpeg to exit gracefully process.wait(timeout=1.0) # Wait up to 1 second @@ -251,7 +260,10 @@ def cleanup_process(): # Break inner loop to trigger cleanup and potential restart with self._lock: # Clear the shared process handle if it matches the one that just exited - if self._ffmpeg_process and self._ffmpeg_process.pid == process.pid: + if ( + self._ffmpeg_process + and self._ffmpeg_process.pid == process.pid + ): self._ffmpeg_process = None process = None # Clear local process variable break # Exit frame reading loop @@ -280,12 +292,17 @@ def cleanup_process(): except (VideoSourceError, ffmpeg.Error) as e: # Errors during ffmpeg process start or severe runtime errors - logger.error(f"({self.dev_name}) Unrecoverable ffmpeg error: {e}. Stopping emission.") + logger.error( + f"({self.dev_name}) Unrecoverable ffmpeg error: {e}. Stopping emission." + ) observer.on_error(e) should_stop.set() # Stop retrying except Exception as e: # Catch other unexpected errors during frame reading/processing - logger.error(f"({self.dev_name}) Unexpected error processing stream: {e}", exc_info=True) + logger.error( + f"({self.dev_name}) Unexpected error processing stream: {e}", + exc_info=True, + ) observer.on_error(VideoFrameError(f"Frame processing failed: {e}")) should_stop.set() # Stop retrying @@ -325,7 +342,9 @@ def dispose_all(self) -> None: with self._lock: process = self._ffmpeg_process # Get the current process handle if process and process.poll() is None: - logger.info(f"({self.dev_name}) Terminating ffmpeg process (PID: {process.pid}) via dispose_all.") + logger.info( + f"({self.dev_name}) Terminating ffmpeg process (PID: {process.pid}) via dispose_all." + ) try: process.terminate() process.wait(timeout=1.0) @@ -335,7 +354,9 @@ def dispose_all(self) -> None: ) process.kill() except Exception as e: - logger.error(f"({self.dev_name}) Error during ffmpeg termination in dispose_all: {e}") + logger.error( + f"({self.dev_name}) Error during ffmpeg termination in dispose_all: {e}" + ) finally: self._ffmpeg_process = None # Clear handle after attempting termination elif process: # Process exists but already terminated @@ -344,7 +365,9 @@ def dispose_all(self) -> None: ) self._ffmpeg_process = None else: - logger.debug(f"({self.dev_name}) No active ffmpeg process found during dispose_all.") + logger.debug( + f"({self.dev_name}) No active ffmpeg process found during dispose_all." + ) # Call the parent class's dispose_all to handle Rx Disposables super().dispose_all() diff --git a/dimos/stream/video_operators.py b/dimos/stream/video_operators.py index e7804ec7f5..78ba7518a1 100644 --- a/dimos/stream/video_operators.py +++ b/dimos/stream/video_operators.py @@ -95,13 +95,18 @@ def with_fps_sampling( sample_interval = timedelta(microseconds=int(1_000_000 / fps)) def _operator(source: Observable) -> Observable: - return source.pipe(ops.sample(sample_interval) if use_latest else ops.throttle_first(sample_interval)) + return source.pipe( + ops.sample(sample_interval) if use_latest else ops.throttle_first(sample_interval) + ) return _operator @staticmethod def with_jpeg_export( - frame_processor: "FrameProcessor", save_limit: int = 100, suffix: str = "", loop: bool = False + frame_processor: "FrameProcessor", + save_limit: int = 100, + suffix: str = "", + loop: bool = False, ) -> Callable[[Observable], Observable]: """Creates an operator that saves video frames as JPEG files. @@ -134,7 +139,11 @@ def with_jpeg_export( """ def _operator(source: Observable) -> Observable: - return source.pipe(ops.map(lambda frame: frame_processor.export_to_jpeg(frame, save_limit, loop, suffix))) + return source.pipe( + ops.map( + lambda frame: frame_processor.export_to_jpeg(frame, save_limit, loop, suffix) + ) + ) return _operator @@ -183,7 +192,9 @@ def with_optical_flow_filtering(threshold: float = 1.0) -> Callable[[Observable] def with_edge_detection( frame_processor: "FrameProcessor", ) -> Callable[[Observable], Observable]: - return lambda source: source.pipe(ops.map(lambda frame: frame_processor.edge_detection(frame))) + return lambda source: source.pipe( + ops.map(lambda frame: frame_processor.edge_detection(frame)) + ) @staticmethod def with_optical_flow( @@ -191,7 +202,9 @@ def with_optical_flow( ) -> Callable[[Observable], Observable]: return lambda source: source.pipe( ops.scan( - lambda acc, frame: frame_processor.compute_optical_flow(acc, frame, compute_relevancy=False), + lambda acc, frame: frame_processor.compute_optical_flow( + acc, frame, compute_relevancy=False + ), (None, None, None), ), ops.map(lambda result: result[1]), # Extract flow component @@ -200,7 +213,9 @@ def with_optical_flow( ) @staticmethod - def with_zmq_socket(socket: zmq.Socket, scheduler: Optional[Any] = None) -> Callable[[Observable], Observable]: + def with_zmq_socket( + socket: zmq.Socket, scheduler: Optional[Any] = None + ) -> Callable[[Observable], Observable]: def send_frame(frame, socket): _, img_encoded = cv2.imencode(".jpg", frame) socket.send(img_encoded.tobytes()) @@ -330,7 +345,9 @@ def on_completed(): if not in_flight: observer.on_completed() - upstream_disp = source.subscribe(on_next, on_error, on_completed, scheduler=scheduler) + upstream_disp = source.subscribe( + on_next, on_error, on_completed, scheduler=scheduler + ) return dispose_all return create(_subscribe) @@ -418,7 +435,10 @@ def on_completed(): observer.on_completed() upstream_disp = source.subscribe( - on_next=on_next, on_error=on_error, on_completed=on_completed, scheduler=scheduler + on_next=on_next, + on_error=on_error, + on_completed=on_completed, + scheduler=scheduler, ) return Disposable(dispose_all) @@ -457,7 +477,10 @@ def set_not_processing(): print("\033[35mItem processed.\033[0m") return source.subscribe( - on_next=on_next, on_error=observer.on_error, on_completed=observer.on_completed, scheduler=scheduler + on_next=on_next, + on_error=observer.on_error, + on_completed=observer.on_completed, + scheduler=scheduler, ) return create(subscribe) @@ -488,7 +511,10 @@ def on_completed(): observer.on_completed() return source.subscribe( - on_next=on_next, on_error=on_error, on_completed=on_completed, scheduler=scheduler + on_next=on_next, + on_error=on_error, + on_completed=on_completed, + scheduler=scheduler, ) return Observable(subscribe) @@ -514,7 +540,10 @@ def on_completed(): observer.on_completed() return source.subscribe( - on_next=on_next, on_error=on_error, on_completed=on_completed, scheduler=scheduler + on_next=on_next, + on_error=on_error, + on_completed=on_completed, + scheduler=scheduler, ) return Observable(subscribe) @@ -534,7 +563,11 @@ class PrintColor(Enum): @staticmethod def print_emission( - id: str, dev_name: str = "NA", counts: dict = None, color: "Operators.PrintColor" = None, enabled: bool = True + id: str, + dev_name: str = "NA", + counts: dict = None, + color: "Operators.PrintColor" = None, + enabled: bool = True, ): """ Creates an operator that prints the emission with optional counts for debugging. @@ -578,7 +611,10 @@ def on_next(value): observer.on_next(value) return source.subscribe( - on_next=on_next, on_error=observer.on_error, on_completed=observer.on_completed, scheduler=scheduler + on_next=on_next, + on_error=observer.on_error, + on_completed=observer.on_completed, + scheduler=scheduler, ) return create(_subscribe) diff --git a/dimos/stream/video_provider.py b/dimos/stream/video_provider.py index 8123e67fa7..050905a024 100644 --- a/dimos/stream/video_provider.py +++ b/dimos/stream/video_provider.py @@ -59,7 +59,9 @@ class VideoFrameError(Exception): class AbstractVideoProvider(ABC): """Abstract base class for video providers managing video capture resources.""" - def __init__(self, dev_name: str = "NA", pool_scheduler: Optional[ThreadPoolScheduler] = None) -> None: + def __init__( + self, dev_name: str = "NA", pool_scheduler: Optional[ThreadPoolScheduler] = None + ) -> None: """Initializes the video provider with a device name. Args: diff --git a/dimos/stream/video_providers/unitree.py b/dimos/stream/video_providers/unitree.py index f3ec328435..e91351a229 100644 --- a/dimos/stream/video_providers/unitree.py +++ b/dimos/stream/video_providers/unitree.py @@ -18,7 +18,9 @@ from dimos.robot.unitree.external.go2_webrtc_connect.go2_webrtc_driver.constants import ( WebRTCConnectionMethod, ) -from dimos.robot.unitree.external.go2_webrtc_connect.go2_webrtc_driver.webrtc_driver import Go2WebRTCConnection +from dimos.robot.unitree.external.go2_webrtc_connect.go2_webrtc_driver.webrtc_driver import ( + Go2WebRTCConnection, +) from aiortc import MediaStreamTrack import asyncio from reactivex import Observable, create, operators as ops @@ -55,7 +57,9 @@ def __init__( elif ip: self.conn = Go2WebRTCConnection(connection_method, ip=ip) else: - raise ValueError("Either serial_number or ip must be provided for LocalSTA connection") + raise ValueError( + "Either serial_number or ip must be provided for LocalSTA connection" + ) elif connection_method == WebRTCConnectionMethod.LocalAP: self.conn = Go2WebRTCConnection(connection_method) else: @@ -121,7 +125,9 @@ def emit_frames(observer, scheduler): # Start asyncio loop if not already running if not self.loop: self.loop = asyncio.new_event_loop() - self.asyncio_thread = threading.Thread(target=self._run_asyncio_loop, args=(self.loop,)) + self.asyncio_thread = threading.Thread( + target=self._run_asyncio_loop, args=(self.loop,) + ) self.asyncio_thread.start() frame_time = time.monotonic() diff --git a/dimos/types/costmap.py b/dimos/types/costmap.py index de80b2d6a6..177733ff03 100644 --- a/dimos/types/costmap.py +++ b/dimos/types/costmap.py @@ -124,7 +124,9 @@ def from_pickle(cls, pickle_path: str) -> "Costmap": return costmap @classmethod - def create_empty(cls, width: int = 100, height: int = 100, resolution: float = 0.1) -> "Costmap": + def create_empty( + cls, width: int = 100, height: int = 100, resolution: float = 0.1 + ) -> "Costmap": """Create an empty costmap with specified dimensions.""" return cls( grid=np.zeros((height, width), dtype=np.int8), @@ -241,7 +243,9 @@ def smudge( for i in range(iterations): # Dilate the binary map - dilated = ndimage.binary_dilation(dilated_map > 0, structure=kernel, iterations=1).astype(np.uint8) + dilated = ndimage.binary_dilation( + dilated_map > 0, structure=kernel, iterations=1 + ).astype(np.uint8) # Calculate the new layer (cells that were just added in this iteration) new_layer = (dilated - (dilated_map > 0).astype(np.uint8)) * 100 @@ -321,6 +325,8 @@ def __str__(self) -> str: print(costmap) # Create a smudged version of the costmap for better planning - smudged_costmap = costmap.smudge(kernel_size=10, iterations=10, threshold=80, preserve_unknown=False) + smudged_costmap = costmap.smudge( + kernel_size=10, iterations=10, threshold=80, preserve_unknown=False + ) print(costmap) diff --git a/dimos/types/path.py b/dimos/types/path.py index 2e20924f4d..c87658182f 100644 --- a/dimos/types/path.py +++ b/dimos/types/path.py @@ -134,7 +134,9 @@ def remove(self, index: int) -> np.ndarray: def clear(self) -> None: """Remove all points from the path.""" - self._points = np.zeros((0, self._points.shape[1] if len(self._points) > 0 else 0), dtype=float) + self._points = np.zeros( + (0, self._points.shape[1] if len(self._points) > 0 else 0), dtype=float + ) def length(self) -> float: """Calculate the total length of the path. diff --git a/dimos/types/position.py b/dimos/types/position.py index 07c6813c75..37515d3990 100644 --- a/dimos/types/position.py +++ b/dimos/types/position.py @@ -63,7 +63,9 @@ def __eq__(self, other) -> bool: """Check if two positions are equal using numpy's allclose for floating point comparison.""" if not isinstance(other, Position): return False - return np.allclose(self.pos._data, other.pos._data) and np.allclose(self.rot._data, other.rot._data) + return np.allclose(self.pos._data, other.pos._data) and np.allclose( + self.rot._data, other.rot._data + ) @property def rot(self) -> Vector: diff --git a/dimos/types/robot_location.py b/dimos/types/robot_location.py index 97fb8a79cf..c69d131a04 100644 --- a/dimos/types/robot_location.py +++ b/dimos/types/robot_location.py @@ -96,8 +96,16 @@ def from_vector_metadata(cls, metadata: Dict[str, Any]) -> "RobotLocation": """ return cls( name=metadata.get("location_name", "unknown"), - position=(metadata.get("pos_x", 0.0), metadata.get("pos_y", 0.0), metadata.get("pos_z", 0.0)), - rotation=(metadata.get("rot_x", 0.0), metadata.get("rot_y", 0.0), metadata.get("rot_z", 0.0)), + position=( + metadata.get("pos_x", 0.0), + metadata.get("pos_y", 0.0), + metadata.get("pos_z", 0.0), + ), + rotation=( + metadata.get("rot_x", 0.0), + metadata.get("rot_y", 0.0), + metadata.get("rot_z", 0.0), + ), frame_id=metadata.get("frame_id"), timestamp=metadata.get("timestamp", time.time()), location_id=metadata.get("location_id", f"loc_{uuid.uuid4().hex[:8]}"), diff --git a/dimos/types/sample.py b/dimos/types/sample.py index 58a0da381b..5665f7a640 100644 --- a/dimos/types/sample.py +++ b/dimos/types/sample.py @@ -196,7 +196,9 @@ def flatten_recursive(obj, path=""): flatten_recursive(self) accumulator = accumulator.values() if output_type == "dict" else accumulator - if non_numerical == "forbid" and any(not isinstance(v, int | float | bool) for v in accumulator): + if non_numerical == "forbid" and any( + not isinstance(v, int | float | bool) for v in accumulator + ): raise ValueError("Non-numerical values found in flattened data.") if output_type == "np": return np.array(accumulator) @@ -216,7 +218,10 @@ def obj_to_schema(value: Any) -> Dict: dict: A simplified JSON schema representing the structure of the dictionary. """ if isinstance(value, dict): - return {"type": "object", "properties": {k: Sample.obj_to_schema(v) for k, v in value.items()}} + return { + "type": "object", + "properties": {k: Sample.obj_to_schema(v) for k, v in value.items()}, + } if isinstance(value, list | tuple | np.ndarray): if len(value) > 0: return {"type": "array", "items": Sample.obj_to_schema(value[0])} @@ -260,7 +265,9 @@ def schema(self, resolve_refs: bool = True, include_descriptions=False) -> Dict: if key not in properties: properties[key] = Sample.obj_to_schema(value) if isinstance(value, Sample): - properties[key] = value.schema(resolve_refs=resolve_refs, include_descriptions=include_descriptions) + properties[key] = value.schema( + resolve_refs=resolve_refs, include_descriptions=include_descriptions + ) else: properties[key] = Sample.obj_to_schema(value) return schema @@ -497,7 +504,9 @@ def unpack(self, to_dicts=False) -> List[Union["Sample", Dict]]: return [] # Ensure all attributes are lists and have the same length - list_sizes = {len(getattr(self, attr)) for attr in attributes if isinstance(getattr(self, attr), list)} + list_sizes = { + len(getattr(self, attr)) for attr in attributes if isinstance(getattr(self, attr), list) + } if len(list_sizes) != 1: raise ValueError("Not all attribute lists have the same length.") list_size = list_sizes.pop() @@ -505,7 +514,10 @@ def unpack(self, to_dicts=False) -> List[Union["Sample", Dict]]: if to_dicts: return [{key: getattr(self, key)[i] for key in attributes} for i in range(list_size)] - return [self.__class__(**{key: getattr(self, key)[i] for key in attributes}) for i in range(list_size)] + return [ + self.__class__(**{key: getattr(self, key)[i] for key in attributes}) + for i in range(list_size) + ] @classmethod def default_space(cls) -> spaces.Dict: @@ -543,7 +555,9 @@ def space(self) -> spaces.Dict: logging.debug("Generating space for key: '%s', value: %s", key, value) info = self.model_field_info(key) value = getattr(self, key) if hasattr(self, key) else value # noqa: PLW2901 - space_dict[key] = value.space() if isinstance(value, Sample) else self.space_for(value, info=info) + space_dict[key] = ( + value.space() if isinstance(value, Sample) else self.space_for(value, info=info) + ) return spaces.Dict(space_dict) def random_sample(self) -> "Sample": diff --git a/dimos/types/vector.py b/dimos/types/vector.py index 97544d316e..eb43c04945 100644 --- a/dimos/types/vector.py +++ b/dimos/types/vector.py @@ -204,7 +204,8 @@ def angle(self, other) -> float: other_data = np.array(other, dtype=float) cos_angle = np.clip( - np.dot(self._data, other_data) / (np.linalg.norm(self._data) * np.linalg.norm(other_data)), + np.dot(self._data, other_data) + / (np.linalg.norm(self._data) * np.linalg.norm(other_data)), -1.0, 1.0, ) diff --git a/dimos/utils/extract_frames.py b/dimos/utils/extract_frames.py index e10e12c180..ddff12f189 100644 --- a/dimos/utils/extract_frames.py +++ b/dimos/utils/extract_frames.py @@ -67,9 +67,14 @@ def extract_frames(video_path, output_dir, frame_rate): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Extract frames from a video file.") parser.add_argument("video_path", type=str, help="Path to the input .mov or .mp4 video file.") - parser.add_argument("--output_dir", type=str, default="frames", help="Directory to save extracted frames.") parser.add_argument( - "--frame_rate", type=float, default=1.0, help="Frame rate at which to extract frames (frames per second)." + "--output_dir", type=str, default="frames", help="Directory to save extracted frames." + ) + parser.add_argument( + "--frame_rate", + type=float, + default=1.0, + help="Frame rate at which to extract frames (frames per second).", ) args = parser.parse_args() diff --git a/dimos/utils/logging_config.py b/dimos/utils/logging_config.py index 66877658ef..a1e1a25ca4 100644 --- a/dimos/utils/logging_config.py +++ b/dimos/utils/logging_config.py @@ -25,7 +25,9 @@ logging.basicConfig(format="%(name)s - %(levelname)s - %(message)s") -def setup_logger(name: str, level: Optional[int] = None, log_format: Optional[str] = None) -> logging.Logger: +def setup_logger( + name: str, level: Optional[int] = None, log_format: Optional[str] = None +) -> logging.Logger: """Set up a logger with color output. Args: diff --git a/dimos/utils/reactive.py b/dimos/utils/reactive.py index 0a609dd23e..4836e18987 100644 --- a/dimos/utils/reactive.py +++ b/dimos/utils/reactive.py @@ -71,7 +71,9 @@ def getter_ondemand(observable: Observable[T], timeout: Optional[float] = 30.0) def getter(): try: # Wait for first value with optional timeout - value = observable.pipe(ops.first(), *([ops.timeout(timeout)] if timeout is not None else [])).run() + value = observable.pipe( + ops.first(), *([ops.timeout(timeout)] if timeout is not None else []) + ).run() return value except Exception as e: raise Exception(f"No value received after {timeout} seconds") from e diff --git a/dimos/utils/simple_controller.py b/dimos/utils/simple_controller.py index 632a70cea4..f30155c419 100644 --- a/dimos/utils/simple_controller.py +++ b/dimos/utils/simple_controller.py @@ -122,7 +122,9 @@ def __init__(self, distance_pid_params, angle_pid_params): self.angle_pid = PIDController(*angle_pid_params) self.prev_measured_angle = 0.0 # Used for angular feed-forward damping - def compute_control(self, measured_distance, measured_angle, desired_distance, desired_angle, dt): + def compute_control( + self, measured_distance, measured_angle, desired_distance, desired_angle, dt + ): """ Compute the forward (x) and angular (z) commands. diff --git a/dimos/utils/test_reactive.py b/dimos/utils/test_reactive.py index 977863826a..a5c5e87527 100644 --- a/dimos/utils/test_reactive.py +++ b/dimos/utils/test_reactive.py @@ -5,7 +5,12 @@ from reactivex import operators as ops from typing import Callable, TypeVar, Any from reactivex.disposable import Disposable -from dimos.utils.reactive import backpressure, getter_streaming, getter_ondemand, callback_to_observable +from dimos.utils.reactive import ( + backpressure, + getter_streaming, + getter_ondemand, + callback_to_observable, +) def measure_time(func: Callable[[], Any], iterations: int = 1) -> float: @@ -16,7 +21,9 @@ def measure_time(func: Callable[[], Any], iterations: int = 1) -> float: return result, total_time -def assert_time(func: Callable[[], Any], assertion: Callable[[int], bool], assert_fail_msg=None) -> None: +def assert_time( + func: Callable[[], Any], assertion: Callable[[int], bool], assert_fail_msg=None +) -> None: [result, total_time] = measure_time(func) assert assertion(total_time), assert_fail_msg + f", took {round(total_time, 2)}s" return result @@ -56,7 +63,9 @@ def test_backpressure_handling(): received_fast = [] received_slow = [] # Create an observable that emits numpy arrays instead of integers - source = dispose_spy(rx.interval(0.1).pipe(ops.map(lambda i: np.array([i, i + 1, i + 2])), ops.take(50))) + source = dispose_spy( + rx.interval(0.1).pipe(ops.map(lambda i: np.array([i, i + 1, i + 2])), ops.take(50)) + ) # Wrap with backpressure handling safe_source = backpressure(source) @@ -80,10 +89,14 @@ def test_backpressure_handling(): print("Slow observer received:", len(received_slow), [arr[0] for arr in received_slow]) # Fast observer should get all or nearly all items - assert len(received_fast) > 15, f"Expected fast observer to receive most items, got {len(received_fast)}" + assert len(received_fast) > 15, ( + f"Expected fast observer to receive most items, got {len(received_fast)}" + ) # Slow observer should get fewer items due to backpressure handling - assert len(received_slow) < len(received_fast), "Slow observer should receive fewer items than fast observer" + assert len(received_slow) < len(received_fast), ( + "Slow observer should receive fewer items than fast observer" + ) # Specifically, processing at 0.25s means ~4 items per second, so expect 8-10 items assert 7 <= len(received_slow) <= 11, f"Expected 7-11 items, got {len(received_slow)}" @@ -98,11 +111,19 @@ def test_backpressure_handling(): def test_getter_streaming_blocking(): - source = dispose_spy(rx.interval(0.2).pipe(ops.map(lambda i: np.array([i, i + 1, i + 2])), ops.take(50))) + source = dispose_spy( + rx.interval(0.2).pipe(ops.map(lambda i: np.array([i, i + 1, i + 2])), ops.take(50)) + ) assert source.is_disposed() - getter = min_time(lambda: getter_streaming(source), 0.2, "Latest getter needs to block until first msg is ready") - assert np.array_equal(getter(), np.array([0, 1, 2])), f"Expected to get the first array [0,1,2], got {getter()}" + getter = min_time( + lambda: getter_streaming(source), + 0.2, + "Latest getter needs to block until first msg is ready", + ) + assert np.array_equal(getter(), np.array([0, 1, 2])), ( + f"Expected to get the first array [0,1,2], got {getter()}" + ) time.sleep(0.5) assert getter()[0] >= 2, f"Expected array with first value >= 2, got {getter()}" @@ -125,7 +146,9 @@ def test_getter_streaming_nonblocking(): source = dispose_spy(rx.interval(0.2).pipe(ops.take(50))) getter = max_time( - lambda: getter_streaming(source, nonblocking=True), 0.1, "nonblocking getter init shouldn't block" + lambda: getter_streaming(source, nonblocking=True), + 0.1, + "nonblocking getter init shouldn't block", ) min_time(getter, 0.2, "Expected for first value call to block if cache is empty") assert getter() == 0 diff --git a/dimos/web/dimos_interface/api/server.py b/dimos/web/dimos_interface/api/server.py index f231384074..812973e300 100644 --- a/dimos/web/dimos_interface/api/server.py +++ b/dimos/web/dimos_interface/api/server.py @@ -91,7 +91,9 @@ def __init__( for key in self.streams: if self.streams[key] is not None: - self.active_streams[key] = self.streams[key].pipe(ops.map(self.process_frame_fastapi), ops.share()) + self.active_streams[key] = self.streams[key].pipe( + ops.map(self.process_frame_fastapi), ops.share() + ) # Set up text stream subscriptions for key, stream in self.text_streams.items(): @@ -215,7 +217,11 @@ async def index(request: Request): text_stream_keys = list(self.text_streams.keys()) return self.templates.TemplateResponse( "index_fastapi.html", - {"request": request, "stream_keys": stream_keys, "text_stream_keys": text_stream_keys}, + { + "request": request, + "stream_keys": stream_keys, + "text_stream_keys": text_stream_keys, + }, ) @self.app.post("/submit_query") @@ -229,7 +235,10 @@ async def submit_query(query: str = Form(...)): return JSONResponse({"success": False, "message": "No query provided"}) except Exception as e: # Ensure we always return valid JSON even on error - return JSONResponse(status_code=500, content={"success": False, "message": f"Server error: {str(e)}"}) + return JSONResponse( + status_code=500, + content={"success": False, "message": f"Server error: {str(e)}"}, + ) # Unitree API endpoints @self.app.get("/unitree/status") @@ -247,13 +256,18 @@ async def unitree_command(request: Request): # Emit the command through the query_subject self.query_subject.on_next(command_text) - response = {"success": True, "command": command_text, "result": f"Processed command: {command_text}"} + response = { + "success": True, + "command": command_text, + "result": f"Processed command: {command_text}", + } return JSONResponse(response) except Exception as e: print(f"Error processing command: {str(e)}") return JSONResponse( - status_code=500, content={"success": False, "message": f"Error processing command: {str(e)}"} + status_code=500, + content={"success": False, "message": f"Error processing command: {str(e)}"}, ) @self.app.get("/text_stream/{key}") @@ -267,7 +281,9 @@ async def text_stream(key: str): def run(self): """Run the FastAPI server.""" - uvicorn.run(self.app, host=self.host, port=self.port) # TODO: Translate structure to enable in-built workers' + uvicorn.run( + self.app, host=self.host, port=self.port + ) # TODO: Translate structure to enable in-built workers' if __name__ == "__main__": diff --git a/dimos/web/fastapi_server.py b/dimos/web/fastapi_server.py index ddf3ebdb1d..7dcd0f6d73 100644 --- a/dimos/web/fastapi_server.py +++ b/dimos/web/fastapi_server.py @@ -76,7 +76,9 @@ def __init__( for key in self.streams: if self.streams[key] is not None: - self.active_streams[key] = self.streams[key].pipe(ops.map(self.process_frame_fastapi), ops.share()) + self.active_streams[key] = self.streams[key].pipe( + ops.map(self.process_frame_fastapi), ops.share() + ) # Set up text stream subscriptions for key, stream in self.text_streams.items(): @@ -183,7 +185,11 @@ async def index(request: Request): text_stream_keys = list(self.text_streams.keys()) return self.templates.TemplateResponse( "index_fastapi.html", - {"request": request, "stream_keys": stream_keys, "text_stream_keys": text_stream_keys}, + { + "request": request, + "stream_keys": stream_keys, + "text_stream_keys": text_stream_keys, + }, ) @self.app.post("/submit_query") @@ -197,7 +203,10 @@ async def submit_query(query: str = Form(...)): return JSONResponse({"success": False, "message": "No query provided"}) except Exception as e: # Ensure we always return valid JSON even on error - return JSONResponse(status_code=500, content={"success": False, "message": f"Server error: {str(e)}"}) + return JSONResponse( + status_code=500, + content={"success": False, "message": f"Server error: {str(e)}"}, + ) @self.app.get("/text_stream/{key}") async def text_stream(key: str): @@ -210,4 +219,6 @@ async def text_stream(key: str): def run(self): """Run the FastAPI server.""" - uvicorn.run(self.app, host=self.host, port=self.port) # TODO: Translate structure to enable in-built workers' + uvicorn.run( + self.app, host=self.host, port=self.port + ) # TODO: Translate structure to enable in-built workers' diff --git a/dimos/web/flask_server.py b/dimos/web/flask_server.py index 3cc48e11dd..01d79f63cd 100644 --- a/dimos/web/flask_server.py +++ b/dimos/web/flask_server.py @@ -33,7 +33,9 @@ def __init__(self, dev_name="Flask Server", edge_type="Bidirectional", port=5555 for key in self.streams: if self.streams[key] is not None: # Apply share and ref_count to manage subscriptions - self.active_streams[key] = self.streams[key].pipe(ops.map(self.process_frame_flask), ops.share()) + self.active_streams[key] = self.streams[key].pipe( + ops.map(self.process_frame_flask), ops.share() + ) self.setup_routes() @@ -75,14 +77,18 @@ def generate(): def make_response_generator(key): def response_generator(): - return Response(stream_generator(key)(), mimetype="multipart/x-mixed-replace; boundary=frame") + return Response( + stream_generator(key)(), mimetype="multipart/x-mixed-replace; boundary=frame" + ) return response_generator # Dynamically adding routes using add_url_rule for key in self.streams: endpoint = f"video_feed_{key}" - self.app.add_url_rule(f"/video_feed/{key}", endpoint, view_func=make_response_generator(key)) + self.app.add_url_rule( + f"/video_feed/{key}", endpoint, view_func=make_response_generator(key) + ) def run(self, host="0.0.0.0", port=5555, threaded=True): self.port = port diff --git a/dimos/web/robot_web_interface.py b/dimos/web/robot_web_interface.py index 3d6fc52583..72dcce9d29 100644 --- a/dimos/web/robot_web_interface.py +++ b/dimos/web/robot_web_interface.py @@ -17,7 +17,6 @@ Provides a clean interface to the dimensional-interface FastAPI server. """ - from dimos.web.dimos_interface.api.server import FastAPIServer From 373f6ce917fbb239d55a025c2b1ed0e97252dbf0 Mon Sep 17 00:00:00 2001 From: lesh Date: Tue, 27 May 2025 16:24:01 +0300 Subject: [PATCH 2/2] pyproject.toml update --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e9804cf212..bbee605a7f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ version = "0.0.2" description = "Powering agentive generalist robotics" [tool.ruff] -line-length = 120 +line-length = 100 exclude = [ ".git", ".pytest_cache",