zz990099 · zz990099 · Mar 25, 2025 · Mar 24, 2025 · Mar 24, 2025
diff --git a/detection_6d_foundationpose/src/foundationpose_render.cpp b/detection_6d_foundationpose/src/foundationpose_render.cpp
@@ -270,6 +270,8 @@ FoundationPoseRenderer::PrepareBuffer()
   // nvdiffrast render 用到的缓存以及渲染器
   size_t pose_clip_size =  num_vertices_ * (kVertexPoints + 1) * input_poses_num_ * sizeof(float);
   size_t pts_cam_size = num_vertices_ * kVertexPoints * input_poses_num_ * sizeof(float);
+  size_t diffuse_intensity_size = num_vertices_ * input_poses_num_ * sizeof(float);
+  size_t diffuse_intensity_map_size = input_poses_num_ * crop_window_H_ * crop_window_W_ * sizeof(float);
   size_t rast_out_size = input_poses_num_ * crop_window_H_ * crop_window_W_ * (kVertexPoints + 1) * sizeof(float);
   size_t color_size = input_poses_num_ * crop_window_H_ * crop_window_W_ * kNumChannels * sizeof(float);
   size_t xyz_map_size = input_poses_num_ * crop_window_H_ * crop_window_W_ * kNumChannels * sizeof(float);
@@ -279,6 +281,8 @@ FoundationPoseRenderer::PrepareBuffer()
   float* _pose_clip_device;
   float* _rast_out_device;
   float* _pts_cam_device;
+  float* _diffuse_intensity_device;
+  float* _diffuse_intensity_map_device;
   float* _texcoords_out_device;
   float* _color_device;
   float* _xyz_map_device;
@@ -307,6 +311,14 @@ FoundationPoseRenderer::PrepareBuffer()
             "[FoundationPoseRenderer] cudaMalloc `_pts_cam_device` FAILED!!!");
   pts_cam_device_ = DeviceBufferUniquePtrType<float>(_pts_cam_device, CudaMemoryDeleter<float>());
 
+  CHECK_CUDA(cudaMalloc(&_diffuse_intensity_device, diffuse_intensity_size),
+            "[FoundationPoseRenderer] cudaMalloc `_diffuse_intensity_device` FAILED!!!");
+  diffuse_intensity_device_ = DeviceBufferUniquePtrType<float>(_diffuse_intensity_device, CudaMemoryDeleter<float>());
+
+  CHECK_CUDA(cudaMalloc(&_diffuse_intensity_map_device, diffuse_intensity_map_size),
+            "[FoundationPoseRenderer] cudaMalloc `_diffuse_intensity_map_device` FAILED!!!");
+  diffuse_intensity_map_device_ = DeviceBufferUniquePtrType<float>(_diffuse_intensity_map_device, CudaMemoryDeleter<float>());
+
   CHECK_CUDA(cudaMalloc(&_texcoords_out_device, texcoords_out_size),
             "[FoundationPoseRenderer] cudaMalloc `_texcoords_out_device` FAILED!!!");
   texcoords_out_device_ = DeviceBufferUniquePtrType<float>(_texcoords_out_device, CudaMemoryDeleter<float>());
@@ -361,18 +373,24 @@ FoundationPoseRenderer::LoadTexturedMesh()
 {
   const auto& mesh_model_center = mesh_loader_->GetMeshModelCenter();  
   const auto& mesh_vertices = mesh_loader_->GetMeshVertices();
+  const auto& mesh_vertex_normals = mesh_loader_->GetMeshVertexNormals();
   const auto& mesh_texcoords = mesh_loader_->GetMeshTextureCoords();
   const auto& mesh_faces = mesh_loader_->GetMeshFaces();
   const auto& rgb_texture_map = mesh_loader_->GetTextureMap();
   mesh_diameter_ = mesh_loader_->GetMeshDiameter();
 
+  std::vector<float> vertex_normals;
 
   // Walk through each of the mesh's vertices
   for (unsigned int v = 0; v < mesh_vertices.size(); v++) {
     vertices_.push_back(mesh_vertices[v].x - mesh_model_center[0]);
     vertices_.push_back(mesh_vertices[v].y - mesh_model_center[1]);
     vertices_.push_back(mesh_vertices[v].z - mesh_model_center[2]);
 
+    vertex_normals.push_back(mesh_vertex_normals[v].x);
+    vertex_normals.push_back(mesh_vertex_normals[v].y);
+    vertex_normals.push_back(mesh_vertex_normals[v].z);
+
     // Check if the mesh has texture coordinates
     if (mesh_texcoords.size() >= 1) {
       texcoords_.push_back(mesh_texcoords[0][v].x);
@@ -422,6 +440,7 @@ FoundationPoseRenderer::LoadTexturedMesh()
   size_t texcoords_size = texcoords_.size() * sizeof(float);
 
   float* _vertices_device;
+  float* _vertex_normals_device;
   float* _texcoords_device;
   int32_t* _mesh_faces_device;
   uint8_t* _texture_map_device;
@@ -430,6 +449,10 @@ FoundationPoseRenderer::LoadTexturedMesh()
             "[FoundationposeRender] cudaMalloc `mesh_faces_device` FAILED!!!");
   vertices_device_ = DeviceBufferUniquePtrType<float>(_vertices_device, CudaMemoryDeleter<float>());
 
+  CHECK_CUDA(cudaMalloc(&_vertex_normals_device, vertices_size),
+            "[FoundationposeRender] cudaMalloc `vertex_normals_device` FAILED!!!");
+  vertex_normals_device_ = DeviceBufferUniquePtrType<float>(_vertex_normals_device, CudaMemoryDeleter<float>());
+
   CHECK_CUDA(cudaMalloc(&_mesh_faces_device, faces_size),
             "[FoundationposeRender] cudaMalloc `mesh_faces_device` FAILED!!!");
   mesh_faces_device_ = DeviceBufferUniquePtrType<int32_t>(_mesh_faces_device, CudaMemoryDeleter<int32_t>());
@@ -442,9 +465,14 @@ FoundationPoseRenderer::LoadTexturedMesh()
             "[FoundationposeRender] cudaMalloc `texture_map_device_` FAILED!!!");
   texture_map_device_ = DeviceBufferUniquePtrType<uint8_t>(_texture_map_device, CudaMemoryDeleter<uint8_t>());
 
-  CHECK_CUDA(cudaMemcpy(vertices_device_.get(), 
-                        vertices_.data(), 
-                        vertices_size, 
+  CHECK_CUDA(cudaMemcpy(vertices_device_.get(),
+                        vertices_.data(),
+                        vertices_size,
+                        cudaMemcpyHostToDevice),
+            "[FoundationposeRender] cudaMemcpy mesh_faces_host -> mesh_faces_device FAILED!!!");
+  CHECK_CUDA(cudaMemcpy(vertex_normals_device_.get(),
+                        vertex_normals.data(),
+                        vertices_size,
                         cudaMemcpyHostToDevice),
             "[FoundationposeRender] cudaMemcpy mesh_faces_host -> mesh_faces_device FAILED!!!");
   CHECK_CUDA(cudaMemcpy(mesh_faces_device_.get(), 
@@ -514,6 +542,36 @@ bool FoundationPoseRenderer::TransformVerticesOnCUDA(cudaStream_t stream,
   return true;
 }
 
+bool FoundationPoseRenderer::TransformVertexNormalsOnCUDA(cudaStream_t stream,
+                          const std::vector<Eigen::MatrixXf>& tfs,
+                          float* output_buffer)
+{
+  // Get the dimensions of the inputs
+  int tfs_size = tfs.size();
+  CHECK_STATE(tfs_size != 0,
+        "[FoundationposeRender] The transfomation matrix is empty! ");
+
+  CHECK_STATE(tfs[0].cols() == tfs[0].rows(),
+        "[FoundationposeRender] The transfomation matrix has different rows and cols! ");
+
+  const int total_elements = tfs[0].cols() * tfs[0].rows();
+
+  float* transform_device_buffer_ = nullptr;
+  cudaMallocAsync(&transform_device_buffer_, tfs_size * total_elements * sizeof(float), stream);
+
+  for (int i = 0 ; i < tfs_size ; ++ i) {
+    cudaMemcpyAsync(transform_device_buffer_ + i * total_elements, 
+                    tfs[i].data(), 
+                    total_elements * sizeof(float), 
+                    cudaMemcpyHostToDevice, 
+                    stream);
+  }
+
+  foundationpose_render::transform_normals(stream, transform_device_buffer_, tfs_size, vertex_normals_device_.get(), num_vertices_, output_buffer);
+
+  cudaFreeAsync(transform_device_buffer_, stream);
+  return true;
+}
 
 bool FoundationPoseRenderer::GeneratePoseClipOnCUDA(cudaStream_t stream,
                       float* output_buffer,
@@ -595,15 +653,15 @@ FoundationPoseRenderer::NvdiffrastRender(cudaStream_t cuda_stream,
   foundationpose_render::interpolate(
       cuda_stream,
       pts_cam_device_.get(), rast_out_device_.get(), mesh_faces_device_.get(), xyz_map_device_.get(),
-      num_vertices_, num_faces_, kVertexPoints,
+      num_vertices_, num_faces_, 3, kVertexPoints,
       H, W, N);
   CHECK_CUDA(cudaGetLastError(),
             "[FoundationPoseRenderer] interpolate failed!!!");
 
   foundationpose_render::interpolate(
       cuda_stream,
       texcoords_device_.get(), rast_out_device_.get(), mesh_faces_device_.get(), texcoords_out_device_.get(),
-      num_vertices_, num_faces_, kTexcoordsDim,
+      num_vertices_, num_faces_, 2, kTexcoordsDim,
       H, W, N);
   CHECK_CUDA(cudaGetLastError(),
             "[FoundationPoseRenderer] interpolate failed!!!");
@@ -619,6 +677,26 @@ FoundationPoseRenderer::NvdiffrastRender(cudaStream_t cuda_stream,
   CHECK_CUDA(cudaGetLastError(),
             "[FoundationPoseRenderer] texture failed!!!");
 
+  CHECK_STATE(TransformVertexNormalsOnCUDA(cuda_stream, poses, diffuse_intensity_device_.get()), 
+            "[FoundationPoseRenderer] Transform vertex normals failed!!!");
+
+  foundationpose_render::interpolate(cuda_stream, 
+                                     diffuse_intensity_device_.get(), 
+                                     rast_out_device_.get(), 
+                                     mesh_faces_device_.get(),
+                                     diffuse_intensity_map_device_.get(),
+                                     num_vertices_, num_faces_, 3, 1, H, W, N);
+  CHECK_CUDA(cudaGetLastError(),
+            "[FoundationPoseRenderer] interpolate failed!!!");
+
+  foundationpose_render::refine_color(cuda_stream, color_device_.get(), 
+                                      diffuse_intensity_map_device_.get(), 
+                                      rast_out_device_.get(),
+                                      color_device_.get(), 
+                                      poses.size(), 0.8, 0.5, H, W);
+  CHECK_CUDA(cudaGetLastError(),
+            "[FoundationPoseRenderer] refine_color failed!!!");
+
   float min_value = 0.0;
   float max_value = 1.0;
   foundationpose_render::clamp(cuda_stream, color_device_.get(), min_value, max_value, N * H * W * kNumChannels);

diff --git a/detection_6d_foundationpose/src/foundationpose_render.cu b/detection_6d_foundationpose/src/foundationpose_render.cu
@@ -196,8 +196,8 @@ void rasterize(
 
 void interpolate(
     cudaStream_t stream, float* attr_ptr, float* rast_ptr, int32_t* tri_ptr, float* out, int num_vertices,
-    int num_triangles, int attr_dim, int H, int W, int C) {
-  int instance_mode = attr_dim > 2 ? 1 : 0;
+    int num_triangles, int attr_shape_dim, int attr_dim, int H, int W, int C) {
+  int instance_mode = attr_shape_dim > 2 ? 1 : 0;
 
   InterpolateKernelParams p = {};  // Initialize all fields to zero.
   p.instance_mode = instance_mode;
@@ -339,4 +339,88 @@ void generate_pose_clip(cudaStream_t stream, const float* transform_matrixs, con
       transform_matrixs, bbox2d_matrix, M, points_vectors, N, transformed_points_vectors, rgb_H, rgb_W);
 }
 
+
+__global__ void transform_normals_kernel(
+    const float* transform_matrixs, int M, const float* normals_vectors, 
+    int N, float* transformed_normal_vectors)
+{
+  int row_idx = threadIdx.y + blockIdx.y * blockDim.y;
+  int col_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (row_idx >= M || col_idx >= N) return;
+
+  const float* matrix = transform_matrixs + row_idx * 16;  // 指向当前 4x4 变换矩阵
+  const float* normal = normals_vectors + col_idx * 3;       // 指向当前 normal 向量
+  float* transformed_normal = transformed_normal_vectors + (row_idx * N + col_idx);
+
+  float x = normal[0], y = normal[1], z = normal[2];
+  // **Column-Major 访问方式**
+  float tx = matrix[0] * x + matrix[4] * y + matrix[8]  * z;
+  float ty = matrix[1] * x + matrix[5] * y + matrix[9]  * z;
+  float tz = matrix[2] * x + matrix[6] * y + matrix[10] * z;
+  // 只保留z方向的分量，取反
+  float l2 = sqrt(tx*tx + ty*ty + tz*tz);
+  float value = l2 == 0 ? 0 : - tz / l2;
+  value = clamp_func(value, 0, 1);
+  transformed_normal[0] = value;
+}
+
+void transform_normals(cudaStream_t stream, const float* transform_matrixs, int M, const float* normals_vectors, 
+    int N, float* transformed_normal_vectors)
+{
+  dim3 blockSize = {32, 32};
+  dim3 gridSize = {ceil_div(N, 32), ceil_div(M, 32)};
+
+  transform_normals_kernel<<<gridSize, blockSize, 0, stream>>>(
+      transform_matrixs, M, normals_vectors, N, transformed_normal_vectors);
+}
+
+
+__global__ void renfine_color_kernel(
+    const float* color, const float* diffuse_intensity_map, const float* rast_out, float* output, int poses_num, float w_ambient, 
+    float w_diffuse, int rgb_H, int rgb_W)
+{
+  int row_idx = threadIdx.y + blockIdx.y * blockDim.y;
+  int col_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (row_idx >= rgb_H || col_idx >= rgb_W * poses_num) return;
+
+  const int color_idx = col_idx / rgb_W;
+  const int color_row_idx = row_idx;
+  const int color_col_idx = col_idx - color_idx * rgb_W;
+
+  const size_t pixel_idx = color_row_idx * rgb_W + color_col_idx;
+  const size_t pixel_offset = color_idx * rgb_H * rgb_W + pixel_idx;
+
+  const float* rgb = color + pixel_offset * 3;
+  const float* diffuse = diffuse_intensity_map + pixel_offset;
+  const float* rast = rast_out + pixel_offset * 4;
+  float* out = output + pixel_offset * 3;
+
+  float diff = diffuse[0];
+
+  float is_foreground = clamp_func(rast[3], 0, 1);
+
+  float r = rgb[0] * (w_ambient + diff*w_diffuse) * is_foreground;
+  float g = rgb[1] * (w_ambient + diff*w_diffuse) * is_foreground;
+  float b = rgb[2] * (w_ambient + diff*w_diffuse) * is_foreground;
+
+  r = clamp_func(r, 0, 1);
+  g = clamp_func(g, 0, 1);
+  b = clamp_func(b, 0, 1);
+
+  out[0] = r;
+  out[1] = g;
+  out[2] = b;
+}
+
+void refine_color(cudaStream_t stream, const float* color, const float* diffuse_intensity_map, const float* rast_out, float* output,
+        int poses_num, float w_ambient, float w_diffuse, int rgb_H, int rgb_W)
+{
+  dim3 blockSize = {32, 32};
+  dim3 gridSize = {ceil_div(rgb_W * poses_num, 32), ceil_div(rgb_H, 32)};
+
+  renfine_color_kernel<<<gridSize, blockSize, 0, stream>>>(
+    color, diffuse_intensity_map, rast_out, output, poses_num, w_ambient, w_diffuse, rgb_H, rgb_W
+  );
+}
+
 }   // namespace foundationpose_render
diff --git a/detection_6d_foundationpose/src/foundationpose_render.cu.hpp b/detection_6d_foundationpose/src/foundationpose_render.cu.hpp
@@ -48,7 +48,7 @@ void rasterize(
 
 void interpolate(
     cudaStream_t stream, float* attr_ptr, float* rast_ptr, int32_t* tri_ptr, float* out, int num_vertices,
-    int num_triangles, int attr_dim, int H, int W, int C);
+    int num_triangles, int attr_shape_dim, int attr_dim, int H, int W, int C);
 
 void texture(
     cudaStream_t stream, float* tex_ptr, float* uv_ptr, float* out, int tex_height, int tex_width, int tex_channel,
@@ -71,6 +71,17 @@ void transform_points(cudaStream_t stream, const float* transform_matrixs, int t
 void generate_pose_clip(cudaStream_t stream, const float* transform_matrixs, const float* bbox2d_matrix, int transform_num, const float* points_vectors, 
     int points_num, float* transformed_points_vectors, int rgb_H, int rgb_W);
 
+/**
+ * @param transform_matrixs 应当是`Col-Major`的transform_num个4x4矩阵
+ * @param normals_vectors 应当是`normals_num`个3x1向量
+ * @param transformed_normal_vectors 这里直接输出归一化后的z方向分量，供 `transform_num x normals_num`个，即 [hyp-pose, H, W, 1] 
+ */
+void transform_normals(cudaStream_t stream, const float* transform_matrixs, int transform_num, const float* normals_vectors, 
+    int normals_num, float* transformed_normal_vectors);
+
+void refine_color(cudaStream_t stream, const float* color, const float* diffuse_intensity_map, const float* rast, float* output,
+    int poses_num, float w_ambient, float w_diffuse, int rgb_H, int rgb_W);
+
 }   // namespace foundationpose_render
 
 #endif  // NVIDIA_ISAAC_ROS_EXTENSIONS_FOUNDATIONPOSE_RENDER_CUDA_HPP_
diff --git a/detection_6d_foundationpose/src/foundationpose_render.hpp b/detection_6d_foundationpose/src/foundationpose_render.hpp
@@ -64,6 +64,10 @@ class FoundationPoseRenderer {
                   const std::vector<Eigen::MatrixXf>& tfs,
                   float* output_buffer) ;
 
+  bool TransformVertexNormalsOnCUDA(cudaStream_t stream,
+                  const std::vector<Eigen::MatrixXf>& tfs,
+                  float* output_buffer);
+
   bool GeneratePoseClipOnCUDA(cudaStream_t stream,
                       float* output_buffer,
                       const std::vector<Eigen::MatrixXf>& poses, 
@@ -121,13 +125,16 @@ class FoundationPoseRenderer {
   using DeviceBufferUniquePtrType = std::unique_ptr<T, std::function<void(T*)>>;
 
   DeviceBufferUniquePtrType<float> vertices_device_ {nullptr};
+  DeviceBufferUniquePtrType<float> vertex_normals_device_ {nullptr};
   DeviceBufferUniquePtrType<float> texcoords_device_ {nullptr};
   DeviceBufferUniquePtrType<int32_t> mesh_faces_device_ {nullptr};
   DeviceBufferUniquePtrType<uint8_t> texture_map_device_ {nullptr};
   // nvdiffrast render时相关缓存
   DeviceBufferUniquePtrType<float> pose_clip_device_ {nullptr};
   DeviceBufferUniquePtrType<float> rast_out_device_ {nullptr};
   DeviceBufferUniquePtrType<float> pts_cam_device_ {nullptr};
+  DeviceBufferUniquePtrType<float> diffuse_intensity_device_ {nullptr};
+  DeviceBufferUniquePtrType<float> diffuse_intensity_map_device_ {nullptr};
   DeviceBufferUniquePtrType<float> texcoords_out_device_ {nullptr};
   DeviceBufferUniquePtrType<float> color_device_ {nullptr};
   DeviceBufferUniquePtrType<float> xyz_map_device_ {nullptr};

diff --git a/detection_6d_foundationpose/src/foundationpose_sampling.cpp b/detection_6d_foundationpose/src/foundationpose_sampling.cpp
@@ -200,7 +200,7 @@ std::vector<Eigen::Matrix4f> MakeRotationGrid(unsigned int n_views = 40, int inp
       auto R_inplane = Eigen::Affine3f::Identity();
       R_inplane.rotate(Eigen::AngleAxisf(0, Eigen::Vector3f::UnitX()))
           .rotate(Eigen::AngleAxisf(0, Eigen::Vector3f::UnitY()))
-          .rotate(Eigen::AngleAxisf(inplane_rot, Eigen::Vector3f::UnitZ()));
+          .rotate(Eigen::AngleAxisf(inplane_rot * M_PI / 180.0f, Eigen::Vector3f::UnitZ()));
 
       cam_in_ob = cam_in_ob * R_inplane.matrix();
       Eigen::Matrix4f ob_in_cam = cam_in_ob.inverse();

diff --git a/detection_6d_foundationpose/src/foundationpose_utils.cpp b/detection_6d_foundationpose/src/foundationpose_utils.cpp
@@ -148,6 +148,7 @@ TexturedMeshLoader::TexturedMeshLoader(const std::string& mesh_file_path,
   // Walk through each of the mesh's vertices
   for (unsigned int v = 0; v < mesh->mNumVertices; v++) {
     vertices_.push_back(mesh->mVertices[v]);
+    vertex_normals_.push_back(mesh->mNormals[v]);
   }
   for (unsigned int i = 0 ; i < AI_MAX_NUMBER_OF_TEXTURECOORDS ; ++ i) {
     if (mesh->mTextureCoords[i] != nullptr) {
@@ -167,12 +168,12 @@ TexturedMeshLoader::TexturedMeshLoader(const std::string& mesh_file_path,
   LOG(INFO) << "Loading textured map file: " << textured_file_path;
   texture_map_ = cv::imread(textured_file_path);
   if (texture_map_.empty()) {
-    throw std::runtime_error("[TexturedMeshLoader] Failed to read textured image: "
-                            + textured_file_path);
+    // throw std::runtime_error("[TexturedMeshLoader] Failed to read textured image: "
+    //                         + textured_file_path);
+    texture_map_ = cv::Mat(2, 2, CV_8UC3, {100, 100, 100});
   }
   cv::cvtColor(texture_map_, texture_map_, cv::COLOR_BGR2RGB);
 
-
   LOG(INFO) << "Successfully Loaded textured mesh file!!!";
   LOG(INFO) << "Mesh has vertices_num: " << vertices_.size()
             << ", diameter: " << mesh_diamter_
@@ -217,6 +218,17 @@ TexturedMeshLoader::GetMeshVertices() const noexcept
   return vertices_;
 }
 
+/**
+ * @brief 获取mesh模型顶点的法向量
+ * 
+ * @return const std::vector<aiVector3D> &
+ */
+const std::vector<aiVector3D> & 
+TexturedMeshLoader::GetMeshVertexNormals() const noexcept
+{
+  return vertex_normals_;
+}
+
 /**
  * @brief 获取mesh模型的外观坐标系
  *