microsoft · snnn · Feb 11, 2020 · Feb 10, 2020 · Feb 10, 2020 · Feb 10, 2020
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -56,7 +56,6 @@ option(onnxruntime_ENABLE_PYTHON "Enable python buildings" OFF)
 option(onnxruntime_ENABLE_MEMLEAK_CHECKER "Experimental: Enable memory leak checker in Windows debug build" OFF)
 option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
 option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
-option(onnxruntime_USE_NSYNC "Build with NSYNC support. This option only takes effect on Linux" OFF)
 option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
 option(onnxruntime_USE_NNAPI "Build with DNNLibrary for Android NNAPI support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
@@ -299,10 +298,9 @@ if(onnxruntime_BUILD_BENCHMARKS)
   endif()  
 endif()
 
-if(onnxruntime_USE_NSYNC)
+if(NOT WIN32)
   add_subdirectory(${PROJECT_SOURCE_DIR}/external/nsync EXCLUDE_FROM_ALL)
 endif()
-
 # External dependencies
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/external)
 
@@ -817,9 +815,7 @@ if(WIN32)
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES Shlwapi)
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES debug Dbghelp)
 else()
-  if(onnxruntime_USE_NSYNC)
-    list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync_cpp)
-  endif()
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync_cpp)  # non-Windows branch: nsync is always linked
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${CMAKE_DL_LIBS} Threads::Threads)
 endif()
 

diff --git a/cmake/onnxruntime_common.cmake b/cmake/onnxruntime_common.cmake
@@ -89,7 +89,7 @@ endif()
 onnxruntime_add_include_to_target(onnxruntime_common date_interface)
 target_include_directories(onnxruntime_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}
         PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/external/nsync/public")
-if(onnxruntime_USE_NSYNC)
+if(NOT WIN32)
     target_compile_definitions(onnxruntime_common PUBLIC USE_NSYNC NSYNC_ATOMIC_CPP11)
 endif()
 

diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -602,7 +602,7 @@ endif()
 if (onnxruntime_BUILD_SHARED_LIB)
   set(onnxruntime_perf_test_libs onnxruntime_test_utils onnx_test_runner_common onnxruntime_common re2::re2
           onnx_test_data_proto onnx_proto ${PROTOBUF_LIB} ${GETOPT_LIB_WIDE} onnxruntime ${SYS_PATH_LIB} ${CMAKE_DL_LIBS})
-  if(onnxruntime_USE_NSYNC)
+  if(NOT WIN32)
     list(APPEND onnxruntime_perf_test_libs nsync_cpp)
   endif()
   target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads)
@@ -641,7 +641,7 @@ if (onnxruntime_BUILD_SHARED_LIB)
   # test inference using shared lib
   set(onnxruntime_shared_lib_test_LIBS onnxruntime_mocked_allocator onnxruntime_test_utils onnxruntime_common onnx_proto)
 
-  if(onnxruntime_USE_NSYNC)
+  if(NOT WIN32)
     list(APPEND onnxruntime_shared_lib_test_LIBS nsync_cpp)
   endif()
   AddTest(DYN
@@ -669,7 +669,7 @@ if(MSVC)
 endif()
 target_include_directories(onnxruntime_mlas_test PRIVATE ${ONNXRUNTIME_ROOT}/core/mlas/inc ${ONNXRUNTIME_ROOT})
 set(onnxruntime_mlas_test_libs onnxruntime_mlas onnxruntime_common)
-if(onnxruntime_USE_NSYNC)
+if(NOT WIN32)
   list(APPEND onnxruntime_mlas_test_libs nsync_cpp)
 endif()
 list(APPEND onnxruntime_mlas_test_libs Threads::Threads)

diff --git a/include/onnxruntime/core/graph/schema_registry.h b/include/onnxruntime/core/graph/schema_registry.h
@@ -5,9 +5,8 @@
 #include "core/graph/constants.h"
 #include "core/common/common.h"
 #include "core/common/status.h"
-#include "core/platform/ort_mutex.h"
-
 #include "core/graph/onnx_protobuf.h"
+#include "core/platform/ort_mutex.h"
 #include <mutex>
 #include <deque>
 #include "sstream"

diff --git a/include/onnxruntime/core/platform/ort_mutex.h b/include/onnxruntime/core/platform/ort_mutex.h
@@ -3,11 +3,103 @@
 
 #pragma once
 #ifdef _WIN32
+#include <Windows.h>
 #include <mutex>
-#include <condition_variable>
 namespace onnxruntime {
-using OrtMutex = std::mutex;
-using OrtCondVar = std::condition_variable;
+// Q: Why is OrtMutex better than std::mutex?
+// A: OrtMutex supports static initialization but std::mutex doesn't. Static initialization helps us prevent the "static
+// initialization order problem".
+
+// Q: Why can't std::mutex do the same?
+// A: The VC runtime has to support Windows XP at the ABI level, but we don't have such a requirement.
+
+// Q: Is OrtMutex faster than std::mutex?
+// A: Sure
+
+class OrtMutex {  // Exclusive, non-copyable lock wrapping a Windows SRWLOCK (lock/try_lock/unlock interface).
+ private:
+  SRWLOCK data_ = SRWLOCK_INIT;
+
+ public:
+  constexpr OrtMutex() = default;
+  // SRW locks do not need to be explicitly destroyed.
+  ~OrtMutex() = default;
+  OrtMutex(const OrtMutex&) = delete;
+  OrtMutex& operator=(const OrtMutex&) = delete;
+  void lock() { AcquireSRWLockExclusive(native_handle()); }
+  bool try_lock() noexcept { return TryAcquireSRWLockExclusive(native_handle()) != 0; }  // success is "nonzero"
+  void unlock() noexcept { ReleaseSRWLockExclusive(native_handle()); }
+  using native_handle_type = SRWLOCK*;
+  // Pointer to the underlying SRWLOCK, in the form the Win32 SRW lock and condition variable calls take.
+  __forceinline native_handle_type native_handle() { return &data_; }
+};
+
+class OrtCondVar {  // Non-copyable condition variable wrapping a Windows CONDITION_VARIABLE, used with OrtMutex.
+  CONDITION_VARIABLE native_cv_object = CONDITION_VARIABLE_INIT;
+
+ public:
+  constexpr OrtCondVar() noexcept = default;
+  ~OrtCondVar() = default;
+
+  OrtCondVar(const OrtCondVar&) = delete;
+  OrtCondVar& operator=(const OrtCondVar&) = delete;
+
+  void notify_one() noexcept { WakeConditionVariable(&native_cv_object); }
+  void notify_all() noexcept { WakeAllConditionVariable(&native_cv_object); }
+  // Untimed wait on lk's mutex. The API signals failure with 0 (success is any nonzero value); that is fatal here.
+  void wait(std::unique_lock<OrtMutex>& lk) {
+    if (SleepConditionVariableSRW(&native_cv_object, lk.mutex()->native_handle(), INFINITE, 0) == 0) {
+      std::terminate();
+    }
+  }
+  template <class Predicate>
+  void wait(std::unique_lock<OrtMutex>& lk, Predicate pred);  // loops on wait(lk) until pred() returns true
+
+  /**
+   * returns cv_status::timeout if the wait terminates when rel_time has elapsed. Otherwise, the method returns
+   * cv_status::no_timeout.
+   * @param cond_mutex A unique_lock<OrtMutex> object.
+   * @param rel_time A chrono::duration object that specifies the amount of time before the thread wakes up.
+   * @return returns cv_status::timeout if the wait terminates when rel_time has elapsed. Otherwise, the method returns
+   * cv_status::no_timeout
+   */
+  template <class Rep, class Period>
+  std::cv_status wait_for(std::unique_lock<OrtMutex>& cond_mutex, const std::chrono::duration<Rep, Period>& rel_time);
+  using native_handle_type = CONDITION_VARIABLE*;
+
+  native_handle_type native_handle() { return &native_cv_object; }
+
+ private:
+  void timed_wait_impl(std::unique_lock<OrtMutex>& lk,
+                       std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds>);
+};
+
+template <class Predicate>
+void OrtCondVar::wait(std::unique_lock<OrtMutex>& lk, Predicate pred) {
+  while (!pred()) wait(lk);  // re-check after every wake-up; returns only once pred() is true
+}
+
+template <class Rep, class Period>
+std::cv_status OrtCondVar::wait_for(std::unique_lock<OrtMutex>& cond_mutex,
+                                    const std::chrono::duration<Rep, Period>& rel_time) {
+  // Timed wait: hands timed_wait_impl an absolute system_clock deadline, then judges timeout by steady_clock.
+  using namespace std::chrono;  // TODO: is it possible to use nsync_from_time_point_ ?
+  if (rel_time <= duration<Rep, Period>::zero())  // zero or negative timeout: report timeout without waiting
+    return std::cv_status::timeout;
+  using SystemTimePointFloat = time_point<system_clock, duration<long double, std::nano> >;
+  using SystemTimePoint = time_point<system_clock, nanoseconds>;
+  SystemTimePointFloat max_time = SystemTimePoint::max();
+  steady_clock::time_point steady_now = steady_clock::now();  // start time used for the timeout verdict below
+  system_clock::time_point system_now = system_clock::now();  // base for the absolute deadline
+  if (max_time - rel_time > system_now) {  // system_now + rel_time stays below SystemTimePoint::max()
+    nanoseconds remain = duration_cast<nanoseconds>(rel_time);
+    if (remain < rel_time)  // the cast lost a fraction of a nanosecond: round up
+      ++remain;
+    timed_wait_impl(cond_mutex, system_now + remain);
+  } else  // deadline would pass the largest representable time point, so pass that maximum instead
+    timed_wait_impl(cond_mutex, SystemTimePoint::max());
+  return steady_clock::now() - steady_now < rel_time ? std::cv_status::no_timeout : std::cv_status::timeout;
+}
 }  // namespace onnxruntime
 #else
 #ifdef USE_NSYNC
@@ -79,15 +171,15 @@ class OrtCondVar {
   void wait(std::unique_lock<OrtMutex>& __lk, _Predicate __pred);
 
   /**
-   * returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns cv_status::no_timeout.
+   * returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns
+   * cv_status::no_timeout.
    * @param cond_mutex A unique_lock<OrtMutex> object.
    * @param rel_time A chrono::duration object that specifies the amount of time before the thread wakes up.
-   * @return returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns cv_status::no_timeout
+   * @return returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns
+   * cv_status::no_timeout
    */
   template <class Rep, class Period>
-  std::cv_status
-  wait_for(std::unique_lock<OrtMutex>& cond_mutex,
-           const std::chrono::duration<Rep, Period>& rel_time);
+  std::cv_status wait_for(std::unique_lock<OrtMutex>& cond_mutex, const std::chrono::duration<Rep, Period>& rel_time);
 #ifdef USE_NSYNC
   using native_handle_type = nsync::nsync_cv*;
 #else
@@ -103,15 +195,13 @@ class OrtCondVar {
 
 template <class _Predicate>
 void OrtCondVar::wait(std::unique_lock<OrtMutex>& __lk, _Predicate __pred) {
-  while (!__pred())
-    wait(__lk);
+  while (!__pred()) wait(__lk);
 }
 
 template <class Rep, class Period>
-std::cv_status
-OrtCondVar::wait_for(std::unique_lock<OrtMutex>& cond_mutex,
-                     const std::chrono::duration<Rep, Period>& rel_time) {
-  //TODO: is it possible to use nsync_from_time_point_ ?
+std::cv_status OrtCondVar::wait_for(std::unique_lock<OrtMutex>& cond_mutex,
+                                    const std::chrono::duration<Rep, Period>& rel_time) {
+  // TODO: is it possible to use nsync_from_time_point_ ?
   using namespace std::chrono;
   if (rel_time <= duration<Rep, Period>::zero())
     return std::cv_status::timeout;

diff --git a/onnxruntime/core/framework/kernel_registry_manager.h b/onnxruntime/core/framework/kernel_registry_manager.h
@@ -7,9 +7,9 @@
 #include <list>
 #include <unordered_map>
 #include "core/common/status.h"
-#include "core/platform/ort_mutex.h"
 #include "core/graph/graph_viewer.h"
 #include "core/framework/customregistry.h"
+#include "core/platform/ort_mutex.h"
 
 namespace onnxruntime {
 struct KernelCreateInfo;

diff --git a/onnxruntime/core/framework/parallel_executor.h b/onnxruntime/core/framework/parallel_executor.h
@@ -4,16 +4,15 @@
 #pragma once
 
 #include <vector>
-#include <condition_variable>
 #include "core/common/common.h"
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
-#include "core/platform/ort_mutex.h"
 #include "core/framework/iexecutor.h"
 #include "core/framework/framework_common.h"
 #include "core/framework/ml_value.h"
 #include "core/framework/session_state.h"
 #include "core/graph/graph_viewer.h"
+#include "core/platform/ort_mutex.h"
 
 namespace onnxruntime {
 

diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h
@@ -8,8 +8,7 @@
 #include <unordered_map>
 #include <vector>
 #include "gsl/gsl"
-
-#include "core/platform/ort_mutex.h"
+#include "core/graph/onnx_protobuf.h"
 #include "core/common/common.h"
 #include "core/common/logging/logging.h"
 #include "core/common/profiler.h"
@@ -26,6 +25,7 @@
 #include "core/graph/graph_viewer.h"
 #include "core/framework/fuse_nodes_funcs.h"
 #include "core/platform/threadpool.h"
+#include "core/platform/ort_mutex.h"
 
 namespace onnxruntime {
 

diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc
@@ -1,10 +1,11 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
+#include "core/graph/onnx_protobuf.h"
 #include "core/framework/utils.h"
 
 #include <iomanip>
 
+
 #include "core/graph/graph_viewer.h"
 #include "core/framework/data_transfer_manager.h"
 #include "core/framework/execution_frame.h"
@@ -18,7 +19,6 @@
 #include "core/framework/sequential_executor.h"
 #include "core/framework/tensorprotoutils.h"
 #include "core/mlas/inc/mlas.h"
-#include "core/graph/onnx_protobuf.h"
 
 namespace ONNX_NAMESPACE {
 std::ostream& operator<<(std::ostream& out, const TensorShapeProto& shape_proto) {

diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc
@@ -3,6 +3,7 @@
 
 #include "core/providers/cuda/cuda_provider_factory.h"
 #include <atomic>
+#include "core/graph/onnx_protobuf.h"
 #include "cuda_execution_provider.h"
 #include "core/session/abi_session_options_impl.h"
 

diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.h
@@ -8,11 +8,11 @@
 #include <list>
 #include <memory.h>
 
-#include "core/platform/ort_mutex.h"
 #include "core/graph/constants.h"
 #include "core/framework/allocatormgr.h"
 #include "core/framework/execution_provider.h"
 #include "core/providers/dnnl/subgraph/subgraph.h"
+#include "core/platform/ort_mutex.h"
 
 namespace dnnl {
 struct memory;

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1,6 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/graph/onnx_protobuf.h"
+
 #include "tensorrt_execution_provider.h"
 #include "core/providers/cuda/cuda_allocator.h"
 #include "core/providers/cuda/math/unary_elementwise_ops_impl.h"

diff --git a/onnxruntime/core/session/abi_session_options.cc b/onnxruntime/core/session/abi_session_options.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/graph/onnx_protobuf.h"
 #include "core/session/onnxruntime_c_api.h"
 #include "core/session/ort_apis.h"
 #include "core/framework/error_code_helper.h"

diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc
@@ -4,7 +4,7 @@
 #ifdef _WIN32
 #pragma warning(disable : 4267)
 #endif
-
+#include "core/graph/onnx_protobuf.h"
 #include "core/session/inference_session.h"
 #include "core/session/ort_apis.h"
 #include "core/framework/customregistry.h"

diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include "core/graph/onnx_protobuf.h"
 #include "core/session/inference_session.h"
 
 #include <memory>
@@ -12,7 +13,6 @@
 
 #include "core/common/logging/logging.h"
 #include "core/platform/notification.h"
-#include "core/platform/ort_mutex.h"
 #include "core/platform/threadpool.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/graph_utils.h"
@@ -53,6 +53,7 @@
 #include "core/optimizer/graph_transformer_utils.h"
 #include "core/util/thread_utils.h"
 #include "core/session/inference_session_utils.h"
+#include "core/platform/ort_mutex.h"
 
 using namespace ONNX_NAMESPACE;
 

diff --git a/onnxruntime/core/session/inference_session_utils.h b/onnxruntime/core/session/inference_session_utils.h
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #pragma once
-
+#include "core/graph/onnx_protobuf.h"
 #include "core/session/inference_session.h"
 #include "core/framework/session_options.h"
 #include "core/common/common.h"
@@ -14,9 +14,9 @@ namespace onnxruntime {
 
 namespace inference_session_utils {
 
-static const std::string kOrtConfigKey = "ort_config";
-static const std::string kSessionOptionsKey = "session_options";
-static const std::string kOrtLoadConfigFromModelEnvVar = "ORT_LOAD_CONFIG_FROM_MODEL";
+static constexpr const char* kOrtConfigKey = "ort_config";
+static constexpr const char* kSessionOptionsKey = "session_options";
+static constexpr const char* kOrtLoadConfigFromModelEnvVar = "ORT_LOAD_CONFIG_FROM_MODEL";
 
 }  // namespace inference_session_utils
 

diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc
@@ -1,5 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+#include "core/graph/onnx_protobuf.h"
 
 #include "core/session/inference_session.h"