diff --git a/services/webnn/BUILD.gn b/services/webnn/BUILD.gn
index b0f27b4cd33b00..e936abe6f78e82 100644
--- a/services/webnn/BUILD.gn
+++ b/services/webnn/BUILD.gn
@@ -95,6 +95,7 @@ component("webnn_service") {
     deps += [
       "//third_party/fp16",
       "//third_party/microsoft_dxheaders:dxguids",
+      "//third_party/onnxruntime_headers",
      "//ui/gl",
       "//ui/gl/init",
     ]
diff --git a/services/webnn/ort/allocator_ort.h b/services/webnn/ort/allocator_ort.h
index 8b48692b810875..d5a966df9ff2c5 100644
--- a/services/webnn/ort/allocator_ort.h
+++ b/services/webnn/ort/allocator_ort.h
@@ -7,7 +7,7 @@
 
 #include "base/component_export.h"
 #include "base/memory/ref_counted.h"
-#include "third_party/microsoft_dxheaders/include/onnxruntime_c_api.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/session/onnxruntime_c_api.h"
 
 namespace webnn::ort {
diff --git a/services/webnn/ort/context_impl_ort.h b/services/webnn/ort/context_impl_ort.h
index f77f44888cab49..e6559e630f49b7 100644
--- a/services/webnn/ort/context_impl_ort.h
+++ b/services/webnn/ort/context_impl_ort.h
@@ -12,7 +12,7 @@
 #include "services/webnn/webnn_constant_operand.h"
 #include "services/webnn/webnn_context_impl.h"
 #include "services/webnn/webnn_graph_impl.h"
-#include "third_party/microsoft_dxheaders/include/onnxruntime_c_api.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/session/onnxruntime_c_api.h"
 
 namespace webnn::ort {
diff --git a/services/webnn/ort/graph_builder_ort.cc b/services/webnn/ort/graph_builder_ort.cc
index c69c0930a92b30..d1addc0ffb4919 100644
--- a/services/webnn/ort/graph_builder_ort.cc
+++ b/services/webnn/ort/graph_builder_ort.cc
@@ -247,7 +247,8 @@ void GraphBuilderOrt::AddInitializer(uint64_t constant_id) {
       operand.ByteSpan(), operand_info.onnx_data_type);
 
-  CHECK(result_->id_to_operand_info.try_emplace(constant_id, std::move(operand_info))
+  CHECK(result_->id_to_operand_info
+            .try_emplace(constant_id, std::move(operand_info))
             .second);
 }
 
@@ -411,12 +412,13 @@ void GraphBuilderOrt::AddCastOperation(const mojom::ElementWiseUnary& cast) {
 
   int64_t to_data_type = static_cast<int64_t>(
       OperandTypeToONNXTensorElementDataType(output_data_type));
-  ScopedOrtOpAttr attr_to;
+  ScopedOrtOpAttrPtr attr_to;
   model_builder_.CreateAttribute(attr_to, /*name=*/"to", to_data_type);
 
-  std::array<OrtOpAttr**, 1> attributes = {attr_to.get_pptr()};
+  std::array<OrtOpAttr*, 1> attributes = {attr_to};
 
-  model_builder_.AddNode(kOpTypeCast, node_name, input_names, output_names, attributes);
+  model_builder_.AddNode(kOpTypeCast, node_name, input_names, output_names,
+                         attributes);
 }
 
 void GraphBuilderOrt::AddClampOperation(const mojom::Clamp& clamp) {
@@ -487,11 +489,12 @@ void GraphBuilderOrt::AddConv2dOperation(const mojom::Conv2d& conv2d) {
   std::array<int64_t, 2> dilations = {
       base::checked_cast<int64_t>(conv2d.dilations->height),
       base::checked_cast<int64_t>(conv2d.dilations->width)};
-  ScopedOrtOpAttr attr_dilations;
-  model_builder_.CreateAttribute(attr_dilations, /*name=*/"dilations", dilations);
+  ScopedOrtOpAttrPtr attr_dilations;
+  model_builder_.CreateAttribute(attr_dilations, /*name=*/"dilations",
+                                 dilations);
 
   int64_t group = base::checked_cast<int64_t>(conv2d.groups);
-  ScopedOrtOpAttr attr_group;
+  ScopedOrtOpAttrPtr attr_group;
   model_builder_.CreateAttribute(attr_group, /*name=*/"group", group);
 
   std::array<int64_t, 4> pads = {
@@ -499,20 +502,20 @@ void GraphBuilderOrt::AddConv2dOperation(const mojom::Conv2d& conv2d) {
       base::checked_cast<int64_t>(conv2d.padding->beginning->height),
       base::checked_cast<int64_t>(conv2d.padding->beginning->width),
       base::checked_cast<int64_t>(conv2d.padding->ending->height),
       base::checked_cast<int64_t>(conv2d.padding->ending->width)};
-  ScopedOrtOpAttr attr_pads;
+  ScopedOrtOpAttrPtr attr_pads;
   model_builder_.CreateAttribute(attr_pads, /*name=*/"pads", pads);
 
   std::array<int64_t, 2> strides = {
       base::checked_cast<int64_t>(conv2d.strides->height),
       base::checked_cast<int64_t>(conv2d.strides->width)};
-  ScopedOrtOpAttr attr_strides;
+  ScopedOrtOpAttrPtr attr_strides;
   model_builder_.CreateAttribute(attr_strides, /*name=*/"strides", strides);
 
-  std::array<OrtOpAttr**, 4> attributes = {
-      attr_dilations.get_pptr(),
-      attr_group.get_pptr(),
-      attr_pads.get_pptr(),
-      attr_strides.get_pptr(),
+  std::array<OrtOpAttr*, 4> attributes = {
+      attr_dilations,
+      attr_group,
+      attr_pads,
+      attr_strides,
   };
 
   model_builder_.AddNode(kOpTypeConv2d, node_name, input_names, output_names,
                          attributes);
@@ -535,22 +538,21 @@ void GraphBuilderOrt::AddGemmOperation(const mojom::Gemm& gemm) {
   }
   std::array<const char*, 1> output_names = {output_name.c_str()};
 
-  ScopedOrtOpAttr attr_alpha;
+  ScopedOrtOpAttrPtr attr_alpha;
   model_builder_.CreateAttribute(attr_alpha, /*name=*/"alpha", gemm.alpha);
 
-  ScopedOrtOpAttr attr_beta;
+  ScopedOrtOpAttrPtr attr_beta;
   model_builder_.CreateAttribute(attr_beta, /*name=*/"beta", gemm.beta);
 
   int64_t trans_a = static_cast<int64_t>(gemm.a_transpose);
-  ScopedOrtOpAttr attr_transA;
+  ScopedOrtOpAttrPtr attr_transA;
   model_builder_.CreateAttribute(attr_transA, /*name=*/"transA", trans_a);
 
   int64_t trans_b = static_cast<int64_t>(gemm.b_transpose);
-  ScopedOrtOpAttr attr_transB;
+  ScopedOrtOpAttrPtr attr_transB;
   model_builder_.CreateAttribute(attr_transB, /*name=*/"transB", trans_b);
 
-  std::array<OrtOpAttr**, 4> attributes = {
-      attr_alpha.get_pptr(), attr_beta.get_pptr(), attr_transA.get_pptr(),
-      attr_transB.get_pptr()};
+  std::array<OrtOpAttr*, 4> attributes = {attr_alpha, attr_beta, attr_transA,
+                                          attr_transB};
 
   model_builder_.AddNode(kOpTypeGemm, node_name, input_names, output_names,
                          attributes);
@@ -576,20 +578,20 @@ void GraphBuilderOrt::AddPool2dOperation(const mojom::Pool2d& pool2d) {
   std::array<int64_t, 2> dilations = {
       base::checked_cast<int64_t>(pool2d.dilations->height),
       base::checked_cast<int64_t>(pool2d.dilations->width)};
-  ScopedOrtOpAttr attr_dilations;
+  ScopedOrtOpAttrPtr attr_dilations;
   model_builder_.CreateAttribute(attr_dilations, /*name=*/"dilations",
                                  dilations);
 
   std::array<int64_t, 2> strides = {
       base::checked_cast<int64_t>(pool2d.strides->height),
       base::checked_cast<int64_t>(pool2d.strides->width)};
-  ScopedOrtOpAttr attr_strides;
+  ScopedOrtOpAttrPtr attr_strides;
   model_builder_.CreateAttribute(attr_strides, /*name=*/"strides", strides);
 
   std::array<int64_t, 2> window_dimensions = {
       base::checked_cast<int64_t>(pool2d.window_dimensions->height),
       base::checked_cast<int64_t>(pool2d.window_dimensions->width)};
-  ScopedOrtOpAttr attr_kernel_shape;
+  ScopedOrtOpAttrPtr attr_kernel_shape;
   model_builder_.CreateAttribute(attr_kernel_shape, /*name=*/"kernel_shape",
                                  window_dimensions);
 
@@ -600,7 +602,7 @@ void GraphBuilderOrt::AddPool2dOperation(const mojom::Pool2d& pool2d) {
       base::checked_cast<int64_t>(pool2d.padding->beginning->width),
       base::checked_cast<int64_t>(pool2d.padding->ending->height),
       base::checked_cast<int64_t>(pool2d.padding->ending->width)};
-  ScopedOrtOpAttr attr_pads;
+  ScopedOrtOpAttrPtr attr_pads;
   model_builder_.CreateAttribute(attr_pads, /*name=*/"pads", pads);
 
   // Calculate the ceil_mode.
@@ -626,12 +628,12 @@ void GraphBuilderOrt::AddPool2dOperation(const mojom::Pool2d& pool2d) {
   CHECK(float_output_height.has_value());
   int64_t ceil_mode = float_output_height.value() < output_height ? 1 : 0;
-  ScopedOrtOpAttr attr_ceil_mode;
+  ScopedOrtOpAttrPtr attr_ceil_mode;
   model_builder_.CreateAttribute(attr_ceil_mode, /*name=*/"ceil_mode",
                                  ceil_mode);
 
   // P value of the Lp norm used to pool over the input data.
-  std::optional<ScopedOrtOpAttr> attr_p;
+  std::optional<ScopedOrtOpAttrPtr> attr_p;
   std::optional<int64_t> p;
   std::string op_type;
   switch (pool2d.kind) {
@@ -652,14 +654,13 @@ void GraphBuilderOrt::AddPool2dOperation(const mojom::Pool2d& pool2d) {
     }
   }
 
-  std::vector<OrtOpAttr**> attributes = {
-      attr_dilations.get_pptr(), attr_strides.get_pptr(),
-      attr_kernel_shape.get_pptr(), attr_pads.get_pptr(),
-      attr_ceil_mode.get_pptr()};
+  std::vector<OrtOpAttr*> attributes = {attr_dilations, attr_strides,
+                                        attr_kernel_shape, attr_pads,
+                                        attr_ceil_mode};
   if (op_type == kOpTypeLpPool2d) {
     CHECK(attr_p.has_value());
     CHECK(p.has_value());
-    attributes.push_back(attr_p.value().get_pptr());
+    attributes.push_back(attr_p.value());
   }
 
   const std::string node_name = GetNodeName(pool2d.label);
@@ -710,11 +711,12 @@ void GraphBuilderOrt::AddSoftmaxOperation(const mojom::Softmax& softmax) {
   std::array<const char*, 1> output_names = {output_name.c_str()};
 
   int64_t axis = static_cast<int64_t>(softmax.axis);
-  ScopedOrtOpAttr attr_axis;
+  ScopedOrtOpAttrPtr attr_axis;
   model_builder_.CreateAttribute(attr_axis, /*name=*/"axis", axis);
 
-  std::array<OrtOpAttr**, 1> attributes = {attr_axis.get_pptr()};
-  model_builder_.AddNode(kOpTypeSoftmax, node_name, input_names, output_names, attributes);
+  std::array<OrtOpAttr*, 1> attributes = {attr_axis};
+  model_builder_.AddNode(kOpTypeSoftmax, node_name, input_names, output_names,
+                         attributes);
 }
 
 void GraphBuilderOrt::AddTransposeOperation(const mojom::Transpose& transpose) {
@@ -727,10 +729,10 @@ void GraphBuilderOrt::AddTransposeOperation(const mojom::Transpose& transpose) {
 
   std::vector<int64_t> permutation(transpose.permutation.begin(),
                                    transpose.permutation.end());
-  ScopedOrtOpAttr attr_perm;
+  ScopedOrtOpAttrPtr attr_perm;
   model_builder_.CreateAttribute(attr_perm, /*name=*/"perm", permutation);
 
-  std::array<OrtOpAttr**, 1> attributes = {attr_perm.get_pptr()};
+  std::array<OrtOpAttr*, 1> attributes = {attr_perm};
   model_builder_.AddNode(kOpTypeTranspose, node_name, input_names,
                          output_names, attributes);
 }
@@ -749,12 +751,12 @@ void GraphBuilderOrt::AddWhereOperation(const mojom::Where& where) {
   std::array<const char*, 1> cast_output_names = {
       cast_node_output_name.c_str()};
 
-  ScopedOrtOpAttr attr_to;
+  ScopedOrtOpAttrPtr attr_to;
   int64_t to_data_type =
       static_cast<int64_t>(ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL);
   model_builder_.CreateAttribute(attr_to, /*name=*/"to", to_data_type);
 
-  std::array<OrtOpAttr**, 1> cast_attributes = {attr_to.get_pptr()};
+  std::array<OrtOpAttr*, 1> cast_attributes = {attr_to};
   model_builder_.AddNode(kOpTypeCast, cast_node_name, cast_input_names,
                          cast_output_names, cast_attributes);
   next_operand_id_++;
diff --git a/services/webnn/ort/graph_builder_ort.h b/services/webnn/ort/graph_builder_ort.h
index 07c1286c06c8ee..adb2c130eb0b78 100644
--- a/services/webnn/ort/graph_builder_ort.h
+++ b/services/webnn/ort/graph_builder_ort.h
@@ -17,13 +17,13 @@
 #include "base/memory/stack_allocated.h"
 #include "base/types/expected.h"
 #include "services/webnn/ort/allocator_ort.h"
+#include "services/webnn/ort/ort_model_builder.h"
 #include "services/webnn/ort/scoped_ort_types.h"
 #include "services/webnn/public/cpp/context_properties.h"
 #include "services/webnn/public/cpp/operand_descriptor.h"
 #include "services/webnn/public/mojom/webnn_context_provider.mojom.h"
 #include "services/webnn/public/mojom/webnn_error.mojom-forward.h"
 #include "services/webnn/public/mojom/webnn_graph.mojom.h"
-#include "services/webnn/ort/ort_model_builder.h"
 
 namespace webnn {
 
@@ -62,7 +62,7 @@ class GraphBuilderOrt {
     const OperandInfo& GetOperandInfo(uint64_t operand_id) const;
 
     std::map<uint64_t, OperandInfo> id_to_operand_info;
-    
+
     std::unique_ptr<OrtModelBuilder::ModelInfo> model_info;
   };
diff --git a/services/webnn/ort/graph_impl_ort.cc b/services/webnn/ort/graph_impl_ort.cc
index faad950c208a98..ab0e1b8d373cee 100644
--- a/services/webnn/ort/graph_impl_ort.cc
+++ b/services/webnn/ort/graph_impl_ort.cc
@@ -22,6 +22,7 @@
 #include "services/webnn/resource_task.h"
 #include "services/webnn/webnn_constant_operand.h"
 #include "services/webnn/webnn_graph_impl.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/providers/dml/dml_provider_factory.h"
 
 namespace webnn::ort {
 
@@ -132,8 +133,8 @@ GraphImplOrt::CreateAndBuildOnBackgroundThread(
   OrtSession* session;
   const OrtEnv* env = allocator->env();
-  OrtStatus* status = GetOrtGraphApi()->CreateSessionFromModel(
-      env, result->model_info->model.get_ptr(), session_options, &session);
+  OrtStatus* status = GetOrtModelBuilderApi()->CreateSessionFromModel(
+      env, result->model_info->model, session_options, &session);
   ort_api->ReleaseSessionOptions(session_options);
 
   if (status != NULL) {
diff --git a/services/webnn/ort/graph_impl_ort.h b/services/webnn/ort/graph_impl_ort.h
index 7b58acc694bdcb..12dd70dce8e016 100644
--- a/services/webnn/ort/graph_impl_ort.h
+++ b/services/webnn/ort/graph_impl_ort.h
@@ -22,7 +22,7 @@
 #include "services/webnn/queueable_resource_state.h"
 #include "services/webnn/webnn_context_impl.h"
 #include "services/webnn/webnn_graph_impl.h"
-#include "third_party/microsoft_dxheaders/include/onnxruntime_c_api.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/session/onnxruntime_c_api.h"
 
 namespace webnn {
diff --git a/services/webnn/ort/ort_model_builder.cc b/services/webnn/ort/ort_model_builder.cc
index 2bd05b2ed57cf4..876a63899fd523 100644
--- a/services/webnn/ort/ort_model_builder.cc
+++ b/services/webnn/ort/ort_model_builder.cc
@@ -22,39 +22,45 @@ namespace ort {
 OrtModelBuilder::ModelInfo::ModelInfo() = default;
 OrtModelBuilder::ModelInfo::~ModelInfo() = default;
 
+ScopedOrtValueInfoPtr CreateOrtValueInfo(std::string_view name,
+                                         base::span<const int64_t> shape,
+                                         ONNXTensorElementDataType data_type) {
+  ScopedOrtTensorTypeAndShapeInfoPtr tensor_type_and_shape_info;
+  CHECK_STATUS(GetOrtApi()->CreateTensorTypeAndShapeInfo(
+      tensor_type_and_shape_info.GetAddressOf()));
+  CHECK_STATUS(
+      GetOrtApi()->SetTensorElementType(tensor_type_and_shape_info, data_type));
+  CHECK_STATUS(GetOrtApi()->SetDimensions(tensor_type_and_shape_info,
+                                          shape.data(), shape.size()));
+
+  ScopedOrtTypeInfoPtr type_info;
+  CHECK_STATUS(GetOrtApi()->CreateTensorTypeInfo(tensor_type_and_shape_info,
+                                                 type_info.GetAddressOf()));
+
+  ScopedOrtValueInfoPtr value_info;
+  CHECK_STATUS(GetOrtModelBuilderApi()->CreateValueInfo(
+      name.data(), type_info, value_info.GetAddressOf()));
+  return value_info;
+}
+
 OrtModelBuilder::OrtModelBuilder(scoped_refptr<AllocatorOrt> allocator)
     : allocator_(std::move(allocator)),
       model_info_(std::make_unique<ModelInfo>()) {
-  CHECK_STATUS(GetOrtGraphApi()->CreateGraph(graph_.get_pptr()));
+  CHECK_STATUS(GetOrtModelBuilderApi()->CreateGraph(graph_.GetAddressOf()));
 }
+
 OrtModelBuilder::~OrtModelBuilder() = default;
 
 void OrtModelBuilder::AddInput(std::string_view name,
                                base::span<const int64_t> shape,
                                ONNXTensorElementDataType data_type) {
-  ScopedOrtShape input_shape;
-  CHECK_STATUS(GetOrtGraphApi()->CreateFixedShape(shape.data(), shape.size(),
-                                                  input_shape.get_pptr()));
-
-  ScopedOrtValueInfo input_info;
-  CHECK_STATUS(GetOrtGraphApi()->CreateTensorValueInfo(
-      name.data(), data_type, input_shape.get_pptr(), input_info.get_pptr()));
-  CHECK_STATUS(
-      GetOrtGraphApi()->AddInput(graph_.get_ptr(), input_info.get_pptr()));
+  inputs_.push_back(CreateOrtValueInfo(name, shape, data_type));
 }
 
 void OrtModelBuilder::AddOutput(std::string_view name,
                                 base::span<const int64_t> shape,
                                 ONNXTensorElementDataType data_type) {
-  ScopedOrtShape output_shape;
-  CHECK_STATUS(GetOrtGraphApi()->CreateFixedShape(shape.data(), shape.size(),
-                                                  output_shape.get_pptr()));
-
-  ScopedOrtValueInfo output_info;
-  CHECK_STATUS(GetOrtGraphApi()->CreateTensorValueInfo(
-      name.data(), data_type, output_shape.get_pptr(), output_info.get_pptr()));
-  CHECK_STATUS(
-      GetOrtGraphApi()->AddOutput(graph_.get_ptr(), output_info.get_pptr()));
+  outputs_.push_back(CreateOrtValueInfo(name, shape, data_type));
 }
 
 void OrtModelBuilder::AddInitializerAsRawData(
@@ -62,20 +68,22 @@ void OrtModelBuilder::AddInitializerAsRawData(
     base::span<const int64_t> shape,
     base::span<const uint8_t> data,
     ONNXTensorElementDataType data_type) {
-  ScopedOrtValue initializer;
+  ScopedOrtValuePtr initializer;
   CHECK_STATUS(GetOrtApi()->CreateTensorAsOrtValue(
       allocator_->allocator(), shape.data(), shape.size(), data_type,
-      initializer.get_pptr()));
+      initializer.GetAddressOf()));
 
   void* ort_tensor_raw_data = nullptr;
-  CHECK_STATUS(GetOrtApi()->GetTensorMutableData(initializer.get_ptr(),
-                                                 &ort_tensor_raw_data));
+  CHECK_STATUS(
+      GetOrtApi()->GetTensorMutableData(initializer, &ort_tensor_raw_data));
   CHECK(ort_tensor_raw_data);
   UNSAFE_BUFFERS(
       base::span(static_cast<uint8_t*>(ort_tensor_raw_data), data.size()))
       .copy_from(data);
-  CHECK_STATUS(GetOrtGraphApi()->AddInitializer(graph_.get_ptr(), name.data(),
-                                                initializer.get_pptr()));
+  // Graph will own the initializer.
+  CHECK_STATUS(GetOrtModelBuilderApi()->AddInitializerToGraph(
+      graph_, name.data(), initializer.Release(),
+      /*data_is_external=*/false));
 }
 
 void OrtModelBuilder::AddInitializerAsExternalData(
@@ -86,49 +94,52 @@ void OrtModelBuilder::AddInitializerAsExternalData(
     base::span<const int64_t> shape,
     base::span<const uint8_t> data,
     ONNXTensorElementDataType data_type) {
   auto weight = base::HeapArray<uint8_t>::CopiedFrom(data);
   model_info_->external_data.push_back(std::move(weight));
 
-  ScopedOrtValue initializer;
+  ScopedOrtValuePtr initializer;
+  // TODO: Use `CreateTensorWithDataAndDeleterAsOrtValue()`.
   CHECK_STATUS(GetOrtApi()->CreateTensorWithDataAsOrtValue(
       allocator_->memory_info(), model_info_->external_data.back().data(),
       model_info_->external_data.back().size(), shape.data(), shape.size(),
-      data_type, initializer.get_pptr()));
-  CHECK_STATUS(GetOrtGraphApi()->AddInitializer(graph_.get_ptr(), name.data(),
-                                                initializer.get_pptr()));
+      data_type, initializer.GetAddressOf()));
+  // Graph will own the initializer.
+  CHECK_STATUS(GetOrtModelBuilderApi()->AddInitializerToGraph(
+      graph_, name.data(), initializer.Release(),
+      /*data_is_external=*/true));
 }
 
-void OrtModelBuilder::CreateAttribute(ScopedOrtOpAttr& attribute,
+void OrtModelBuilder::CreateAttribute(ScopedOrtOpAttrPtr& attribute,
                                       std::string_view name,
                                       OrtOpAttrData data) {
   if (absl::holds_alternative<int64_t>(data)) {
     CHECK_STATUS(GetOrtApi()->CreateOpAttr(
         name.data(), &absl::get<int64_t>(data), /*len=*/1,
-        OrtOpAttrType::ORT_OP_ATTR_INT, attribute.get_pptr()));
+        OrtOpAttrType::ORT_OP_ATTR_INT, attribute.GetAddressOf()));
   } else if (absl::holds_alternative<float>(data)) {
     CHECK_STATUS(GetOrtApi()->CreateOpAttr(
         name.data(), &absl::get<float>(data), /*len=*/1,
-        OrtOpAttrType::ORT_OP_ATTR_FLOAT, attribute.get_pptr()));
+        OrtOpAttrType::ORT_OP_ATTR_FLOAT, attribute.GetAddressOf()));
   } else if (absl::holds_alternative<std::string_view>(data)) {
     std::string_view string_data = absl::get<std::string_view>(data);
     CHECK_STATUS(GetOrtApi()->CreateOpAttr(
         name.data(), string_data.data(), string_data.size(),
-        OrtOpAttrType::ORT_OP_ATTR_STRING, attribute.get_pptr()));
+        OrtOpAttrType::ORT_OP_ATTR_STRING, attribute.GetAddressOf()));
   } else if (absl::holds_alternative<base::span<const int64_t>>(data)) {
     base::span<const int64_t> ints_data =
         absl::get<base::span<const int64_t>>(data);
     CHECK_STATUS(GetOrtApi()->CreateOpAttr(
         name.data(), ints_data.data(), ints_data.size(),
-        OrtOpAttrType::ORT_OP_ATTR_INTS, attribute.get_pptr()));
+        OrtOpAttrType::ORT_OP_ATTR_INTS, attribute.GetAddressOf()));
   } else if (absl::holds_alternative<base::span<const float>>(data)) {
     base::span<const float> floats_data =
         absl::get<base::span<const float>>(data);
     CHECK_STATUS(GetOrtApi()->CreateOpAttr(
         name.data(), floats_data.data(), floats_data.size(),
-        OrtOpAttrType::ORT_OP_ATTR_FLOATS, attribute.get_pptr()));
+        OrtOpAttrType::ORT_OP_ATTR_FLOATS, attribute.GetAddressOf()));
   } else if (absl::holds_alternative<base::span<const char*>>(data)) {
     base::span<const char*> strings_data =
         absl::get<base::span<const char*>>(data);
     CHECK_STATUS(GetOrtApi()->CreateOpAttr(
         name.data(), strings_data.data(), strings_data.size(),
-        OrtOpAttrType::ORT_OP_ATTR_STRINGS, attribute.get_pptr()));
+        OrtOpAttrType::ORT_OP_ATTR_STRINGS, attribute.GetAddressOf()));
   }
 }
 
@@ -136,26 +147,45 @@ void OrtModelBuilder::AddNode(std::string_view op_type,
                               std::string_view node_name,
                               base::span<const char*> input_names,
                               base::span<const char*> output_names,
-                              base::span<OrtOpAttr**> attributes) {
-  ScopedOrtNode node;
-  CHECK_STATUS(GetOrtGraphApi()->CreateNode(
+                              base::span<OrtOpAttr*> attributes) {
+  ScopedOrtNodePtr node;
+  CHECK_STATUS(GetOrtModelBuilderApi()->CreateNode(
       op_type.data(), kOrtDomainName, node_name.data(), input_names.data(),
       input_names.size(), output_names.data(), output_names.size(),
-      attributes.data(), attributes.size(), node.get_pptr()));
-  CHECK_STATUS(GetOrtGraphApi()->AddNode(graph_.get_ptr(), node.get_pptr()));
+      attributes.data(), attributes.size(), node.GetAddressOf()));
+  // Graph will own the node.
+  CHECK_STATUS(GetOrtModelBuilderApi()->AddNodeToGraph(graph_, node.Release()));
 }
 
 std::unique_ptr<OrtModelBuilder::ModelInfo>
 OrtModelBuilder::BuildAndTakeModelInfo() {
+  // Graph will own the input/output `OrtValueInfo`s.
+  std::vector<OrtValueInfo*> graph_inputs;
+  graph_inputs.reserve(inputs_.size());
+  for (auto& input : inputs_) {
+    graph_inputs.push_back(input.Release());
+  }
+  CHECK_STATUS(GetOrtModelBuilderApi()->SetGraphInputs(
+      graph_, graph_inputs.data(), graph_inputs.size()));
+
+  std::vector<OrtValueInfo*> graph_outputs;
+  graph_outputs.reserve(outputs_.size());
+  for (auto& output : outputs_) {
+    graph_outputs.push_back(output.Release());
+  }
+  CHECK_STATUS(GetOrtModelBuilderApi()->SetGraphOutputs(
+      graph_, graph_outputs.data(), graph_outputs.size()));
+
   std::vector<const char*> domain_names = {kOrtDomainName};
   std::vector<int> opset_versions = {kOrtOpsetVersion};
-  CHECK_STATUS(GetOrtGraphApi()->CreateModel(
+  CHECK_STATUS(GetOrtModelBuilderApi()->CreateModel(
       domain_names.data(), opset_versions.data(), domain_names.size(),
-      model_info_->model.get_pptr()));
+      model_info_->model.GetAddressOf()));
 
-  CHECK_STATUS(GetOrtGraphApi()->AddGraph(model_info_->model.get_ptr(),
-                                          graph_.get_pptr()));
+  // Model will own the graph.
+  CHECK_STATUS(GetOrtModelBuilderApi()->AddGraphToModel(model_info_->model,
+                                                        graph_.Release()));
 
   return std::move(model_info_);
 }
diff --git a/services/webnn/ort/ort_model_builder.h b/services/webnn/ort/ort_model_builder.h
index 27e8360e7633a7..6cf1c6d56d559f 100644
--- a/services/webnn/ort/ort_model_builder.h
+++ b/services/webnn/ort/ort_model_builder.h
@@ -14,7 +14,7 @@
 #include "services/webnn/ort/allocator_ort.h"
 #include "services/webnn/ort/scoped_ort_types.h"
 #include "third_party/abseil-cpp/absl/types/variant.h"
-#include "third_party/microsoft_dxheaders/include/onnxruntime_c_api.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/session/onnxruntime_c_api.h"
 
 namespace webnn {
 
@@ -30,7 +30,7 @@ class OrtModelBuilder final {
     ModelInfo& operator=(const ModelInfo&) = delete;
     ~ModelInfo();
 
-    ScopedOrtModel model;
+    ScopedOrtModelPtr model;
 
     // TODO: Consider reusing constant operands instead of copying them to
     // `external_data`.
@@ -61,13 +61,14 @@ class OrtModelBuilder final {
                                     base::span<const int64_t> shape,
                                     base::span<const uint8_t> data,
                                     ONNXTensorElementDataType data_type);
+
   using OrtOpAttrData = absl::variant<int64_t,
                                       float,
                                       std::string_view,
                                       base::span<const int64_t>,
                                       base::span<const float>,
                                       base::span<const char*>>;
 
-  void CreateAttribute(ScopedOrtOpAttr& attribute,
+  void CreateAttribute(ScopedOrtOpAttrPtr& attribute,
                        std::string_view name,
                        OrtOpAttrData data);
 
@@ -75,14 +76,17 @@ class OrtModelBuilder final {
                std::string_view node_name,
                base::span<const char*> input_names,
                base::span<const char*> output_names,
-               base::span<OrtOpAttr**> attributes = {});
+               base::span<OrtOpAttr*> attributes = {});
 
   std::unique_ptr<ModelInfo> BuildAndTakeModelInfo();
 
  private:
   scoped_refptr<AllocatorOrt> allocator_;
 
-  ScopedOrtGraph graph_;
+  std::vector<ScopedOrtValueInfoPtr> inputs_;
+  std::vector<ScopedOrtValueInfoPtr> outputs_;
+
+  ScopedOrtGraphPtr graph_;
   std::unique_ptr<ModelInfo> model_info_;
 };
diff --git a/services/webnn/ort/platform_functions_ort.cc b/services/webnn/ort/platform_functions_ort.cc
index d5f5dd0da831d3..8e46cc58c65002 100644
--- a/services/webnn/ort/platform_functions_ort.cc
+++ b/services/webnn/ort/platform_functions_ort.cc
@@ -8,7 +8,6 @@
 #include "base/logging.h"
 #include "base/native_library.h"
 #include "base/path_service.h"
-#include "third_party/onnx/proto/onnx.pb.h"
 
 namespace webnn::ort {
 
@@ -39,16 +38,18 @@ PlatformFunctions::PlatformFunctions() {
     return;
   }
 
-  const OrtApi* ort_api =
-      ort_get_api_base_proc()->GetApi(onnx::Version::IR_VERSION);
+  // ORT_API_VERSION is defined in onnxruntime_c_api.h and must be passed to
+  // `OrtApiBase::GetApi()`.
+  const OrtApi* ort_api = ort_get_api_base_proc()->GetApi(ORT_API_VERSION);
   if (!ort_api) {
     LOG(ERROR) << "[WebNN] Failed to get OrtApi.";
     return;
   }
 
-  const OrtGraphApi* ort_graph_api = ort_api->GetGraphApi();
-  if (!ort_graph_api) {
-    LOG(ERROR) << "[WebNN] Failed to get OrtGraphApi.";
+  const OrtModelBuilderApi* ort_model_builder_api =
+      ort_api->GetModelBuilderApi();
+  if (!ort_model_builder_api) {
+    LOG(ERROR) << "[WebNN] Failed to get OrtModelBuilderApi.";
     return;
   }
 
@@ -56,7 +57,7 @@ PlatformFunctions::PlatformFunctions() {
   ort_library_ = std::move(ort_library);
   ort_get_api_base_proc_ = std::move(ort_get_api_base_proc);
   ort_api_ = ort_api;
-  ort_graph_api_ = ort_graph_api;
+  ort_model_builder_api_ = ort_model_builder_api;
 }
 
 PlatformFunctions::~PlatformFunctions() = default;
@@ -71,7 +72,7 @@ PlatformFunctions* PlatformFunctions::GetInstance() {
 }
 
 bool PlatformFunctions::AllFunctionsLoaded() {
-  return ort_get_api_base_proc_ && ort_api_ && ort_graph_api_;
+  return ort_get_api_base_proc_ && ort_api_ && ort_model_builder_api_;
 }
 
 }  // namespace webnn::ort
diff --git a/services/webnn/ort/platform_functions_ort.h b/services/webnn/ort/platform_functions_ort.h
index 9902a9ee704441..07549f1150b050 100644
--- a/services/webnn/ort/platform_functions_ort.h
+++ b/services/webnn/ort/platform_functions_ort.h
@@ -10,8 +10,7 @@
 #include "base/component_export.h"
 #include "base/no_destructor.h"
 #include "base/scoped_native_library.h"
-#include "third_party/microsoft_dxheaders/include/dml_provider_factory.h"
-#include "third_party/microsoft_dxheaders/include/onnxruntime_c_api.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/session/onnxruntime_c_api.h"
 
 namespace webnn::ort {
 
@@ -27,7 +26,9 @@ class COMPONENT_EXPORT(WEBNN_SERVICE) PlatformFunctions {
     return ort_get_api_base_proc_;
   }
   const OrtApi* ort_api() const { return ort_api_.get(); }
-  const OrtGraphApi* ort_graph_api() const { return ort_graph_api_.get(); }
+  const OrtModelBuilderApi* ort_model_builder_api() const {
+    return ort_model_builder_api_.get();
+  }
 
  private:
  friend class base::NoDestructor<PlatformFunctions>;
@@ -40,7 +41,7 @@ class COMPONENT_EXPORT(WEBNN_SERVICE) PlatformFunctions {
   base::ScopedNativeLibrary ort_library_;
   OrtGetApiBaseProc ort_get_api_base_proc_ = nullptr;
   raw_ptr<const OrtApi> ort_api_ = nullptr;
-  raw_ptr<const OrtGraphApi> ort_graph_api_ = nullptr;
+  raw_ptr<const OrtModelBuilderApi> ort_model_builder_api_ = nullptr;
 };
 
 }  // namespace webnn::ort
diff --git a/services/webnn/ort/scoped_ort_types.cc b/services/webnn/ort/scoped_ort_types.cc
index 83fece2eabae0c..d5c519fa617617 100644
--- a/services/webnn/ort/scoped_ort_types.cc
+++ b/services/webnn/ort/scoped_ort_types.cc
@@ -4,67 +4,30 @@
 
 #include "services/webnn/ort/scoped_ort_types.h"
 
-#include <memory>
-
-#include "services/webnn/ort/utils_ort.h"
-
 namespace webnn::ort {
 
-ScopedOrtValue::ScopedOrtValue() {
-  pptr_ = std::make_unique<OrtValue*>(nullptr);
-}
-ScopedOrtValue::~ScopedOrtValue() {
-  // TODO: use deleter instead.
-  GetOrtApi()->ReleaseValue(*pptr_);
-}
-
-ScopedOrtMemoryInfo::ScopedOrtMemoryInfo() {
-  pptr_ = std::make_unique<OrtMemoryInfo*>(nullptr);
-}
-ScopedOrtMemoryInfo::~ScopedOrtMemoryInfo() {
-  GetOrtApi()->ReleaseMemoryInfo(*pptr_);
-}
-
-ScopedOrtOpAttr::ScopedOrtOpAttr() {
-  pptr_ = std::make_unique<OrtOpAttr*>(nullptr);
-}
-ScopedOrtOpAttr::~ScopedOrtOpAttr() {
-  GetOrtApi()->ReleaseOpAttr(*pptr_);
-}
-
-ScopedOrtGraph::ScopedOrtGraph() {
-  pptr_ = std::make_unique<OrtGraph*>(nullptr);
-}
-ScopedOrtGraph::~ScopedOrtGraph() {
-  GetOrtGraphApi()->ReleaseGraph(*pptr_);
-}
-
-ScopedOrtShape::ScopedOrtShape() {
-  pptr_ = std::make_unique<OrtShape*>(nullptr);
-}
-ScopedOrtShape::~ScopedOrtShape() {
-  GetOrtGraphApi()->ReleaseShape(*pptr_);
-}
-
-ScopedOrtValueInfo::ScopedOrtValueInfo() {
-  pptr_ = std::make_unique<OrtValueInfo*>(nullptr);
-}
-ScopedOrtValueInfo::~ScopedOrtValueInfo() {
-  GetOrtGraphApi()->ReleaseValueInfo(*pptr_);
-}
-
-ScopedOrtNode::ScopedOrtNode() {
-  pptr_ = std::make_unique<OrtNode*>(nullptr);
-}
-ScopedOrtNode::~ScopedOrtNode() {
-  GetOrtGraphApi()->ReleaseNode(*pptr_);
-}
-
-ScopedOrtModel::ScopedOrtModel() {
-  pptr_ = std::make_unique<OrtModel*>(nullptr);
-}
-ScopedOrtModel::~ScopedOrtModel() {
-  GetOrtGraphApi()->ReleaseModel(*pptr_);
-}
+#define SCOPED_ORT_TYPE_PTR_DEFINITION(ort_type, ort_api)         \
+  ScopedOrt##ort_type##Ptr::ScopedOrt##ort_type##Ptr() {          \
+    pptr_ = std::make_unique<Ort##ort_type*>(nullptr);            \
+  }                                                               \
+  ScopedOrt##ort_type##Ptr::~ScopedOrt##ort_type##Ptr() {         \
+    if (pptr_) {                                                  \
+      Get##ort_api()->Release##ort_type(*pptr_);                  \
+    }                                                             \
+  }                                                               \
+  ScopedOrt##ort_type##Ptr::ScopedOrt##ort_type##Ptr(             \
+      ScopedOrt##ort_type##Ptr&&) = default;                      \
+  ScopedOrt##ort_type##Ptr& ScopedOrt##ort_type##Ptr::operator=(  \
+      ScopedOrt##ort_type##Ptr&&) = default;
+
+SCOPED_ORT_TYPE_PTR_DEFINITION(Value, OrtApi)
+SCOPED_ORT_TYPE_PTR_DEFINITION(MemoryInfo, OrtApi)
+SCOPED_ORT_TYPE_PTR_DEFINITION(OpAttr, OrtApi)
+SCOPED_ORT_TYPE_PTR_DEFINITION(TypeInfo, OrtApi)
+SCOPED_ORT_TYPE_PTR_DEFINITION(TensorTypeAndShapeInfo, OrtApi)
+SCOPED_ORT_TYPE_PTR_DEFINITION(ValueInfo, OrtModelBuilderApi)
+SCOPED_ORT_TYPE_PTR_DEFINITION(Node, OrtModelBuilderApi)
+SCOPED_ORT_TYPE_PTR_DEFINITION(Graph, OrtModelBuilderApi)
+SCOPED_ORT_TYPE_PTR_DEFINITION(Model, OrtModelBuilderApi)
 
 }  // namespace webnn::ort
diff --git a/services/webnn/ort/scoped_ort_types.h b/services/webnn/ort/scoped_ort_types.h
index f3d1841037a961..0075a33aaad7ff 100644
--- a/services/webnn/ort/scoped_ort_types.h
+++ b/services/webnn/ort/scoped_ort_types.h
@@ -7,121 +7,47 @@
 
 #include <memory>
 
-#include "third_party/microsoft_dxheaders/include/onnxruntime_c_api.h"
+#include "services/webnn/ort/utils_ort.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/session/onnxruntime_c_api.h"
 
 namespace webnn::ort {
 
-class ScopedOrtValue {
- public:
-  ScopedOrtValue();
-  ScopedOrtValue(const ScopedOrtValue&) = delete;
-  ScopedOrtValue& operator=(const ScopedOrtValue&) = delete;
-  ~ScopedOrtValue();
-
-  OrtValue* get_ptr() { return *pptr_; }
-  OrtValue** get_pptr() { return pptr_.get(); }
-
- private:
-  std::unique_ptr<OrtValue*> pptr_;
-};
-
-class ScopedOrtMemoryInfo {
- public:
-  ScopedOrtMemoryInfo();
-  ScopedOrtMemoryInfo(const ScopedOrtMemoryInfo&) = delete;
-  ScopedOrtMemoryInfo& operator=(const ScopedOrtMemoryInfo&) = delete;
-  ~ScopedOrtMemoryInfo();
-
-  OrtMemoryInfo* get_ptr() { return *pptr_; }
-  OrtMemoryInfo** get_pptr() { return pptr_.get(); }
-
- private:
-  std::unique_ptr<OrtMemoryInfo*> pptr_;
-};
-
-class ScopedOrtOpAttr {
- public:
-  ScopedOrtOpAttr();
-  ScopedOrtOpAttr(const ScopedOrtOpAttr&) = delete;
-  ScopedOrtOpAttr& operator=(const ScopedOrtOpAttr&) = delete;
-  ~ScopedOrtOpAttr();
-
-  OrtOpAttr* get_ptr() { return *pptr_; }
-  OrtOpAttr** get_pptr() { return pptr_.get(); }
-
- private:
-  std::unique_ptr<OrtOpAttr*> pptr_;
-};
-
-class ScopedOrtGraph {
- public:
-  ScopedOrtGraph();
-  ScopedOrtGraph(const ScopedOrtGraph&) = delete;
-  ScopedOrtGraph& operator=(const ScopedOrtGraph&) = delete;
-  ~ScopedOrtGraph();
-
-  OrtGraph* get_ptr() { return *pptr_; }
-  OrtGraph** get_pptr() { return pptr_.get(); }
-
- private:
-  std::unique_ptr<OrtGraph*> pptr_;
-};
-
-class ScopedOrtShape {
- public:
-  ScopedOrtShape();
-  ScopedOrtShape(const ScopedOrtShape&) = delete;
-  ScopedOrtShape& operator=(const ScopedOrtShape&) = delete;
-  ~ScopedOrtShape();
-
-  OrtShape* get_ptr() { return *pptr_; }
-  OrtShape** get_pptr() { return pptr_.get(); }
-
- private:
-  std::unique_ptr<OrtShape*> pptr_;
-};
-
-class ScopedOrtValueInfo {
- public:
-  ScopedOrtValueInfo();
-  ScopedOrtValueInfo(const ScopedOrtValueInfo&) = delete;
-  ScopedOrtValueInfo& operator=(const ScopedOrtValueInfo&) = delete;
-  ~ScopedOrtValueInfo();
-
-  OrtValueInfo* get_ptr() { return *pptr_; }
-  OrtValueInfo** get_pptr() { return pptr_.get(); }
-
- private:
-  std::unique_ptr<OrtValueInfo*> pptr_;
-};
-
-class ScopedOrtNode {
- public:
-  ScopedOrtNode();
-  ScopedOrtNode(const ScopedOrtNode&) = delete;
-  ScopedOrtNode& operator=(const ScopedOrtNode&) = delete;
-  ~ScopedOrtNode();
-
-  OrtNode* get_ptr() { return *pptr_; }
-  OrtNode** get_pptr() { return pptr_.get(); }
-
- private:
-  std::unique_ptr<OrtNode*> pptr_;
-};
-
-class ScopedOrtModel {
- public:
-  ScopedOrtModel();
-  ScopedOrtModel(const ScopedOrtModel&) = delete;
-  ScopedOrtModel& operator=(const ScopedOrtModel&) = delete;
-  ~ScopedOrtModel();
-
-  OrtModel* get_ptr() { return *pptr_; }
-  OrtModel** get_pptr() { return pptr_.get(); }
-
- private:
-  std::unique_ptr<OrtModel*> pptr_;
-};
+#define SCOPED_ORT_TYPE_PTR_DECLARATION(ort_type)                           \
+  class ScopedOrt##ort_type##Ptr {                                          \
+   public:                                                                  \
+    ScopedOrt##ort_type##Ptr();                                             \
+    ~ScopedOrt##ort_type##Ptr();                                            \
+    ScopedOrt##ort_type##Ptr(const ScopedOrt##ort_type##Ptr&) = delete;     \
+    ScopedOrt##ort_type##Ptr& operator=(const ScopedOrt##ort_type##Ptr&) =  \
+        delete;                                                             \
+    ScopedOrt##ort_type##Ptr(ScopedOrt##ort_type##Ptr&&);                   \
+    ScopedOrt##ort_type##Ptr& operator=(ScopedOrt##ort_type##Ptr&&);        \
+    operator Ort##ort_type*() const {                                       \
+      return *pptr_;                                                        \
+    }                                                                       \
+    Ort##ort_type* Get() const {                                            \
+      return *pptr_;                                                        \
+    }                                                                       \
+    Ort##ort_type** GetAddressOf() const {                                  \
+      return pptr_.get();                                                   \
+    }                                                                       \
+    Ort##ort_type* Release() {                                              \
+      return *pptr_.release();                                              \
+    }                                                                       \
+                                                                            \
+   private:                                                                 \
+    std::unique_ptr<Ort##ort_type*> pptr_;                                  \
+  };
+
+SCOPED_ORT_TYPE_PTR_DECLARATION(Value)
+SCOPED_ORT_TYPE_PTR_DECLARATION(MemoryInfo)
+SCOPED_ORT_TYPE_PTR_DECLARATION(OpAttr)
+SCOPED_ORT_TYPE_PTR_DECLARATION(TypeInfo)
+SCOPED_ORT_TYPE_PTR_DECLARATION(TensorTypeAndShapeInfo)
+SCOPED_ORT_TYPE_PTR_DECLARATION(ValueInfo)
+SCOPED_ORT_TYPE_PTR_DECLARATION(Node)
+SCOPED_ORT_TYPE_PTR_DECLARATION(Graph)
+SCOPED_ORT_TYPE_PTR_DECLARATION(Model)
 
 }  // namespace webnn::ort
diff --git a/services/webnn/ort/tensor_impl_ort.h b/services/webnn/ort/tensor_impl_ort.h
index f9931c2853915e..64c6df0c3d44a3 100644
--- a/services/webnn/ort/tensor_impl_ort.h
+++ b/services/webnn/ort/tensor_impl_ort.h
@@ -11,7 +11,7 @@
 #include "services/webnn/public/mojom/webnn_tensor.mojom-forward.h"
 #include "services/webnn/queueable_resource_state.h"
 #include "services/webnn/webnn_tensor_impl.h"
-#include "third_party/microsoft_dxheaders/include/onnxruntime_c_api.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/session/onnxruntime_c_api.h"
 
 namespace webnn::ort {
diff --git a/services/webnn/ort/utils_ort.cc b/services/webnn/ort/utils_ort.cc
index 6e17478919c307..bb2965296f27f5 100644
--- a/services/webnn/ort/utils_ort.cc
+++ b/services/webnn/ort/utils_ort.cc
@@ -48,10 +48,10 @@ const OrtApi* GetOrtApi() {
   return platform_functions->ort_api();
 }
 
-const OrtGraphApi* GetOrtGraphApi() {
+const OrtModelBuilderApi* GetOrtModelBuilderApi() {
   PlatformFunctions* platform_functions = PlatformFunctions::GetInstance();
   CHECK(platform_functions);
-  return platform_functions->ort_graph_api();
+  return platform_functions->ort_model_builder_api();
 }
 
 mojom::ErrorPtr CreateError(mojom::Error::Code error_code,
diff --git a/services/webnn/ort/utils_ort.h b/services/webnn/ort/utils_ort.h
index f76d7f3804a1b1..8b502c5a56ea96 100644
--- a/services/webnn/ort/utils_ort.h
+++ b/services/webnn/ort/utils_ort.h
@@ -7,7 +7,7 @@
 
 #include "services/webnn/public/cpp/operand_descriptor.h"
 #include "services/webnn/public/mojom/webnn_error.mojom.h"
-#include "third_party/microsoft_dxheaders/include/onnxruntime_c_api.h"
+#include "third_party/onnxruntime_headers/src/include/onnxruntime/core/session/onnxruntime_c_api.h"
 
 namespace webnn::ort {
 
@@ -16,7 +16,7 @@ ONNXTensorElementDataType OperandTypeToONNXTensorElementDataType(
 
 const OrtApi* GetOrtApi();
 
-const OrtGraphApi* GetOrtGraphApi();
+const OrtModelBuilderApi* GetOrtModelBuilderApi();
 
 mojom::ErrorPtr CreateError(mojom::Error::Code error_code,
                             const std::string& error_message,
diff --git a/third_party/onnxruntime_headers/BUILD.gn b/third_party/onnxruntime_headers/BUILD.gn
new file mode 100644
index 00000000000000..6e56ff1b2aa9a7
--- /dev/null
+++ b/third_party/onnxruntime_headers/BUILD.gn
@@ -0,0 +1,11 @@
+# Copyright 2024 The Chromium Authors
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+config("onnxruntime_headers_config") {
+  include_dirs = [ "src/include/onnxruntime/core/session" ]
+}
+
+source_set("onnxruntime_headers") {
+  public_configs = [ ":onnxruntime_headers_config" ]
+}
diff --git a/third_party/onnxruntime_headers/README.chromium b/third_party/onnxruntime_headers/README.chromium
new file mode 100644
index 00000000000000..a266bd00391ebe
--- /dev/null
+++ b/third_party/onnxruntime_headers/README.chromium
@@ -0,0 +1,6 @@
+Name: onnxruntime_headers
+URL: https://github.com/microsoft/onnxruntime/tree/main/include
+Revision: 6e76179a4e1e76761bfd7be2ad6d12c3f99ec938
+
+Description:
+This directory contains a copy of the ONNX Runtime headers.
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/basic_types.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/basic_types.h
new file mode 100644
index 00000000000000..3eb4869377d406
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/basic_types.h
@@ -0,0 +1,19 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+
+namespace onnxruntime {
+
+/** A computed hash value. */
+using HashValue = uint64_t;
+
+/** The type of an argument (input or output).*/
+enum class ArgType : uint8_t {
+  kInput,
+  kOutput,
+};
+
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/code_location.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/code_location.h
new file mode 100644
index 00000000000000..dbff69099ba78f
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/code_location.h
@@ -0,0 +1,58 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace onnxruntime {
+/**
+   CodeLocation captures information on where in the source code a message came from.
+*/
+struct CodeLocation {
+  /**
+     @param file_path Usually the value of __FILE__
+     @param line Usually the value of __LINE__
+     @param func Usually the value of __PRETTY_FUNCTION__ or __FUNCTION__
+  */
+  CodeLocation(const char* file_path, const int line, const char* func)
+      : file_and_path{file_path}, line_num{line}, function{func} {
+  }
+
+  /**
+     @param file_path Usually the value of __FILE__
+     @param line Usually the value of __LINE__
+     @param func Usually the value of __PRETTY_FUNCTION__ or __FUNCTION__
+     @param stacktrace Stacktrace from source of message.
+  */
+  CodeLocation(const char* file_path, const int line, const char* func,
+               const std::vector<std::string>& stacktrace)
+      : file_and_path{file_path}, line_num{line}, function{func}, stacktrace(stacktrace) {
+  }
+
+  std::string FileNoPath() const {
+    // assuming we always have work to do, so not trying to avoid creating a new string if
+    // no path was removed.
+    return file_and_path.substr(file_and_path.find_last_of("/\\") + 1);
+  }
+
+  enum Format {
+    kFilename,
+    kFilenameAndPath
+  };
+
+  std::string ToString(Format format = Format::kFilename) const {
+    std::ostringstream out;
+    out << (format == Format::kFilename ? FileNoPath() : file_and_path) << ":" << line_num << " " << function;
+    return out.str();
+  }
+
+  // utf-8. Because on Windows we compile our code with "/utf-8". And we assume the other platforms only use utf-8.
+  const std::string file_and_path;
+  const int line_num;
+  // utf-8
+  const std::string function;
+  const std::vector<std::string> stacktrace;
+};
+
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/common.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/common.h
new file mode 100644
index 00000000000000..0822eba950f500
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/common.h
@@ -0,0 +1,286 @@
+/**
+ * Copyright (c) 2016-present, Facebook, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Portions Copyright (c) Microsoft Corporation
+
+#pragma once
+
+#include <algorithm>
+#include <chrono>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "core/common/code_location.h"
+#include "core/common/exceptions.h"
+#include "core/common/make_string.h"
+#include "core/common/status.h"
+
+namespace onnxruntime {
+
+using TimePoint = std::chrono::high_resolution_clock::time_point;
+
+#ifdef _WIN32
+#define ORT_UNUSED_PARAMETER(x) (x)
+#else
+#define ORT_UNUSED_PARAMETER(x) (void)(x)
+#endif
+
+#ifndef ORT_HAVE_ATTRIBUTE
+#ifdef __has_attribute
+#define ORT_HAVE_ATTRIBUTE(x) __has_attribute(x)
+#else
+#define ORT_HAVE_ATTRIBUTE(x) 0
+#endif
+#endif
+
+// ORT_ATTRIBUTE_UNUSED
+//
+// Prevents the compiler from complaining about or optimizing away variables
+// that appear unused on Linux
+#if ORT_HAVE_ATTRIBUTE(unused) || (defined(__GNUC__) && !defined(__clang__))
+#undef ORT_ATTRIBUTE_UNUSED
+#define ORT_ATTRIBUTE_UNUSED __attribute__((__unused__))
+#else
+#define ORT_ATTRIBUTE_UNUSED
+#endif
+
+#ifdef ORT_NO_EXCEPTIONS
+// Print the given final message, the message must be a null terminated char*
+// ORT will abort after printing the message.
+// For Android, will print to Android system log
+// For other platforms, will print to stderr
+void PrintFinalMessage(const char* msg);
+#endif
+
+// macro to explicitly ignore the return value from a function call so Code Analysis doesn't complain
+#define ORT_IGNORE_RETURN_VALUE(fn) \
+  static_cast<void>(fn)
+
+std::vector<std::string> GetStackTrace();
+// these is a helper function that gets defined by platform/Telemetry
+void LogRuntimeError(uint32_t session_id, const common::Status& status, const char* file,
+                     const char* function, uint32_t line);
+
+// __PRETTY_FUNCTION__ isn't a macro on gcc, so use a check for _MSC_VER
+// so we only define it as one for MSVC
+#if (_MSC_VER && !defined(__PRETTY_FUNCTION__))
+#define __PRETTY_FUNCTION__ __FUNCTION__
+#endif
+
+// Capture where a message is coming from. Use __FUNCTION__ rather than the much longer __PRETTY_FUNCTION__
+#define ORT_WHERE ::onnxruntime::CodeLocation(__FILE__, __LINE__, static_cast<const char*>(__FUNCTION__))
+
+#define ORT_WHERE_WITH_STACK \
+  ::onnxruntime::CodeLocation(__FILE__, __LINE__, static_cast<const char*>(__PRETTY_FUNCTION__), ::onnxruntime::GetStackTrace())
+
+#ifdef ORT_NO_EXCEPTIONS
+
+#define ORT_TRY if (true)
+#define ORT_CATCH(x) else if (false)
+#define ORT_RETHROW
+
+// In order to ignore the catch statement when a specific exception (not ... ) is caught and referred
+// in the body of the catch statements, it is necessary to wrap the body of the catch statement into
+// a lambda function. otherwise the exception referred will be undefined and cause build break
+#define ORT_HANDLE_EXCEPTION(func)
+
+// Throw an exception with optional message.
+// NOTE: The arguments get streamed into a string via ostringstream::operator<<
+// DO NOT use a printf format string, as that will not work as you expect.
+#define ORT_THROW(...)                                                     \
+  do {                                                                     \
+    ::onnxruntime::PrintFinalMessage(                                      \
+        ::onnxruntime::OnnxRuntimeException(                               \
+            ORT_WHERE_WITH_STACK, ::onnxruntime::MakeString(__VA_ARGS__))  \
+            .what());                                                      \
+    abort();                                                               \
+  } while (false)
+
+// Just in order to mark things as not implemented. Do not use in final code.
+#define ORT_NOT_IMPLEMENTED(...)                                                         \
+  do {                                                                                   \
+    ::onnxruntime::PrintFinalMessage(                                                    \
+        ::onnxruntime::NotImplementedException(::onnxruntime::MakeString(__VA_ARGS__))   \
+            .what());                                                                    \
+    abort();                                                                             \
+  } while (false)
+
+// Check condition.
+// NOTE: The arguments get streamed into a string via ostringstream::operator<<
+// DO NOT use a printf format string, as that will not work as you expect.
+#define ORT_ENFORCE(condition, ...)                                                  \
+  do {                                                                               \
+    if (!(condition)) {                                                              \
+      ::onnxruntime::PrintFinalMessage(                                              \
+          ::onnxruntime::OnnxRuntimeException(ORT_WHERE_WITH_STACK, #condition,      \
+                                              ::onnxruntime::MakeString(__VA_ARGS__)) \
+              .what());                                                              \
+      abort();                                                                       \
+    }                                                                                \
+  } while (false)
+
+#define ORT_THROW_EX(ex, ...)                                                                       \
+  do {                                                                                              \
+    ::onnxruntime::PrintFinalMessage(                                                               \
+        ::onnxruntime::MakeString(#ex, "(", ::onnxruntime::MakeString(__VA_ARGS__), ")").c_str());  \
+    abort();                                                                                        \
+  } while (false)
+
+#else
+
+#define ORT_TRY try
+#define ORT_CATCH(x) catch (x)
+#define ORT_RETHROW throw;
+
+#define ORT_HANDLE_EXCEPTION(func) func()
+
+// Throw an exception with optional message.
+// NOTE: The arguments get streamed into a string via ostringstream::operator<<
+// DO NOT use a printf format string, as that will not work as you expect.
+#define ORT_THROW(...) \
+  throw ::onnxruntime::OnnxRuntimeException(ORT_WHERE_WITH_STACK, ::onnxruntime::MakeString(__VA_ARGS__))
+
+// Just in order to mark things as not implemented. Do not use in final code.
+#define ORT_NOT_IMPLEMENTED(...) \
+  throw ::onnxruntime::NotImplementedException(::onnxruntime::MakeString(__VA_ARGS__))
+
+// Check condition.
+// NOTE: The arguments get streamed into a string via ostringstream::operator<<
+// DO NOT use a printf format string, as that will not work as you expect.
+#define ORT_ENFORCE(condition, ...)                                                      \
+  do {                                                                                   \
+    if (!(condition)) {                                                                  \
+      throw ::onnxruntime::OnnxRuntimeException(ORT_WHERE_WITH_STACK, #condition,        \
+                                                ::onnxruntime::MakeString(__VA_ARGS__)); \
+    }                                                                                    \
+  } while (false)
+
+#define ORT_THROW_EX(ex, ...) \
+  throw ex(__VA_ARGS__)
+
+#endif
+
+#define ORT_MAKE_STATUS(category, code, ...)                     \
+  ::onnxruntime::common::Status(::onnxruntime::common::category, \
+                                ::onnxruntime::common::code,     \
+                                ::onnxruntime::MakeString(__VA_ARGS__))
+
+// Check condition. if met, return status.
+#define ORT_RETURN_IF(condition, ...)                                                       \
+  do {                                                                                      \
+    if (condition) {                                                                        \
+      return ::onnxruntime::common::Status(::onnxruntime::common::ONNXRUNTIME,              \
+                                           ::onnxruntime::common::FAIL,                     \
+                                           ::onnxruntime::MakeString(ORT_WHERE.ToString(), " ", __VA_ARGS__)); \
+    }                                                                                       \
+  } while (false)
+
+// Check condition. if not met, return status.
+#define ORT_RETURN_IF_NOT(condition, ...) \
+  ORT_RETURN_IF(!(condition), __VA_ARGS__)
+
+// Macros to disable the copy and/or move ctor and assignment methods
+// These are usually placed in the private: declarations for a class.
+
+#define ORT_DISALLOW_COPY(TypeName) TypeName(const TypeName&) = delete
+
+#define ORT_DISALLOW_ASSIGNMENT(TypeName) TypeName& operator=(const TypeName&) = delete
+
+#define ORT_DISALLOW_COPY_AND_ASSIGNMENT(TypeName) \
+  ORT_DISALLOW_COPY(TypeName);                     \
+  ORT_DISALLOW_ASSIGNMENT(TypeName)
+
+#define ORT_DISALLOW_MOVE(TypeName) \
+  TypeName(TypeName&&) = delete;    \
+  TypeName& operator=(TypeName&&) = delete
+
+#define ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TypeName) \
+  ORT_DISALLOW_COPY_AND_ASSIGNMENT(TypeName);           \
+  ORT_DISALLOW_MOVE(TypeName)
+
+#define ORT_RETURN_IF_ERROR_SESSIONID(expr, session_id)                                                              \
+  do {                                                                                                               \
+    auto _status = (expr);                                                                                           \
+    if ((!_status.IsOK())) {                                                                                         \
+      ::onnxruntime::LogRuntimeError(session_id, _status, __FILE__, static_cast<const char*>(__FUNCTION__), __LINE__); \
+      return _status;                                                                                                \
+    }                                                                                                                \
+  } while (0)
+
+#define ORT_RETURN_IF_ERROR_SESSIONID_(expr) ORT_RETURN_IF_ERROR_SESSIONID(expr, session_id_)
+#define ORT_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR_SESSIONID(expr, 0)
+
+#define ORT_THROW_IF_ERROR(expr)                                                                            \
+  do {                                                                                                      \
+    auto _status = (expr);                                                                                  \
+    if ((!_status.IsOK())) {                                                                                \
+      ::onnxruntime::LogRuntimeError(0, _status, __FILE__, static_cast<const char*>(__FUNCTION__), __LINE__); \
+      ORT_THROW(_status);                                                                                   \
+    }                                                                                                       \
+  } while (0)
+
+// use this macro when cannot early return
+#define ORT_CHECK_AND_SET_RETVAL(expr) \
+  do {                                 \
+    if (retval.IsOK()) {               \
+      retval = (expr);                 \
+    }                                  \
+  } while (0)
+
+inline long long TimeDiffMicroSeconds(TimePoint start_time) {
+  auto end_time = std::chrono::high_resolution_clock::now();
+  return std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+}
+
+inline long long TimeDiffMicroSeconds(TimePoint start_time, TimePoint end_time) {
+  return std::chrono::duration_cast<std::chrono::microseconds>(end_time - start_time).count();
+}
+
+struct null_type {};
+inline std::string ToUTF8String(const std::string& s) { return s; }
+#ifdef _WIN32
+/**
+ * Convert a wide character string to a UTF-8 string
+ */
+std::string ToUTF8String(const std::wstring& s);
+
+std::wstring ToWideString(const std::string& s);
+inline std::wstring ToWideString(const std::wstring& s) { return s; }
+#else
+inline std::string ToWideString(const std::string& s) { return s; }
+#endif
+
+constexpr size_t kMaxStrLen = 2048;
+
+// Returns whether `key` is in `container`.
+// Like C++20's map/set contains() member function.
+template <typename Key, typename... OtherContainerArgs,
+          template <typename...> typename AssociativeContainer,
+          typename LookupKey>
+inline bool Contains(const AssociativeContainer<Key, OtherContainerArgs...>& container, LookupKey&& key) {
+  return container.find(std::forward<LookupKey>(key)) != container.end();
+}
+
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/const_pointer_container.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/const_pointer_container.h
new file mode 100644
index 00000000000000..1d821ba6092050
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/const_pointer_container.h
@@ -0,0 +1,85 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <type_traits>
+
+namespace onnxruntime {
+/**
+   Container has T* entries. e.g. std::vector<T*>, and this class provides const access to those
+   via iterators and direct access, as the standard behavior only makes the pointer constant,
+   and not what is pointed too. i.e. you get a const pointer to T not a pointer to const T without this wrapper.
+   See https://stackoverflow.com/questions/8017036/understanding-const-iterator-with-pointers
+*/
+template <typename Container>
+class ConstPointerContainer {
+ public:
+  using T = typename std::remove_pointer<typename Container::value_type>::type;
+
+  class ConstIterator {
+   public:
+    using const_iterator = typename Container::const_iterator;
+    using iterator_category = std::input_iterator_tag;
+    using value_type = T*;
+    using difference_type = std::ptrdiff_t;
+    using pointer = T**;
+    using reference = T*&;
+
+    /** Construct iterator for container that will return const T* entries.*/
+    explicit ConstIterator(const_iterator position) noexcept : current_{position}, item_{nullptr} {}
+    ConstIterator(const ConstIterator& other) = default;
+    ConstIterator& operator=(const ConstIterator& other) = default;
+
+    bool operator==(const ConstIterator& other) const noexcept { return current_ == other.current_; }
+    bool operator!=(const ConstIterator& other) const noexcept { return current_ != other.current_; }
+
+    ConstIterator& operator++() {
+      ++current_;
+      return *this;
+    }
+
+    ConstIterator operator++(int) {
+      ConstIterator tmp{*this};
+      ++(*this);
+      return tmp;
+    }
+
+    const T*& operator*() const {
+      item_ = *current_;
+      return item_;
+    }
+
+    const T** operator->() const { return &(operator*()); };
+
+   private:
+    const_iterator current_;
+    mutable const T* item_;
+  };
+
+  /**
+     Construct wrapper class that will provide const access to the pointers in a container of non-const pointers.
+     @param data Container with non-const pointers. e.g. std::vector<T*>
+  */
+  explicit ConstPointerContainer(const Container& data) noexcept : data_(data) {}
+
+  size_t size() const noexcept { return data_.size(); }
+  bool empty() const noexcept { return data_.empty(); }
+
+  ConstIterator cbegin() const noexcept { return ConstIterator(data_.cbegin()); }
+  ConstIterator cend() const noexcept { return ConstIterator(data_.cend()); }
+
+  ConstIterator begin() const noexcept { return ConstIterator(data_.cbegin()); }
+  ConstIterator end() const noexcept { return ConstIterator(data_.cend()); }
+
+  const T* operator[](size_t index) const { return data_[index]; }
+
+  const T* at(size_t index) const {
+    ORT_ENFORCE(index < data_.size());
+    return data_[index];
+  }
+
+ private:
+  const Container& data_;
+};
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/denormal.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/denormal.h
new file mode 100644
index 00000000000000..ca944117813116
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/denormal.h
@@ -0,0 +1,12 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace onnxruntime {
+
+// Set or unset flush-to-zero and denormal-as-zero if SSE3 instructions are supported.
+// Return true if SSE3 instruction is supported, otherwise return false.
+bool SetDenormalAsZero(bool on);
+
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/eigen_common_wrapper.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/eigen_common_wrapper.h
new file mode 100644
index 00000000000000..19efa7bcff107d
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/eigen_common_wrapper.h
@@ -0,0 +1,76 @@
+//-----------------------------------------------------------------------------
+//
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+//
+//-----------------------------------------------------------------------------
+#pragma once
+#include "onnxruntime_config.h"
+// build/external/eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h:162:71:
+// error: ignoring attributes on template argument "Eigen::PacketType<const float, Eigen::DefaultDevice>::type {aka __vector(4) float}" [-Werror=ignored-attributes]
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#if __GNUC__ >= 6
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wunused-result"
+#ifdef HAS_DEPRECATED_COPY
+#pragma GCC diagnostic ignored "-Wdeprecated-copy"
+#endif
+// cmake/external/eigen/unsupported/Eigen/CXX11/../../../Eigen/src/Core/arch/NEON/PacketMath.h:1633:9:
+// error: ‘void* memcpy(void*, const void*, size_t)’ copying an object of non-trivial type ‘Eigen::internal::Packet4c’
+// {aka ‘struct Eigen::internal::eigen_packet_wrapper<int, 2>’} from an array of ‘const int8_t’
+// {aka ‘const signed char’} [-Werror=class-memaccess]
+#ifdef HAS_CLASS_MEMACCESS
+#pragma GCC diagnostic ignored "-Wclass-memaccess"
+#endif
+
+// cmake/external/eigen\Eigen/src/Core/util/Meta.h:454:25:
+// error: 'result_of<Eigen::internal::scalar_sum_op<unsigned long long, unsigned long long>
+// (const unsigned long long &, const unsigned long long &)>'
+// is deprecated [-Werror,-Wdeprecated-declarations]
+// typedef typename std::result_of<decltype(func)(Args...)>::type type1;
+#ifdef HAS_DEPRECATED_DECLARATIONS
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
+// cmake/external/eigen\Eigen/CXX11/src/Tensor/TensorTrace.h:130:9:
+// error: variable 'num_distinct_reduce_dims' set but not used [-Werror,-Wunused-but-set-variable]
+// int num_distinct_reduce_dims = 0;
+#ifdef HAS_UNUSED_BUT_SET_VARIABLE
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+
+// eigen-src/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h:231:56: error: implicit conversion loses integer
+// precision: 'uint64_t' (aka 'unsigned long long') to 'size_t' (aka 'unsigned long') [-Werror,-Wshorten-64-to-32]
+// next = wnext == kStackMask ? nullptr : &waiters_[wnext];
+//        ~~~~~~~~              ^~~~~
+#ifdef HAS_SHORTEN_64_TO_32
+#pragma GCC diagnostic ignored "-Wshorten-64-to-32"
+#endif
+
+// eigen-src/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h:215:9:
+// error: implicit capture of 'this' with a capture default of '=' is deprecated [-Werror,-Wdeprecated-this-capture]
+#ifdef HAS_DEPRECATED_THIS_CAPTURE
+#pragma GCC diagnostic ignored "-Wdeprecated-this-capture"
+#endif
+
+#elif defined(_MSC_VER)
+// build\windows\debug\external\eigen3\unsupported\eigen\cxx11\src/Tensor/Tensor.h(76):
+// warning C4554: '&': check operator precedence for possible error; use parentheses to clarify precedence
+
+// unsupported\eigen\cxx11\src\Tensor\TensorUInt128.h(150,0): Warning C4245: 'initializing': conversion from '__int64'
+// to 'uint64_t', signed/unsigned mismatch
+#pragma warning(push)
+#pragma warning(disable : 4554)
+#pragma warning(disable : 4245)
+#pragma warning(disable : 4127)
+#endif
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#elif defined(_MSC_VER)
+#pragma warning(pop)
+#endif
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/exceptions.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/exceptions.h
new file mode 100644
index 00000000000000..494a770b8db985
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/exceptions.h
@@ -0,0 +1,71 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <algorithm>
+#include <exception>
+#include <iterator>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "core/common/common.h"
+#include "core/common/code_location.h"
+
+namespace onnxruntime {
+
+class NotImplementedException : public std::logic_error {
+ public:
+  explicit NotImplementedException(const char* _Message = "Function not yet implemented") noexcept : std::logic_error(_Message) {};
+  explicit NotImplementedException(const std::string& _Message = "Function not yet implemented") noexcept : std::logic_error(_Message) {};
+};
+
+class TypeMismatchException : public std::logic_error {
+ public:
+  TypeMismatchException() noexcept : logic_error("Type mismatch") {};
+};
+
+class OnnxRuntimeException : public std::exception {
+ public:
+  OnnxRuntimeException(const CodeLocation& location, const std::string& msg) noexcept
+      : OnnxRuntimeException(location, nullptr, msg) {
+  }
+
+  /**
+     Create a new exception that captures the location it was thrown from.
+     @param location Location in the source code the exception is being thrown from
+     @param failed_condition Optional string containing the condition that failed.
+     e.g. "tensor.Size() == input.Size()". May be nullptr.
+     @param msg Message containing additional information about the exception cause.
+  */
+  OnnxRuntimeException(const CodeLocation& location, const char* failed_condition, const std::string& msg)
+      : location_{location} {
+    std::ostringstream ss;
+
+    ss << location.ToString(CodeLocation::kFilenameAndPath);  // output full path in case just the filename is ambiguous
+    if (failed_condition != nullptr) {
+      ss << " " << failed_condition << " was false.";
+    }
+
+    ss << " " << msg << "\n";
+    if (!location.stacktrace.empty()) {
+      ss << "Stacktrace:\n";
+      // skip the first entry in the stacktrace as we have that information from location.ToString()
+      std::copy(std::next(location.stacktrace.begin()), location.stacktrace.end(), std::ostream_iterator<std::string>(ss, "\n"));
+    }
+
+    what_ = ss.str();
+  }
+
+  const char* what() const noexcept override {
+    return what_.c_str();
+  }
+
+ private:
+  const CodeLocation location_;
+  const std::vector<std::string> stacktrace_;
+  std::string what_;
+};
+
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/gpu_profiler_common.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/gpu_profiler_common.h
new file mode 100644
index 00000000000000..00d5033ef2df40
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/gpu_profiler_common.h
@@ -0,0 +1,472 @@
+#pragma once
+
+#include "core/common/profiler_common.h"
+#include "core/common/inlined_containers.h"
+
+#include <chrono>
+#include <cstdint>
+#include <cstring>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+namespace onnxruntime {
+namespace profiling {
+
+// The classes in this header are implemented as template/inline classes
+// to avoid having to export symbols from the main onnxruntime shared library
+// to ExecutionProvider (EP) shared libraries.
+// More context: The main onnxruntime shared library is optimized for size
+// using --gc-sections during link time to ensure that any unreferenced code
+// is not retained. This poses a problem in using a design pattern where the
+// (abstract) base class is implemented in the main onnxruntime shared library,
+// but (concrete) subclasses are implemented in EP shared libraries. Now, because
+// EP shared libraries are loaded at runtime (as of 11/2022), there will be no
+// references to the base class symbols when the main onnxruntime shared library
+// is compiled. Thus, the base class symbols will not be included in the
+// main onnxruntime shared library. This manifests in being unable to load
+// EP shared libs (because the base class symbols referenced by derived
+// classes are missing).
+// We solve this by implementing base classes that are common to all GPU profilers
+// inline in this header.
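To make the CRTP contract just described concrete, here is a minimal editorial sketch of what an EP-side subclass could look like. It is not part of the patch; every name except GPUTracerManager, ProfilerActivityBuffer and TimePoint is invented, and the hook signatures follow the documentation-only `#if 0` block further down in this header:

// Editorial sketch, not part of the upstream header.
class FakeGpuTracerManager : public GPUTracerManager<FakeGpuTracerManager> {
  // The base template reaches the hooks via static_cast<TDerived*>(this),
  // so they must be visible to it; keeping them private requires friendship.
  friend class GPUTracerManager<FakeGpuTracerManager>;

 public:
  static FakeGpuTracerManager& GetInstance() {
    static FakeGpuTracerManager instance;  // GPUProfilerBase<T> expects a GetInstance()
    return instance;
  }

 private:
  bool OnStartLogging() { return true; }  // hook up the vendor tracing API here
  void OnStopLogging() {}
  void FlushActivities() {}
  void ProcessActivityBuffers(const std::vector<ProfilerActivityBuffer>& /*buffers*/,
                              const TimePoint& /*start_time*/) {}
  bool PushUniqueCorrelation(uint64_t /*unique_cid*/) { return true; }
  void PopUniqueCorrelation(uint64_t& popped_unique_cid) { popped_unique_cid = 0; }
  uint64_t GetGPUTimestampInNanoseconds() { return 0; }
};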
+
+class ProfilerActivityBuffer {
+ public:
+  ProfilerActivityBuffer() noexcept
+      : data_(nullptr), size_(0) {}
+
+  ProfilerActivityBuffer(const char* data, size_t size) noexcept
+      : data_(std::make_unique<char[]>(size)), size_(size) {
+    memcpy(data_.get(), data, size_);
+  }
+
+  ProfilerActivityBuffer(const ProfilerActivityBuffer& other) noexcept
+      : ProfilerActivityBuffer(other.GetData(), other.GetSize()) {}
+
+  ProfilerActivityBuffer(ProfilerActivityBuffer&& other) noexcept
+      : ProfilerActivityBuffer() {
+    std::swap(data_, other.data_);
+    std::swap(size_, other.size_);
+  }
+
+  ProfilerActivityBuffer& operator=(const ProfilerActivityBuffer& other) noexcept {
+    if (&other == this) {
+      return *this;
+    }
+
+    new (this) ProfilerActivityBuffer{other};
+    return *this;
+  }
+
+  ProfilerActivityBuffer& operator=(ProfilerActivityBuffer&& other) noexcept {
+    if (&other == this) {
+      return *this;
+    }
+
+    new (this) ProfilerActivityBuffer{std::move(other)};
+    return *this;
+  }
+
+  static ProfilerActivityBuffer CreateFromPreallocatedBuffer(std::unique_ptr<char[]>&& buffer_ptr, size_t size) {
+    ProfilerActivityBuffer res{};
+    res.data_ = std::move(buffer_ptr);
+    res.size_ = size;
+    return res;
+  }
+
+  // accessors
+  char* GetData() { return data_.get(); }
+  const char* GetData() const { return data_.get(); }
+  size_t GetSize() const { return size_; }
+
+ private:
+  std::unique_ptr<char[]> data_;
+  size_t size_;
+};  /* end class ProfilerActivityBuffer */
+
+template <typename TDerived>
+class GPUTracerManager {
+ public:
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(GPUTracerManager);
+  virtual ~GPUTracerManager() {}
+
+  uint64_t RegisterClient() {
+    std::lock_guard<std::mutex> lock(manager_instance_mutex_);
+    auto res = next_client_id_++;
+    per_client_events_by_ext_correlation_.insert({res, {}});
+    ++num_active_clients_;
+    return res;
+  }
+
+  void DeregisterClient(uint64_t client_handle) {
+    std::lock_guard<std::mutex> lock(manager_instance_mutex_);
+    auto it = per_client_events_by_ext_correlation_.find(client_handle);
+    if (it == per_client_events_by_ext_correlation_.end()) {
+      return;
+    }
+    per_client_events_by_ext_correlation_.erase(it);
+    --num_active_clients_;
+    if (num_active_clients_ == 0 && tracing_enabled_) {
+      StopLogging();
+    }
+  }
+
+  void StartLogging() {
+    std::lock_guard<std::mutex> lock(manager_instance_mutex_);
+    if (tracing_enabled_) {
+      return;
+    }
+
+    auto this_as_derived = static_cast<TDerived*>(this);
+    tracing_enabled_ = this_as_derived->OnStartLogging();
+  }
+
+  void Consume(uint64_t client_handle, const TimePoint& start_time, std::map<uint64_t, Events>& events) {
+    auto this_as_derived = static_cast<TDerived*>(this);
+    events.clear();
+    {
+      // Flush any pending activity records before starting
+      // to process the accumulated activity records.
+      std::lock_guard<std::mutex> lock_manager(manager_instance_mutex_);
+      if (!tracing_enabled_) {
+        return;
+      }
+
+      this_as_derived->FlushActivities();
+    }
+
+    std::vector<ProfilerActivityBuffer> activity_buffers;
+    {
+      std::lock_guard<std::mutex> lock(unprocessed_activity_buffers_mutex_);
+      std::swap(unprocessed_activity_buffers_, activity_buffers);
+      unprocessed_activity_buffers_.clear();
+    }
+
+    {
+      // Ensure that at most one thread is working through the activity buffers at any time.
+      std::lock_guard<std::mutex> lock_two(activity_buffer_processor_mutex_);
+      this_as_derived->ProcessActivityBuffers(activity_buffers, start_time);
+      auto it = per_client_events_by_ext_correlation_.find(client_handle);
+      if (it == per_client_events_by_ext_correlation_.end()) {
+        return;
+      }
+      std::swap(events, it->second);
+    }
+  }
+
+  void PushCorrelation(uint64_t client_handle,
+                       uint64_t external_correlation_id,
+                       TimePoint profiling_start_time) {
+    auto this_as_derived = static_cast<TDerived*>(this);
+    std::lock_guard<std::mutex> lock(manager_instance_mutex_);
+    if (!tracing_enabled_) {
+      return;
+    }
+
+    auto it = per_client_events_by_ext_correlation_.find(client_handle);
+    if (it == per_client_events_by_ext_correlation_.end()) {
+      // not a registered client, do nothing
+      return;
+    }
+
+    // external_correlation_id is simply the timestamp of this event,
+    // relative to profiling_start_time. i.e., it was computed as:
+    // external_correlation_id =
+    //   std::chrono::duration_cast<std::chrono::nanoseconds>(event_start_time - profiling_start_time).count()
+    //
+    // Because of the relative nature of the external_correlation_id, the same
+    // external_correlation_id can be reused across different clients, which then makes it
+    // impossible to recover the client from the external_correlation_id, which in turn
+    // makes it impossible to map events (which are tagged with external_correlation_id) to clients.
+    //
+    // To address these difficulties, we construct a new correlation_id (let's call it unique_cid)
+    // as follows:
+    // unique_cid =
+    //   external_correlation_id +
+    //   std::chrono::duration_cast<std::chrono::nanoseconds>(profiling_start_time.time_since_epoch()).count()
+    // now, unique_cid is monotonically increasing with time, so it can be used to reliably map events to clients.
+    //
+    // Of course, clients expect lists of events to be returned (on a call to Consume()), that are
+    // still keyed on the external_correlation_id that they've specified here, so we need to remember the
+    // offset to be subtracted
+    uint64_t offset = std::chrono::duration_cast<std::chrono::nanoseconds>(profiling_start_time.time_since_epoch()).count();
+    auto unique_cid = external_correlation_id + offset;
+    unique_correlation_id_to_client_offset_[unique_cid] = std::make_pair(client_handle, offset);
+    this_as_derived->PushUniqueCorrelation(unique_cid);
+  }
+
+  void PopCorrelation(uint64_t& popped_external_correlation_id) {
+    auto this_as_derived = static_cast<TDerived*>(this);
+    std::lock_guard<std::mutex> lock(manager_instance_mutex_);
+    if (!tracing_enabled_) {
+      return;
+    }
+    uint64_t unique_cid;
+    this_as_derived->PopUniqueCorrelation(unique_cid);
+    // lookup the offset and subtract it before returning popped_external_correlation_id to the client
+    auto client_it = unique_correlation_id_to_client_offset_.find(unique_cid);
+    if (client_it == unique_correlation_id_to_client_offset_.end()) {
+      popped_external_correlation_id = 0;
+      return;
+    }
+    popped_external_correlation_id = unique_cid - client_it->second.second;
+  }
+
+  void PopCorrelation() {
+    uint64_t unused;
+    PopCorrelation(unused);
+  }
+
+ protected:
+  GPUTracerManager() {
+    auto this_as_derived = static_cast<TDerived*>(this);
+    uint64_t gpu_ts1, gpu_ts2, cpu_ts;
+
+    // Get the CPU and GPU timestamps to warm up
+    gpu_ts1 = this_as_derived->GetGPUTimestampInNanoseconds();
+    cpu_ts = this->GetCPUTimestampInNanoseconds();
+
+    // Estimate the skew/offset between the CPU and GPU timestamps.
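// [Editorial worked example, not part of the upstream header] With hypothetical
// readings gpu_ts1 = 100 and gpu_ts2 = 140 (both ns), and cpu_ts = 5'120 ns taken
// between them, gpu_ts = (100 + 140) / 2 = 120 approximates the GPU clock at the
// moment the CPU clock read 5'120, so the stored offset is 5'120 - 120 = 5'000,
// and a later GPU timestamp t is normalized to the CPU epoch as t + 5'000.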
+    gpu_ts1 = this_as_derived->GetGPUTimestampInNanoseconds();
+    cpu_ts = this->GetCPUTimestampInNanoseconds();
+    gpu_ts2 = this_as_derived->GetGPUTimestampInNanoseconds();
+
+    auto gpu_ts = (gpu_ts1 + gpu_ts2) / 2;
+    offset_to_add_to_gpu_timestamps_ = cpu_ts - gpu_ts;
+  }
+
+#if 0
+  // Functional API to be implemented by subclasses
+  // Included here only for documentation purposes
+protected:
+  bool OnStartLogging();
+  void OnStopLogging();
+  void ProcessActivityBuffers(const std::vector<ProfilerActivityBuffer>& buffers,
+                              const TimePoint& start_time);
+  bool PushUniqueCorrelation(uint64_t unique_cid);
+  void PopUniqueCorrelation(uint64_t& popped_unique_cid);
+  void FlushActivities();
+  uint64_t GetGPUTimestampInNanoseconds();
+#endif
+
+  void EnqueueActivityBuffer(ProfilerActivityBuffer&& buffer) {
+    std::lock_guard<std::mutex> lock(unprocessed_activity_buffers_mutex_);
+    unprocessed_activity_buffers_.emplace_back(std::move(buffer));
+  }
+
+  // To be called by subclasses only from ProcessActivityBuffers
+  void MapEventToClient(uint64_t tracer_correlation_id, EventRecord&& event) {
+    auto it = tracer_correlation_to_unique_correlation_.find(tracer_correlation_id);
+    if (it == tracer_correlation_to_unique_correlation_.end()) {
+      // We're yet to receive a mapping to unique_correlation_id for this tracer_correlation_id
+      DeferEventMapping(std::move(event), tracer_correlation_id);
+      return;
+    }
+    auto unique_correlation_id = it->second;
+    auto p_event_list = GetEventListForUniqueCorrelationId(unique_correlation_id);
+    if (p_event_list != nullptr) {
+      p_event_list->emplace_back(std::move(event));
+    }
+  }
+
+  // To be called by subclasses only from ProcessActivityBuffers
+  void NotifyNewCorrelation(uint64_t tracer_correlation_id, uint64_t unique_correlation_id) {
+    tracer_correlation_to_unique_correlation_[tracer_correlation_id] = unique_correlation_id;
+    auto pending_it = events_pending_client_mapping_.find(tracer_correlation_id);
+    if (pending_it == events_pending_client_mapping_.end()) {
+      return;
+    }
+    // Map the pending events to the right client
+    MapEventsToClient(unique_correlation_id, std::move(pending_it->second));
+    events_pending_client_mapping_.erase(pending_it);
+  }
+
+  uint64_t NormalizeGPUTimestampToCPUEpoch(uint64_t gpu_timestamp_in_nanoseconds) {
+    return gpu_timestamp_in_nanoseconds + this->offset_to_add_to_gpu_timestamps_;
+  }
+
+ private:
+  // Requires: manager_instance_mutex_ should be held
+  void StopLogging() {
+    auto this_as_derived = static_cast<TDerived*>(this);
+    if (!tracing_enabled_) {
+      return;
+    }
+    this_as_derived->OnStopLogging();
+    tracing_enabled_ = false;
+    Clear();
+  }
+
+  // Requires: manager_instance_mutex_ should be held
+  void Clear() {
+    unprocessed_activity_buffers_.clear();
+    unique_correlation_id_to_client_offset_.clear();
+    per_client_events_by_ext_correlation_.clear();
+    tracer_correlation_to_unique_correlation_.clear();
+    events_pending_client_mapping_.clear();
+  }
+
+  Events* GetEventListForUniqueCorrelationId(uint64_t unique_correlation_id) {
+    auto client_it = unique_correlation_id_to_client_offset_.find(unique_correlation_id);
+    if (client_it == unique_correlation_id_to_client_offset_.end()) {
+      return nullptr;
+    }
+
+    // See the comments on the GetUniqueCorrelationId method for an explanation
+    // of this offset computation and why it's required.
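// [Editorial worked example, not part of the upstream header] Numerically: if a
// client pushed external_correlation_id = 7 with offset = 5'000, the map holds
// unique_correlation_id 5'007 -> (client_handle, 5'000), so below
// external_correlation = 5'007 - 5'000 = 7 recovers the client's original key.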
+    auto const& client_handle_offset = client_it->second;
+    auto external_correlation = unique_correlation_id - client_handle_offset.second;
+    auto& event_list = per_client_events_by_ext_correlation_[client_handle_offset.first][external_correlation];
+    return &event_list;
+  }
+
+  void MapEventsToClient(uint64_t unique_correlation_id, std::vector<EventRecord>&& events) {
+    auto p_event_list = GetEventListForUniqueCorrelationId(unique_correlation_id);
+    if (p_event_list != nullptr) {
+      p_event_list->insert(p_event_list->end(),
+                           std::make_move_iterator(events.begin()),
+                           std::make_move_iterator(events.end()));
+    }
+  }
+
+  void DeferEventMapping(EventRecord&& event, uint64_t tracer_correlation_id) {
+    events_pending_client_mapping_[tracer_correlation_id].emplace_back(std::move(event));
+  }
+
+  uint64_t GetCPUTimestampInNanoseconds() {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+               std::chrono::high_resolution_clock::now().time_since_epoch())
+        .count();
+  }
+
+  std::mutex manager_instance_mutex_;
+  uint64_t next_client_id_ = 1;
+  uint64_t num_active_clients_ = 0;
+  bool tracing_enabled_ = false;
+  std::mutex unprocessed_activity_buffers_mutex_;
+  std::mutex activity_buffer_processor_mutex_;
+
+  // Unprocessed activity buffers
+  std::vector<ProfilerActivityBuffer> unprocessed_activity_buffers_;
+
+  // Keyed on unique_correlation_id -> (client_id/client_handle, offset)
+  // unique_correlation_id - offset == external_correlation_id
+  InlinedHashMap<uint64_t, std::pair<uint64_t, uint64_t>> unique_correlation_id_to_client_offset_;
+
+  // Keyed on tracer_correlation_id -> unique_correlation_id
+  InlinedHashMap<uint64_t, uint64_t> tracer_correlation_to_unique_correlation_;
+
+  // client_id/client_handle -> external_correlation_id -> events
+  InlinedHashMap<uint64_t, std::map<uint64_t, Events>> per_client_events_by_ext_correlation_;
+
+  // Keyed on tracer correlation_id, keeps track of activity records
+  // for which we haven't established the external_correlation_id yet.
+  InlinedHashMap<uint64_t, std::vector<EventRecord>> events_pending_client_mapping_;
+
+  // An offset to add to (the possibly skewed) GPU timestamps
+  // to normalize GPU timestamps with CPU timestamps
+  int64_t offset_to_add_to_gpu_timestamps_;
+};  /* class GPUTracerManager */
+
+// Base class for a GPU profiler
+template <typename TManager>
+class GPUProfilerBase : public EpProfiler {
+ protected:
+  GPUProfilerBase() = default;
+  virtual ~GPUProfilerBase() {}
+
+  void MergeEvents(std::map<uint64_t, Events>& events_to_merge, Events& events) {
+    Events merged_events;
+
+    auto event_iter = std::make_move_iterator(events.begin());
+    auto event_end = std::make_move_iterator(events.end());
+    for (auto& map_iter : events_to_merge) {
+      if (map_iter.second.empty()) {
+        continue;
+      }
+
+      auto ts = static_cast<long long>(map_iter.first);
+
+      // find the last occurrence of a matching timestamp,
+      // if one exists
+      while (event_iter != event_end &&
+             (event_iter->ts < ts ||
+              (event_iter->ts == ts &&
+               (event_iter + 1) != event_end &&
+               (event_iter + 1)->ts == ts))) {
+        merged_events.emplace_back(*event_iter);
+        ++event_iter;
+      }
+
+      bool copy_op_names = false;
+      std::string op_name;
+      std::string parent_name;
+
+      if (event_iter != event_end && event_iter->ts == ts) {
+        // We've located a parent event, copy the op_name and set
+        // this event's parent_name property to the name of the parent.
+        copy_op_names = true;
+        op_name = event_iter->args["op_name"];
+        parent_name = event_iter->name;
+        merged_events.emplace_back(*event_iter);
+        ++event_iter;
+      }
+
+      for (auto& evt : map_iter.second) {
+        if (copy_op_names) {
+          // If we have found a matching parent event,
+          // then inherit some names from the parent.
+          evt.args["op_name"] = op_name;
+          evt.args["parent_name"] = parent_name;
+        }
+      }
+
+      merged_events.insert(merged_events.end(),
+                           std::make_move_iterator(map_iter.second.begin()),
+                           std::make_move_iterator(map_iter.second.end()));
+    }
+
+    // move any remaining events
+    merged_events.insert(merged_events.end(), event_iter, event_end);
+    std::swap(events, merged_events);
+  }
+
+  uint64_t client_handle_;
+  TimePoint profiling_start_time_;
+
+ public:
+  virtual bool StartProfiling(TimePoint profiling_start_time) override {
+    auto& manager = TManager::GetInstance();
+    manager.StartLogging();
+    profiling_start_time_ = profiling_start_time;
+    return true;
+  }
+
+  virtual void EndProfiling(TimePoint start_time, Events& events) override {
+    auto& manager = TManager::GetInstance();
+    std::map<uint64_t, Events> event_map;
+    manager.Consume(client_handle_, start_time, event_map);
+    MergeEvents(event_map, events);
+  }
+
+  virtual void Start(uint64_t id) override {
+    auto& manager = TManager::GetInstance();
+    manager.PushCorrelation(client_handle_, id, profiling_start_time_);
+  }
+
+  virtual void Stop(uint64_t) override {
+    auto& manager = TManager::GetInstance();
+    manager.PopCorrelation();
+  }
+};  /* class GPUProfilerBase */
+
+// Convert a pointer to a hex string
+static inline std::string PointerToHexString(const void* ptr) {
+  std::ostringstream sstr;
+  sstr << std::hex << ptr;
+  return sstr.str();
+}
+
+}  /* end namespace profiling */
+}  /* end namespace onnxruntime */
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/hash_combine.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/hash_combine.h
new file mode 100644
index 00000000000000..5662a329ea77f3
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/hash_combine.h
@@ -0,0 +1,21 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace onnxruntime {
+
+// Combine hash value `seed` with hash value `h`, updating `seed` in place.
+// TODO(edgchen1) find a better implementation? e.g., see a more recent version of boost::hash_combine()
+inline void HashCombineWithHashValue(size_t h, size_t& seed) {
+  seed ^= h + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+// Combine hash value `seed` with the hash value of `value`, updating `seed` in place.
+// The hash value computation is specified by the `Hash` template parameter.
+template <typename T, typename Hash = std::hash<T>>
+inline void HashCombine(const T& value, size_t& seed) {
+  HashCombineWithHashValue(Hash{}(value), seed);
+}
+
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/inlined_containers.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/inlined_containers.h
new file mode 100644
index 00000000000000..bd61e691a5d5dd
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/inlined_containers.h
@@ -0,0 +1,175 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include
+
+#include "core/common/inlined_containers_fwd.h"
+
+#ifndef DISABLE_ABSEIL
+
+#ifdef _MSC_VER
+#pragma warning(push)
+// C4127: conditional expression is constant
+#pragma warning(disable : 4127)
+// C4324: structure was padded due to alignment specifier
+// Usage of alignas causes some internal padding in places.
+#pragma warning(disable : 4324)
+#endif  // _MSC_VER
+
+#include <absl/container/flat_hash_map.h>
+#include <absl/container/flat_hash_set.h>
+
+#include <absl/container/node_hash_map.h>
+#include <absl/container/node_hash_set.h>
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif  // _MSC_VER
+
+#else  // DISABLE_ABSEIL
+
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+
+#endif  // DISABLE_ABSEIL
+
+namespace onnxruntime {
+
+#ifndef DISABLE_ABSEIL
+// InlinedHashSet and InlinedHashMap are preferred
+// hash based containers. They store their values in the
+// buckets array that is allocated in one shot. It eliminates
+// per-node new/delete calls. Always call reserve() on any hash set/map
+// when the number of items is known in advance.
+// This does not allocate a dummy 'end' node on default construction.
+template <typename T, typename Allocator = std::allocator<T>>
+class InlinedHashSet : public absl::flat_hash_set<T,
+                                                  absl::container_internal::hash_default_hash<T>,
+                                                  absl::container_internal::hash_default_eq<T>,
+                                                  Allocator> {
+  using Base = absl::flat_hash_set<T,
+                                   absl::container_internal::hash_default_hash<T>,
+                                   absl::container_internal::hash_default_eq<T>,
+                                   Allocator>;
+
+ public:
+  using Base::Base;
+};
+
+template <typename K, typename V, typename Allocator = std::allocator<std::pair<const K, V>>>
+class InlinedHashMap : public absl::flat_hash_map<K, V,
+                                                  absl::container_internal::hash_default_hash<K>,
+                                                  absl::container_internal::hash_default_eq<K>,
+                                                  Allocator> {
+  using Base = absl::flat_hash_map<K, V,
+                                   absl::container_internal::hash_default_hash<K>,
+                                   absl::container_internal::hash_default_eq<K>,
+                                   Allocator>;
+
+ public:
+  using Base::Base;
+};
+
+// Use this hash set/map where pointer stability is required, otherwise use
+// InlinedHashSet and InlinedHashMap
+// This does not allocate a dummy 'end' node on default construction.
+// Use reserve() when the number of elements is known.
+template <typename T, typename Allocator = std::allocator<T>>
+class NodeHashSet : public absl::node_hash_set<T,
+                                               absl::container_internal::hash_default_hash<T>,
+                                               absl::container_internal::hash_default_eq<T>,
+                                               Allocator> {
+  using Base = absl::node_hash_set<T,
+                                   absl::container_internal::hash_default_hash<T>,
+                                   absl::container_internal::hash_default_eq<T>,
+                                   Allocator>;
+
+ public:
+  using Base::Base;
+};
+
+template <typename K, typename V, typename Allocator = std::allocator<std::pair<const K, V>>>
+class NodeHashMap : public absl::node_hash_map<K, V,
+                                               absl::container_internal::hash_default_hash<K>,
+                                               absl::container_internal::hash_default_eq<K>,
+                                               Allocator> {
+  using Base = absl::node_hash_map<K, V,
+                                   absl::container_internal::hash_default_hash<K>,
+                                   absl::container_internal::hash_default_eq<K>,
+                                   Allocator>;
+
+ public:
+  using Base::Base;
+};
+
+#else  // DISABLE_ABSEIL
+
+template <typename T, typename Allocator = std::allocator<T>>
+class InlinedHashSet : public std::unordered_set<T,
+                                                 std::hash<T>,
+                                                 std::equal_to<T>,
+                                                 Allocator> {
+  using Base = std::unordered_set<T,
+                                  std::hash<T>,
+                                  std::equal_to<T>,
+                                  Allocator>;
+
+ public:
+  using Base::Base;
+};
+
+template <typename K, typename V, typename Allocator = std::allocator<std::pair<const K, V>>>
+class InlinedHashMap : public std::unordered_map<K, V,
+                                                 std::hash<K>,
+                                                 std::equal_to<K>,
+                                                 Allocator> {
+  using Base = std::unordered_map<K, V,
+                                  std::hash<K>,
+                                  std::equal_to<K>,
+                                  Allocator>;
+
+ public:
+  using Base::Base;
+};
+
+// Use this hash set/map where pointer stability is required, otherwise use
+// InlinedHashSet and InlinedHashMap
+// This does not allocate a dummy 'end' node on default construction.
+// Use reserve() when the number of elements is known.
+template <typename T, typename Allocator = std::allocator<T>>
+class NodeHashSet : public std::unordered_set<T,
+                                              std::hash<T>,
+                                              std::equal_to<T>,
+                                              Allocator> {
+  using Base = std::unordered_set<T,
+                                  std::hash<T>,
+                                  std::equal_to<T>,
+                                  Allocator>;
+
+ public:
+  using Base::Base;
+};
+
+template <typename K, typename V, typename Allocator = std::allocator<std::pair<const K, V>>>
+class NodeHashMap : public std::unordered_map<K, V,
+                                              std::hash<K>,
+                                              std::equal_to<K>,
+                                              Allocator> {
+  using Base = std::unordered_map<K, V,
+                                  std::hash<K>,
+                                  std::equal_to<K>,
+                                  Allocator>;
+
+ public:
+  using Base::Base;
+};
+
+#endif  // DISABLE_ABSEIL
+
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/inlined_containers_fwd.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/inlined_containers_fwd.h
new file mode 100644
index 00000000000000..21a55f9b315bc4
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/inlined_containers_fwd.h
@@ -0,0 +1,151 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
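Usage of the wrappers defined in inlined_containers.h above is identical under either branch; as an editorial sketch (names invented for illustration, not part of the patch), the reserve-first pattern the header's comments recommend looks like this:

// Editorial sketch, not part of the upstream headers.
#include <cstdint>
#include <string>

#include "core/common/inlined_containers.h"

void BuildOpNameTable() {
  // Reserve up front, as the header advises, so the one-shot bucket array
  // is allocated once and no rehashing occurs while inserting.
  onnxruntime::InlinedHashMap<uint64_t, std::string> op_names;
  op_names.reserve(2);
  op_names.try_emplace(1, "Conv");
  op_names.try_emplace(2, "Gemm");

  onnxruntime::InlinedHashSet<uint64_t> seen_ids;
  seen_ids.reserve(2);
  seen_ids.insert(1);
}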
+ +#pragma once + +#include +#include + +#ifndef DISABLE_ABSEIL +#ifdef _MSC_VER +#pragma warning(push) +// C4127: conditional expression is constant +#pragma warning(disable : 4127) +// C4324: structure was padded due to alignment specifier +// Usage of alignas causes some internal padding in places. +#pragma warning(disable : 4324) +#else +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102329#c2 +#if !defined(__clang__) && defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif +#endif // _MSC_VER + +#include + +#ifdef _MSC_VER +#pragma warning(pop) +#else +#if !defined(__clang__) && defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +#endif // _MSC_VER + +#else + +#include + +#endif // DISABLE_ABSEIL + +// Forward declarations for contexts where abseil can not be compiled and +// not really needed but we want to have it in the headers that are included +// e.g. CUDA 10 and .CU files +// InlinedVector seems to be fine with old CUDA + +//===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// This file contains code and comments derived from llvm/ADT/SmallVector.h +// +// Specifically CalculateInlinedVectorDefaultInlinedElements() template is derived from +// CalculateSmallVectorDefaultInlinedElements() and its comments. + +namespace onnxruntime { +#ifndef DISABLE_ABSEIL +/// Inspired by LLVM SmallVector with ONNX Runtime adjustments for abseil. +/// https://github.com/llvm/llvm-project/blob/a85b37d0ca819776c6034c2dbda2b21e54e3393a/llvm/include/llvm/ADT/SmallVector.h#L1128-L1179 +/// +/// Helper class for calculating the default number of inline elements for +/// `InlinedVector`. +/// This produces the following on MSVC x64 +/// int8_t -> 41 +// int16_t -> 21 +// int32_t -> 11 +// int64_t -> 6 +// std::string 40 -> 1 +template +struct CalculateInlinedVectorDefaultInlinedElements { + // Parameter controlling the default number of inlined elements + // for `InlinedVector`. + // + // The default number of inlined elements ensures that + // 1. There is at least one inlined element. + // 2. `sizeof(InlinedVector) <= kPreferredInlinedVectorSizeof` unless + // it contradicts 1. + static constexpr size_t kPreferredInlinedVectorSizeof = 64; + + // Largest allowed element size for default element count calculation. + static constexpr size_t kElementSizeCutoff = 256; + + // static_assert that sizeof(T) is not "too big". + // + // Because the InlinedVector must have at least one inlined element, it is possible + // for an arbitrarily large inlined element to allocate an arbitrarily large + // amount of inline storage. So we want to call attention to these cases and + // make sure that users are making an intentional decision if they request a lot of inline storage. + // + // We want this assertion to trigger in pathological cases, but otherwise + // not be too easy to hit. To accomplish that, the cutoff is actually somewhat + // larger than kPreferredInlinedVectorSizeof (otherwise, + // `InlinedVector>` would be one easy way to trip it, and that + // pattern seems useful in practice). + // + // One wrinkle is that this assertion is in theory non-portable, since + // sizeof(absl::InlinedVector) is in general platform-dependent. 
However, we don't expect this + // to be much of an issue, because most LLVM development happens on 64-bit + // hosts, and therefore sizeof(T) is expected to *decrease* when compiled for + // 32-bit hosts, dodging the issue. The reverse situation, where development + // happens on a 32-bit host and then fails due to sizeof(T) *increasing* on a + // 64-bit host, is expected to be very rare. + static_assert( + sizeof(T) <= kElementSizeCutoff, + "You are trying to use a default number of inlined elements for " + "`InlinedVector` but `sizeof(T)` is really big! Please use an " + "explicit number of inlined elements with `InlinedVector` to make " + "sure you really want that much inline storage."); + + // Discount the size of the header itself when calculating the maximum inline + // bytes. + static constexpr size_t InlinedVectorHeaderSize = sizeof(absl::InlinedVector) - sizeof(T); + static constexpr size_t PreferredInlineBytes = kPreferredInlinedVectorSizeof - InlinedVectorHeaderSize; + static constexpr size_t NumElementsThatFit = PreferredInlineBytes / sizeof(T); + static constexpr size_t value = + NumElementsThatFit == 0 ? 1 : NumElementsThatFit; +}; + +// Use InlinedVector for small arrays that can fit on a stack with a default +// value pre-calculated. +// Use TensorShapeVector for shapes. +template ::value, + typename Allocator = std::allocator> +using InlinedVector = absl::InlinedVector; + +#else + +template > +using InlinedVector = std::vector; + +#endif // DISABLE_ABSEIL + +template > +class InlinedHashSet; + +template >> +class InlinedHashMap; + +template > +class NodeHashSet; + +template >> +class NodeHashMap; +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/capture.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/capture.h new file mode 100644 index 00000000000000..13d3a3ad17aff5 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/capture.h @@ -0,0 +1,115 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include "core/common/common.h" +#include "core/common/code_location.h" +#include "core/common/logging/severity.h" + +namespace onnxruntime { +namespace logging { + +class Logger; +enum class DataType; + +/** + Class to capture the details of a log message. +*/ +class Capture { + public: + /** + Initializes a new instance of the Capture class. + @param logger The logger. + @param severity The severity. + @param category The category. + @param dataType Type of the data. + @param location The file location the log message is coming from. + */ + Capture(const Logger& logger, logging::Severity severity, const char* category, + logging::DataType dataType, const CodeLocation& location) + : logger_{&logger}, severity_{severity}, category_{category}, data_type_{dataType}, location_{location} { + } + + /** + The stream that can capture the message via operator<<. + @returns Output stream. + */ + std::ostream& Stream() noexcept { + return stream_; + } + +#ifdef _MSC_VER +// add SAL annotation for printf format string. requires Code Analysis to run to validate usage. +#define msvc_printf_check _Printf_format_string_ +#define __attribute__(x) // Disable for MSVC. Supported by GCC and CLang. +#else +#define msvc_printf_check +#endif + + /** + Captures a printf style log message. + @param name="format">The printf format. 
+ @param name="">Arguments to the printf format if needed. + @remarks + A maximum of 2K of output will be captured currently. + Non-static method, so 'this' is implicit first arg, and we use format(printf(2,3) + */ + void CapturePrintf(msvc_printf_check const char* format, ...) __attribute__((format(printf, 2, 3))); + + /** + Process a printf style log message. + @param format The printf format. + @param ... Arguments to the printf format if needed. + @remarks + A maximum of 2K of output will be captured currently. + Note: As va_list is 'char *', we have to disambiguate this from CapturePrintf + so that something like "One string: %s", "the string" does not consider "the string" + to be the va_list. + */ + void ProcessPrintf(msvc_printf_check const char* format, va_list args); + + logging::Severity Severity() const noexcept { + return severity_; + } + + char SeverityPrefix() const noexcept { + // Carefully setup so severity_ is a valid index + GSL_SUPPRESS(bounds.2) { + return logging::SEVERITY_PREFIX[static_cast(severity_)]; + } + } + + const char* Category() const noexcept { + return category_; + } + + logging::DataType DataType() const noexcept { + return data_type_; + } + + const CodeLocation& Location() const noexcept { + return location_; + } + + std::string Message() const noexcept { + return stream_.str(); + } + + ~Capture(); + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(Capture); + + const Logger* logger_; + const logging::Severity severity_; + const char* category_; + const logging::DataType data_type_; + const CodeLocation location_; + + std::ostringstream stream_; +}; +} // namespace logging +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/isink.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/isink.h new file mode 100644 index 00000000000000..fd011e71611fc8 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/isink.h @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include "core/common/logging/logging.h" +#include "core/common/logging/sink_types.h" + +namespace onnxruntime { +namespace logging { +class ISink { + public: + explicit ISink(SinkType type = SinkType::BaseSink) : type_(type) {} + + SinkType GetType() const { return type_; } + + /** + Sends the message to the sink. + @param timestamp The timestamp. + @param logger_id The logger identifier. + @param message The captured message. + */ + void Send(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) { + SendImpl(timestamp, logger_id, message); + } + + /** + Sends a Profiling Event Record to the sink. + @param Profiling Event Record + */ + virtual void SendProfileEvent(profiling::EventRecord&) const {}; + + virtual ~ISink() = default; + + private: + SinkType type_; + + // Make Code Analysis happy by disabling all for now. Enable as needed. 
+ ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ISink); + + virtual void SendImpl(const Timestamp& timestamp, const std::string& logger_id, const Capture& message) = 0; +}; +} // namespace logging +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/logging.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/logging.h new file mode 100644 index 00000000000000..ab2c476f2975a8 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/logging.h @@ -0,0 +1,421 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "core/common/common.h" +#include "core/common/profiler_common.h" +#include "core/common/logging/capture.h" +#include "core/common/logging/macros.h" +#include "core/common/logging/severity.h" +#include "core/common/logging/sink_types.h" +#include "date/date.h" + +/* + + Logging overview and expected usage: + + At program startup: + * Create one or more ISink instances. If multiple, combine using composite_sink. + * Create a LoggingManager instance with the sink/s with is_default_instance set to true + * Only one instance should be created in this way, and it should remain valid for + until the program no longer needs to produce log output. + + You can either use the static default Logger which LoggingManager will create when constructed + via LoggingManager::DefaultLogger(), or separate Logger instances each with different log ids + via LoggingManager::CreateLogger. + + The log id is passed to the ISink instance with the sink determining how the log id is used + in the output. + + LoggingManager + * creates the Logger instances used by the application + * provides a static default logger instance + * owns the log sink instance + * applies checks on severity and output of user data + + The log macros create a Capture instance to capture the information to log. + If the severity and/or user filtering settings would prevent logging, no evaluation + of the log arguments will occur, so no performance cost beyond the severity and user + filtering check. + + A sink can do further filter as needed. + +*/ + +namespace onnxruntime { + +namespace logging { + +using Timestamp = std::chrono::time_point; + +// C++20 has operator<< in std::chrono for Timestamp type but mac builds need additional checks +// to ensure usage is valid. +// TODO: As we enable C++20 on other platforms we may need similar checks. +// define a temporary value to determine whether to use the std::chrono or date implementation. +#define ORT_USE_CXX20_STD_CHRONO __cplusplus >= 202002L + +// Apply constraints for mac builds +#if __APPLE__ +#include + +// Catalyst check must be first as it has both TARGET_OS_MACCATALYST and TARGET_OS_MAC set +#if TARGET_OS_MACCATALYST +// maccatalyst requires version 16.3 +#if (defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && __IPHONE_OS_VERSION_MIN_REQUIRED < 160300) +#undef ORT_USE_CXX20_STD_CHRONO +#endif + +#elif TARGET_OS_MAC +// Xcode added support for C++20's std::chrono::operator<< in SDK version 14.4, +// but the target macOS version must also be >= 13.3 for it to be used. 
+#if (defined(__MAC_OS_X_VERSION_MAX_ALLOWED) && __MAC_OS_X_VERSION_MAX_ALLOWED < 140400) || \ + (defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && __MAC_OS_X_VERSION_MIN_REQUIRED < 130300) +#undef ORT_USE_CXX20_STD_CHRONO +#endif + +#endif +#endif // __APPLE__ + +#if ORT_USE_CXX20_STD_CHRONO +namespace timestamp_ns = std::chrono; +#else +namespace timestamp_ns = ::date; +#endif + +#undef ORT_USE_CXX20_STD_CHRONO + +#ifndef NDEBUG +ORT_ATTRIBUTE_UNUSED static bool vlog_enabled = true; // Set directly based on your needs. +#else +constexpr bool vlog_enabled = false; // no VLOG output +#endif + +enum class DataType { + SYSTEM = 0, ///< System data. + USER = 1 ///< Contains potentially sensitive user data. +}; + +// Internal log categories. +// Logging interface takes const char* so arbitrary values can also be used. +struct Category { + static const char* onnxruntime; ///< General output + static const char* System; ///< Log output regarding interactions with the host system + // TODO: What other high level categories are meaningful? Model? Optimizer? Execution? +}; + +/// +/// ORT TraceLogging keywords for categories of dynamic logging enablement +/// +enum class ORTTraceLoggingKeyword : uint64_t { + Session = 0x1, // ORT Session TraceLoggingWrite + Logs = 0x2, // LOGS() Macro ORT logs. Pair with an appropriate level depending on detail required + Reserved1 = 0x4, // Reserved if we want to add some specific sub-categories instead of just LOGS() or other uses + Reserved2 = 0x8, + Reserved3 = 0x10, + Reserved4 = 0x20, + Reserved5 = 0x40, + Reserved6 = 0x80, + Profiling = 0x100 // Enables profiling. At higher levels >5 can impact inference performance +}; + +class ISink; +class Logger; +class Capture; + +/// +/// The logging manager. +/// Owns the log sink and potentially provides a default Logger instance. +/// Provides filtering based on a minimum LogSeverity level, and of messages with DataType::User if enabled. +/// +class LoggingManager final { + public: + enum InstanceType { + Default, ///< Default instance of LoggingManager that should exist for the lifetime of the program + Temporal ///< Temporal instance. CreateLogger(...) should be used, however DefaultLogger() will NOT be provided via this instance. + }; + + /** + Initializes a new instance of the LoggingManager class. + @param sink The sink to write to. Use CompositeSink if you need to write to multiple places. + @param default_min_severity The default minimum severity. Messages with lower severity will be ignored unless + overridden in CreateLogger. + @param default_filter_user_data If set to true ignore messages with DataType::USER unless overridden in CreateLogger. + @param instance_type If InstanceType::Default, this is the default instance of the LoggingManager + and is expected to exist for the lifetime of the program. + It creates and owns the default logger that calls to the static DefaultLogger method return. + @param default_logger_id Logger Id to use for the default logger. nullptr/ignored if instance_type == Temporal. + @param default_max_vlog_level Default maximum level for VLOG messages to be created unless overridden in CreateLogger. + Requires a severity of kVERBOSE for VLOG messages to be logged. 
+ */ + LoggingManager(std::unique_ptr sink, Severity default_min_severity, bool default_filter_user_data, + InstanceType instance_type, + const std::string* default_logger_id = nullptr, + int default_max_vlog_level = -1); + + /** + Creates a new logger instance which will use the provided logger_id and default severity and vlog levels. + @param logger_id The log identifier. + @returns A new Logger instance that the caller owns. + */ + std::unique_ptr CreateLogger(const std::string& logger_id); + + /** + Creates a new logger instance which will use the provided logger_id, severity and vlog levels. + @param logger_id The log identifier. + @param min_severity The minimum severity. Requests to create messages with lower severity will be ignored. + @param filter_user_data If set to true ignore messages with DataType::USER. + @param max_vlog_level Maximum level for VLOG messages to be created. + @returns A new Logger instance that the caller owns. + */ + std::unique_ptr CreateLogger(const std::string& logger_id, + Severity min_severity, bool filter_user_data, int max_vlog_level = -1); + + /** + Gets the default logger instance if set. Throws if no default logger is currently registered. + @remarks + Creating a LoggingManager instance with is_default_instance == true registers a default logger. + Note that the default logger is only valid until the LoggerManager that registered it is destroyed. + @returns The default logger if available. + */ + static const Logger& DefaultLogger(); + + /** + Return a boolean indicating if the default logger has been initialized + */ + static bool HasDefaultLogger() { return nullptr != s_default_logger_; } + + /** + Gets the default instance of the LoggingManager. + */ + static LoggingManager* GetDefaultInstance(); + + /** + Removes a Sink if one is present + */ + void RemoveSink(SinkType sinkType); + + /** + Adds a Sink to the current sink creating a CompositeSink if necessary + Sinks types must be unique + @param severity The severity level for the new Sink + */ + bool AddSinkOfType(SinkType sinkType, std::function()> sinkFactory, logging::Severity severity); + + /** + Change the minimum severity level for log messages to be output by the default logger. + @param severity The severity. + */ + static void SetDefaultLoggerSeverity(Severity severity); + + /** + Change the maximum verbosity level for log messages to be output by the default logger. + @remarks + To activate the verbose log, the logger severity must also be set to kVERBOSE. + @param vlog_level The verbosity level. + */ + static void SetDefaultLoggerVerbosity(int vlog_level); + + /** + Logs a FATAL level message and creates an exception that can be thrown with error information. + @param category The log category. + @param location The location the log message was generated. + @param format_str The printf format string. + @param ... The printf arguments. + @returns A new Logger instance that the caller owns. + */ + static std::exception LogFatalAndCreateException(const char* category, + const CodeLocation& location, + const char* format_str, ...); + + /** + Logs the message using the provided logger id. + @param logger_id The log identifier. + @param message The log message. + */ + void Log(const std::string& logger_id, const Capture& message) const; + + /** + Sends a Profiling Event Record to the sink. 
+ @param Profiling Event Record + */ + void SendProfileEvent(profiling::EventRecord& eventRecord) const; + ~LoggingManager(); + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(LoggingManager); + + Timestamp GetTimestamp() const noexcept; + void CreateDefaultLogger(const std::string& logger_id); + + std::unique_ptr sink_; +#ifdef _WIN32 + mutable std::mutex sink_mutex_; +#endif + Severity default_min_severity_; + const bool default_filter_user_data_; + const int default_max_vlog_level_; + bool owns_default_logger_; + + static Logger* s_default_logger_; + + struct Epochs { + const std::chrono::time_point high_res; + const std::chrono::time_point system; + const std::chrono::minutes localtime_offset_from_utc; + }; + + static const Epochs& GetEpochs() noexcept; +}; + +/** + Logger provides a per-instance log id. Everything else is passed back up to the LoggingManager +*/ +class Logger { + public: + /** + Initializes a new instance of the Logger class. + @param loggingManager The logging manager. + @param id The identifier for messages coming from this Logger. + @param severity Minimum severity for messages to be created and logged. + @param filter_user_data Should USER data be filtered from output. + @param vlog_level Minimum level for VLOG messages to be created. Note that a severity of kVERBOSE must be provided + for VLOG messages to be logged. + */ + Logger(const LoggingManager& loggingManager, std::string id, + Severity severity, bool filter_user_data, int vlog_level) + : logging_manager_{&loggingManager}, + id_{id}, + min_severity_{severity}, + filter_user_data_{filter_user_data}, + max_vlog_level_{vlog_level} { + } + + /** + Get the minimum severity level for log messages to be output. + @returns The severity. + */ + Severity GetSeverity() const noexcept { return min_severity_; } + + /** + Change the minimum severity level for log messages to be output. + @param severity The severity. + */ + void SetSeverity(Severity severity) noexcept { min_severity_ = severity; } + + /** + Change the maximum verbosity level for log messages to be output. + @remarks + To activate the verbose log, the logger severity must also be set to kVERBOSE. + @param vlog_level The verbosity. + */ + void SetVerbosity(int vlog_level) noexcept { max_vlog_level_ = vlog_level; } + + /** + Check if output is enabled for the provided LogSeverity and DataType values. + @param severity The severity. + @param data_type Type of the data. + @returns True if a message with these values will be logged. + */ + bool OutputIsEnabled(Severity severity, DataType data_type) const noexcept { + return (severity >= min_severity_ && (data_type != DataType::USER || !filter_user_data_)); + } + + /** + Return the maximum VLOG level allowed. Disabled unless logging VLOG messages + */ + int VLOGMaxLevel() const noexcept { + return min_severity_ > Severity::kVERBOSE ? -1 : max_vlog_level_; + } + + /** + Logs the captured message. + @param message The log message. + */ + void Log(const Capture& message) const { + logging_manager_->Log(id_, message); + } + + /** + Sends a Profiling Event Record to the sink. 
+ @param Profiling Event Record + */ + void SendProfileEvent(profiling::EventRecord& eventRecord) const { + logging_manager_->SendProfileEvent(eventRecord); + } + + private: + const LoggingManager* logging_manager_; + const std::string id_; + Severity min_severity_; + const bool filter_user_data_; + int max_vlog_level_; +}; + +inline const Logger& LoggingManager::DefaultLogger() { + if (s_default_logger_ == nullptr) { + // fail early for attempted misuse. don't use logging macros as we have no logger. + ORT_THROW("Attempt to use DefaultLogger but none has been registered."); + } + + return *s_default_logger_; +} + +inline void LoggingManager::SetDefaultLoggerSeverity(Severity severity) { + if (s_default_logger_ == nullptr) { + // fail early for attempted misuse. don't use logging macros as we have no logger. + ORT_THROW("Attempt to use DefaultLogger but none has been registered."); + } + + s_default_logger_->SetSeverity(severity); +} + +inline void LoggingManager::SetDefaultLoggerVerbosity(int vlog_level) { + if (s_default_logger_ == nullptr) { + // fail early for attempted misuse. don't use logging macros as we have no logger. + ORT_THROW("Attempt to use DefaultLogger but none has been registered."); + } + + s_default_logger_->SetVerbosity(vlog_level); +} + +inline Timestamp LoggingManager::GetTimestamp() const noexcept { + static const Epochs& epochs = GetEpochs(); + + const auto high_res_now = std::chrono::high_resolution_clock::now(); + return std::chrono::time_point_cast( + epochs.system + (high_res_now - epochs.high_res) + epochs.localtime_offset_from_utc); +} + +/** + Return the current thread id. +*/ +unsigned int GetThreadId(); + +/** + Return the current process id. +*/ +unsigned int GetProcessId(); + +/** + If the ONNXRuntimeTraceLoggingProvider ETW Provider is enabled, then adds to the existing logger. +*/ +std::unique_ptr EnhanceSinkWithEtw(std::unique_ptr existingSink, logging::Severity originalSeverity, + logging::Severity etwSeverity); + +/** + If the ONNXRuntimeTraceLoggingProvider ETW Provider is enabled, then can override the logging level. + But this overrided level only applies to the ETW sink. The original logger(s) retain their original logging level +*/ +Severity OverrideLevelWithEtw(Severity originalSeverity); + +} // namespace logging +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/macros.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/macros.h new file mode 100644 index 00000000000000..18764460cba76e --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/macros.h @@ -0,0 +1,280 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +// NOTE: Don't include this file directly. Include logging.h + +#define CREATE_MESSAGE(logger, severity, category, datatype) \ + ::onnxruntime::logging::Capture(logger, ::onnxruntime::logging::Severity::k##severity, category, datatype, ORT_WHERE) + +/* + Both printf and stream style logging are supported. + Not that printf currently has a 2K limit to the message size. + + LOGS_* macros are for stream style + LOGF_* macros are for printf style + + The Message class captures the log input, and pushes it through the logger in its destructor. + + Use the *FATAL* macros if you want a Severity::kFatal message to also throw. + + There are a few variants to minimize the length of the macro name required in the calling code. 
+ They are optimized so the shortest names are for the (expected) most common usage. This can be + tweaked if needed. + + Explicit logger vs LoggingManager::DefaulLogger() + Default is for a logger instance to be explicitly passed in. + The logger instance provides an identifier so that log messages from different runs can be separated. + + Variants with DEFAULT in the macro name use the default logger provided by logging manager. This is + static so accessible from any code, provided a LoggingManager instance created with InstanceType::Default + exists somewhere. See logging.h for further explanation of the expected setup. + + DataType + Default uses DataType::SYSTEM. + + Variants with USER in the macro name use DataType::USER. This is data that could be PII, and may need to + be filtered from output. LoggingManager applies this filtering. + + Category + Default category is ::onnxruntime::Logging::Category::onnxruntime. + + If you wish to provide a different category, use variants with CATEGORY in the macro name + +*/ + +/** + * Note: + * The stream style logging macros (something like `LOGS() << message`) are designed to be appended to. + * Normally, we can isolate macro code in a separate scope (e.g., `do {...} while(0)`), but here we need the macro code + * to interact with subsequent code (i.e., the values to log). + * + * When an unisolated conditional is involved, extra care needs to be taken to avoid unexpected parsing behavior. + * For example: + * + * if (enabled) + * Capture().Stream() + * + * is more direct, but + * + * if (!enabled) { + * } else Capture().Stream() + * + * ensures that the `if` does not unintentionally associate with a subsequent `else`. + */ + +// Logging with explicit category + +// iostream style logging. Capture log info in Message, and push to the logger in ~Message. +#define LOGS_CATEGORY(logger, severity, category) \ + if (!(logger).OutputIsEnabled(::onnxruntime::logging::Severity::k##severity, \ + ::onnxruntime::logging::DataType::SYSTEM)) { \ + /* do nothing */ \ + } else \ + CREATE_MESSAGE(logger, severity, category, ::onnxruntime::logging::DataType::SYSTEM).Stream() + +#define LOGS_USER_CATEGORY(logger, severity, category) \ + if (!(logger).OutputIsEnabled(::onnxruntime::logging::Severity::k##severity, \ + ::onnxruntime::logging::DataType::USER)) { \ + /* do nothing */ \ + } else \ + CREATE_MESSAGE(logger, severity, category, ::onnxruntime::logging::DataType::USER).Stream() + +// printf style logging. Capture log info in Message, and push to the logger in ~Message. +#define LOGF_CATEGORY(logger, severity, category, format_str, ...) \ + do { \ + if ((logger).OutputIsEnabled(::onnxruntime::logging::Severity::k##severity, \ + ::onnxruntime::logging::DataType::SYSTEM)) \ + CREATE_MESSAGE(logger, severity, category, ::onnxruntime::logging::DataType::SYSTEM) \ + .CapturePrintf(format_str, ##__VA_ARGS__); \ + } while (0) + +#define LOGF_USER_CATEGORY(logger, severity, category, format_str, ...) 
\ + do { \ + if ((logger).OutputIsEnabled(::onnxruntime::logging::Severity::k##severity, \ + ::onnxruntime::logging::DataType::USER)) \ + CREATE_MESSAGE(logger, severity, category, ::onnxruntime::logging::DataType::USER) \ + .CapturePrintf(format_str, ##__VA_ARGS__); \ + } while (0) + +// Logging with category of "onnxruntime" + +#define LOGS(logger, severity) \ + LOGS_CATEGORY(logger, severity, ::onnxruntime::logging::Category::onnxruntime) + +#define LOGS_USER(logger, severity) \ + LOGS_USER_CATEGORY(logger, severity, ::onnxruntime::logging::Category::onnxruntime) + +// printf style logging. Capture log info in Message, and push to the logger in ~Message. +#define LOGF(logger, severity, format_str, ...) \ + LOGF_CATEGORY(logger, severity, ::onnxruntime::logging::Category::onnxruntime, format_str, ##__VA_ARGS__) + +#define LOGF_USER(logger, severity, format_str, ...) \ + LOGF_USER_CATEGORY(logger, severity, ::onnxruntime::logging::Category::onnxruntime, format_str, ##__VA_ARGS__) + +/* + Macros that use the default logger. + A LoggingManager instance must be currently valid for the default logger to be available. +*/ + +// Logging with explicit category + +#define LOGS_DEFAULT_CATEGORY(severity, category) \ + LOGS_CATEGORY(::onnxruntime::logging::LoggingManager::DefaultLogger(), severity, category) + +#define LOGS_USER_DEFAULT_CATEGORY(severity, category) \ + LOGS_USER_CATEGORY(::onnxruntime::logging::LoggingManager::DefaultLogger(), severity, category) + +#define LOGF_DEFAULT_CATEGORY(severity, category, format_str, ...) \ + LOGF_CATEGORY(::onnxruntime::logging::LoggingManager::DefaultLogger(), severity, category, format_str, ##__VA_ARGS__) + +#define LOGF_USER_DEFAULT_CATEGORY(severity, category, format_str, ...) \ + LOGF_USER_CATEGORY(::onnxruntime::logging::LoggingManager::DefaultLogger(), severity, category, format_str, ##__VA_ARGS__) + +// Logging with category of "onnxruntime" + +#define LOGS_DEFAULT(severity) \ + LOGS_DEFAULT_CATEGORY(severity, ::onnxruntime::logging::Category::onnxruntime) + +#define LOGS_USER_DEFAULT(severity) \ + LOGS_USER_DEFAULT_CATEGORY(severity, ::onnxruntime::logging::Category::onnxruntime) + +#define LOGF_DEFAULT(severity, format_str, ...) \ + LOGF_DEFAULT_CATEGORY(severity, ::onnxruntime::logging::Category::onnxruntime, format_str, ##__VA_ARGS__) + +#define LOGF_USER_DEFAULT(severity, format_str, ...) \ + LOGF_USER_DEFAULT_CATEGORY(severity, ::onnxruntime::logging::Category::onnxruntime, format_str, ##__VA_ARGS__) + +/* + Conditional logging +*/ + +// Logging with explicit category + +#define LOGS_CATEGORY_IF(boolean_expression, logger, severity, category) \ + if (!((boolean_expression) == true)) { \ + /* do nothing */ \ + } else \ + LOGS_CATEGORY(logger, severity, category) + +#define LOGS_DEFAULT_CATEGORY_IF(boolean_expression, severity, category) \ + if (!((boolean_expression) == true)) { \ + /* do nothing */ \ + } else \ + LOGS_DEFAULT_CATEGORY(severity, category) + +#define LOGS_USER_CATEGORY_IF(boolean_expression, logger, severity, category) \ + if (!((boolean_expression) == true)) { \ + /* do nothing */ \ + } else \ + LOGS_USER_CATEGORY(logger, severity, category) + +#define LOGS_USER_DEFAULT_CATEGORY_IF(boolean_expression, severity, category) \ + if (!((boolean_expression) == true)) { \ + /* do nothing */ \ + } else \ + LOGS_USER_DEFAULT_CATEGORY(severity, category) + +#define LOGF_CATEGORY_IF(boolean_expression, logger, severity, category, format_str, ...) 
\ + do { \ + if ((boolean_expression) == true) LOGF_CATEGORY(logger, severity, category, format_str, ##__VA_ARGS__); \ + } while (0) + +#define LOGF_DEFAULT_CATEGORY_IF(boolean_expression, severity, category, format_str, ...) \ + do { \ + if ((boolean_expression) == true) LOGF_DEFAULT_CATEGORY(severity, category, format_str, ##__VA_ARGS__); \ + } while (0) + +#define LOGF_USER_CATEGORY_IF(boolean_expression, logger, severity, category, format_str, ...) \ + do { \ + if ((boolean_expression) == true) LOGF_USER_CATEGORY(logger, severity, category, format_str, ##__VA_ARGS__); \ + } while (0) + +#define LOGF_USER_DEFAULT_CATEGORY_IF(boolean_expression, severity, category, format_str, ...) \ + do { \ + if ((boolean_expression) == true) LOGF_USER_DEFAULT_CATEGORY(severity, category, format_str, ##__VA_ARGS__); \ + } while (0) + +// Logging with category of "onnxruntime" + +#define LOGS_IF(boolean_expression, logger, severity) \ + LOGS_CATEGORY_IF(boolean_expression, logger, severity, ::onnxruntime::logging::Category::onnxruntime) + +#define LOGS_DEFAULT_IF(boolean_expression, severity) \ + LOGS_DEFAULT_CATEGORY_IF(boolean_expression, severity, ::onnxruntime::logging::Category::onnxruntime) + +#define LOGS_USER_IF(boolean_expression, logger, severity) \ + LOGS_USER_CATEGORY_IF(boolean_expression, logger, severity, ::onnxruntime::logging::Category::onnxruntime) + +#define LOGS_USER_DEFAULT_IF(boolean_expression, severity) \ + LOGS_USER_DEFAULT_CATEGORY_IF(boolean_expression, severity, ::onnxruntime::logging::Category::onnxruntime) + +#define LOGF_IF(boolean_expression, logger, severity, format_str, ...) \ + LOGF_CATEGORY_IF(boolean_expression, logger, severity, ::onnxruntime::logging::Category::onnxruntime, format_str, ##__VA_ARGS__) + +#define LOGF_DEFAULT_IF(boolean_expression, severity, format_str, ...) \ + LOGF_DEFAULT_CATEGORY_IF(boolean_expression, severity, ::onnxruntime::logging::Category::onnxruntime, format_str, ##__VA_ARGS__) + +#define LOGF_USER_IF(boolean_expression, logger, severity, format_str, ...) \ + LOGF_USER_CATEGORY_IF(boolean_expression, logger, severity, ::onnxruntime::logging::Category::onnxruntime, \ + format_str, ##__VA_ARGS__) + +#define LOGF_USER_DEFAULT_IF(boolean_expression, severity, format_str, ...) \ + LOGF_USER_DEFAULT_CATEGORY_IF(boolean_expression, severity, ::onnxruntime::logging::Category::onnxruntime, \ + format_str, ##__VA_ARGS__) + +/* + Debug verbose logging of caller provided level. + Disabled in Release builds. + Use the _USER variants for VLOG statements involving user data that may need to be filtered. +*/ +#ifndef NDEBUG +#define VLOGS(logger, level) \ + if (!(::onnxruntime::logging::vlog_enabled && level <= (logger).VLOGMaxLevel())) { \ + /* do nothing */ \ + } else \ + LOGS_CATEGORY(logger, VERBOSE, "VLOG" #level) + +#define VLOGS_USER(logger, level) \ + if (!(::onnxruntime::logging::vlog_enabled && level <= (logger).VLOGMaxLevel())) { \ + /* do nothing */ \ + } else \ + LOGS_USER_CATEGORY(logger, VERBOSE, "VLOG" #level) + +#define VLOGF(logger, level, format_str, ...) \ + do { \ + if (::onnxruntime::logging::vlog_enabled && level <= (logger).VLOGMaxLevel()) \ + LOGF_CATEGORY(logger, VERBOSE, "VLOG" #level, format_str, ##__VA_ARGS__); \ + } while (0) + +#define VLOGF_USER(logger, level, format_str, ...) \ + do { \ + if (::onnxruntime::logging::vlog_enabled && level <= (logger).VLOGMaxLevel()) \ + LOGF_USER_CATEGORY(logger, VERBOSE, "VLOG" #level, format_str, ##__VA_ARGS__); \ + } while (0) +#else +// Disabled in Release builds. 
+#define VLOGS(logger, level) \ + if constexpr (true) { \ + } else \ + LOGS_CATEGORY(logger, VERBOSE, "VLOG" #level) +#define VLOGS_USER(logger, level) \ + if constexpr (true) { \ + } else \ + LOGS_USER_CATEGORY(logger, VERBOSE, "VLOG" #level) +#define VLOGF(logger, level, format_str, ...) +#define VLOGF_USER(logger, level, format_str, ...) +#endif + +// Default logger variants +#define VLOGS_DEFAULT(level) \ + VLOGS(::onnxruntime::logging::LoggingManager::DefaultLogger(), level) + +#define VLOGS_USER_DEFAULT(level) \ + VLOGS_USER(::onnxruntime::logging::LoggingManager::DefaultLogger(), level) + +#define VLOGF_DEFAULT(level, format_str, ...) \ + VLOGF(::onnxruntime::logging::LoggingManager::DefaultLogger(), level, format_str, ##__VA_ARGS__) + +#define VLOGF_USER_DEFAULT(level, format_str, ...) \ + VLOGF_USER(::onnxruntime::logging::LoggingManager::DefaultLogger(), level, format_str, ##__VA_ARGS__) diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/severity.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/severity.h new file mode 100644 index 00000000000000..e43f192eb1807e --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/severity.h @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +namespace onnxruntime { +namespace logging { +// mild violation of naming convention. the 'k' lets us use token concatenation in the macro +// ::onnxruntime::Logging::Severity::k##severity. It's not legal to have ::onnxruntime::Logging::Severity::##severity +// the uppercase makes the LOG macro usage look as expected for passing an enum value as it will be LOGS(logger, ERROR) +enum class Severity { + kVERBOSE = 0, + kINFO = 1, + kWARNING = 2, + kERROR = 3, + kFATAL = 4 +}; + +constexpr const char* SEVERITY_PREFIX = "VIWEF"; + +} // namespace logging +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/sink_types.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/sink_types.h new file mode 100644 index 00000000000000..a99b0fca58d9d1 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/logging/sink_types.h @@ -0,0 +1,11 @@ +#pragma once + +namespace onnxruntime { +namespace logging { +enum class SinkType { + BaseSink, + CompositeSink, + EtwSink +}; +} // namespace logging +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/make_string.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/make_string.h new file mode 100644 index 00000000000000..6148ef63e7264e --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/make_string.h @@ -0,0 +1,132 @@ +/** + * Copyright (c) 2016-present, Facebook, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+// Portions Copyright (c) Microsoft Corporation
+
+#pragma once
+
+#include <locale>
+#include <sstream>
+#include <string>
+#include <type_traits>
+
+namespace onnxruntime {
+
+namespace detail {
+
+inline void MakeStringImpl(std::ostringstream& /*ss*/) noexcept {
+}
+
+template <typename T>
+inline void MakeStringImpl(std::ostringstream& ss, const T& t) noexcept {
+  ss << t;
+}
+
+template <typename T, typename... Args>
+inline void MakeStringImpl(std::ostringstream& ss, const T& t, const Args&... args) noexcept {
+  MakeStringImpl(ss, t);
+  MakeStringImpl(ss, args...);
+}
+
+// see MakeString comments for explanation of why this is necessary
+template <typename... Args>
+inline std::string MakeStringImpl(const Args&... args) noexcept {
+  std::ostringstream ss;
+  MakeStringImpl(ss, args...);
+  return ss.str();
+}
+
+template <typename... Args>
+inline std::string MakeStringWithClassicLocaleImpl(const Args&... args) noexcept {
+  std::ostringstream ss;
+  ss.imbue(std::locale::classic());
+  MakeStringImpl(ss, args...);
+  return ss.str();
+}
+
+//
+// Infrastructure to convert char[n] to char* to reduce binary size
+//
+
+// default is to leave the type as is
+template <class T>
+struct if_char_array_make_ptr {
+  using type = T;
+};
+
+// specialization that matches an array reference, which is what the char array from a string literal
+// used in a call to MakeString will be.
+// if the type is a char[n] array we 'decay' it to a char* so that the usages can be folded.
+template <class T, size_t N>
+struct if_char_array_make_ptr<T (&)[N]> {
+  // remove a single extent (T[x] -> T, but T[x][y] -> T[y]) so we only match char[x],
+  // and get the type name without the 'const' so both 'const char (&)[n]' and 'char (&)[n]' are matched.
+  using element_type = typename std::remove_const<typename std::remove_extent<T>::type>::type;
+  using type = typename std::conditional<std::is_same<element_type, char>::value, T*, T (&)[N]>::type;
+};
+
+// helper to make usage simpler in MakeString
+template <class T>
+using if_char_array_make_ptr_t = typename if_char_array_make_ptr<T>::type;
+}  // namespace detail
+
+/**
+ * Makes a string by concatenating string representations of the arguments.
+ * This version uses the current locale.
+ */
+template <typename... Args>
+inline std::string MakeString(const Args&... args) {
+  // We need to update the types from the MakeString template instantiation to decay any char[n] to char*.
+  // e.g. MakeString("in", "out") goes from MakeString<char[3], char[4]> to MakeStringImpl<char*, char*>
+  // so that MakeString("out", "in") will also match MakeStringImpl<char*, char*> instead of requiring
+  // MakeStringImpl<char[4], char[3]>.
+  //
+  // We have to do the type processing before any actual work, so this function purely implements the type processing.
+  // If we do not do it this way we do not get the full binary size reduction.
+  //
+  // See https://stackoverflow.com/a/29418212/684911 for overall details of the approach, but note it does not cover
+  // the need to do the type processing as a separate step.
+
+  return detail::MakeStringImpl(detail::if_char_array_make_ptr_t<Args const&>(args)...);
+}
+
+/**
+ * Makes a string by concatenating string representations of the arguments.
+ * This version uses std::locale::classic().
+ */
+template <typename... Args>
+inline std::string MakeStringWithClassicLocale(const Args&... args) {
+  return detail::MakeStringWithClassicLocaleImpl(detail::if_char_array_make_ptr_t<Args const&>(args)...);
+}
+
+// MakeString versions for already-a-string types.
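+// Usage sketch (illustrative, not part of the upstream header; `n` is a
+// hypothetical local):
+//   int n = 3;
+//   std::string s = onnxruntime::MakeString("n = ", n, "!");  // "n = 3!"
+// The overloads below short-circuit the already-a-string cases so that a lone
+// std::string or const char* argument does not pay for an std::ostringstream.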
+
+inline std::string MakeString(const std::string& str) {
+  return str;
+}
+
+inline std::string MakeString(const char* cstr) {
+  return cstr;
+}
+
+inline std::string MakeStringWithClassicLocale(const std::string& str) {
+  return str;
+}
+
+inline std::string MakeStringWithClassicLocale(const char* cstr) {
+  return cstr;
+}
+
+}  // namespace onnxruntime
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/narrow.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/narrow.h
new file mode 100644
index 00000000000000..49dfbf3c459537
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/narrow.h
@@ -0,0 +1,69 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+// onnxruntime::narrow() is like gsl::narrow() but it is also available when exceptions are disabled.
+
+#if !defined(ORT_NO_EXCEPTIONS)
+
+#include "gsl/narrow"
+
+namespace onnxruntime {
+using gsl::narrow;
+}  // namespace onnxruntime
+
+#else  // ^^ !defined(ORT_NO_EXCEPTIONS) ^^ / vv defined(ORT_NO_EXCEPTIONS) vv
+
+#include <cstdio>     // std::fprintf
+#include <exception>  // std::terminate
+#include <type_traits>
+
+#include "gsl/util"  // gsl::narrow_cast
+
+namespace onnxruntime {
+
+namespace detail {
+[[noreturn]] inline void OnNarrowingError() noexcept {
+  std::fprintf(stderr, "%s", "narrowing error\n");
+  std::terminate();
+}
+}  // namespace detail
+
+// This implementation of onnxruntime::narrow was copied and adapted from:
+// https://github.com/microsoft/GSL/blob/a3534567187d2edc428efd3f13466ff75fe5805c/include/gsl/narrow
+
+// narrow() : a checked version of narrow_cast() that terminates if the cast changed the value
+template <class T, class U, typename std::enable_if<std::is_arithmetic<T>::value>::type* = nullptr>
+GSL_SUPPRESS(type.1) constexpr T narrow(U u) noexcept {
+  constexpr const bool is_different_signedness =
+      (std::is_signed<T>::value != std::is_signed<U>::value);
+
+  GSL_SUPPRESS(es.103)  // don't overflow
+  GSL_SUPPRESS(es.104)  // don't underflow
+  GSL_SUPPRESS(p.2)     // don't rely on undefined behavior
+  const T t = gsl::narrow_cast<T>(u);  // While this is technically undefined behavior in some cases (i.e., if the source value is of floating-point type
+                                       // and cannot fit into the destination integral type), the resultant behavior is benign on the platforms
+                                       // that we target (i.e., no hardware trap representations are hit).
+
+  if (static_cast<U>(t) != u || (is_different_signedness && ((t < T{}) != (u < U{})))) {
+    detail::OnNarrowingError();
+  }
+
+  return t;
+}
+
+template <class T, class U, typename std::enable_if<!std::is_arithmetic<T>::value>::type* = nullptr>
+GSL_SUPPRESS(type.1) constexpr T narrow(U u) noexcept {
+  const T t = gsl::narrow_cast<T>(u);
+
+  if (static_cast<U>(t) != u) {
+    detail::OnNarrowingError();
+  }
+
+  return t;
+}
+
+}  // namespace onnxruntime
+
+#endif  // defined(ORT_NO_EXCEPTIONS)
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/optional.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/optional.h
new file mode 100644
index 00000000000000..f7106a3bbfb1ed
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/optional.h
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+ +#pragma once +#include + +namespace onnxruntime { + +using std::optional; + +#ifndef ORT_NO_EXCEPTIONS +using std::bad_optional_access; +#endif + +using std::nullopt; +using std::nullopt_t; + +using std::in_place; +using std::in_place_t; + +using std::make_optional; + +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/parse_string.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/parse_string.h new file mode 100644 index 00000000000000..941e3f3377ecc7 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/parse_string.h @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include + +#include "core/common/common.h" + +namespace onnxruntime { + +/** + * Tries to parse a value from an entire string. + */ +template +bool TryParseStringWithClassicLocale(std::string_view str, T& value) { + if constexpr (std::is_integral::value && std::is_unsigned::value) { + // if T is unsigned integral type, reject negative values which will wrap + if (!str.empty() && str[0] == '-') { + return false; + } + } + + // don't allow leading whitespace + if (!str.empty() && std::isspace(str[0], std::locale::classic())) { + return false; + } + + std::istringstream is{std::string{str}}; + is.imbue(std::locale::classic()); + T parsed_value{}; + + const bool parse_successful = + is >> parsed_value && + is.get() == std::istringstream::traits_type::eof(); // don't allow trailing characters + if (!parse_successful) { + return false; + } + + value = std::move(parsed_value); + return true; +} + +inline bool TryParseStringWithClassicLocale(std::string_view str, std::string& value) { + value = str; + return true; +} + +inline bool TryParseStringWithClassicLocale(std::string_view str, bool& value) { + if (str == "0" || str == "False" || str == "false") { + value = false; + return true; + } + + if (str == "1" || str == "True" || str == "true") { + value = true; + return true; + } + + return false; +} + +/** + * Parses a value from an entire string. + */ +template +Status ParseStringWithClassicLocale(std::string_view s, T& value) { + ORT_RETURN_IF_NOT(TryParseStringWithClassicLocale(s, value), "Failed to parse value: \"", value, "\""); + return Status::OK(); +} + +/** + * Parses a value from an entire string. + */ +template +T ParseStringWithClassicLocale(std::string_view s) { + T value{}; + ORT_THROW_IF_ERROR(ParseStringWithClassicLocale(s, value)); + return value; +} + +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/profiler_common.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/profiler_common.h new file mode 100644 index 00000000000000..0074d5e74a461d --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/profiler_common.h @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/common.h" + +#include +#include + +namespace onnxruntime { +namespace profiling { + +enum EventCategory { + SESSION_EVENT = 0, + NODE_EVENT, + KERNEL_EVENT, + API_EVENT, + EVENT_CATEGORY_MAX +}; + +// Event descriptions for the above session events. +static constexpr const char* event_category_names_[EVENT_CATEGORY_MAX] = { + "Session", + "Node", + "Kernel", + "Api"}; + +// Timing record for all events. 
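+// Usage sketch (illustrative values only, not part of the upstream header):
+//   EventRecord e(NODE_EVENT, /*pid*/ 0, /*tid*/ 1, "Conv_0",
+//                 /*ts*/ 100, /*dur*/ 42, {{"op", "Conv"}});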
+struct EventRecord { + EventRecord() = default; + EventRecord(EventCategory category, + int process_id, + int thread_id, + std::string&& event_name, + long long time_stamp, + long long duration, + std::unordered_map&& event_args) + : cat(category), + pid(process_id), + tid(thread_id), + name(std::move(event_name)), + ts(time_stamp), + dur(duration), + args(std::move(event_args)) {} + + EventRecord(EventCategory category, + int process_id, + int thread_id, + const std::string& event_name, + long long time_stamp, + long long duration, + const std::unordered_map& event_args) + : cat(category), + pid(process_id), + tid(thread_id), + name(event_name), + ts(time_stamp), + dur(duration), + args(event_args) {} + + EventRecord(const EventRecord& other) = default; + EventRecord(EventRecord&& other) noexcept = default; + EventRecord& operator=(const EventRecord& other) = default; + EventRecord& operator=(EventRecord&& other) = default; + + EventCategory cat = EventCategory::API_EVENT; + int pid = -1; + int tid = -1; + std::string name{}; + long long ts = 0; + long long dur = 0; + std::unordered_map args{}; +}; + +using Events = std::vector; + +// Execution Provider Profiler +class EpProfiler { + public: + virtual ~EpProfiler() = default; + virtual bool StartProfiling(TimePoint profiling_start_time) = 0; // called when profiling starts + virtual void EndProfiling(TimePoint start_time, Events& events) = 0; // called when profiling ends, save all captures numbers to "events" + virtual void Start(uint64_t){}; // called before op start, accept an id as argument to identify the op + virtual void Stop(uint64_t){}; // called after op stop, accept an id as argument to identify the op +}; + +// Demangle C++ symbols +std::string demangle(const char* name); +std::string demangle(const std::string& name); + +} // namespace profiling +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/span_utils.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/span_utils.h new file mode 100644 index 00000000000000..9f7454625fcd18 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/span_utils.h @@ -0,0 +1,93 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include + +#include + +namespace onnxruntime { + +// AsSpan inspired by Fekir's Blog https://fekir.info/post/span-the-missing-constructor/ +// Used under MIT license + +// Use AsSpan for less typing on any container including initializer list to create a span +// (unnamed, untyped initializer list does not automatically convert to gsl::span). +// {1, 2, 3} as such does not have a type +// (see https://scottmeyers.blogspot.com/2014/03/if-braced-initializers-have-no-type-why.html) +// +// Example: AsSpan({1, 2, 3}) results in gsl::span +// +// The above would deduce to std::initializer_list and the result is gsl::span +// +// AsSpan({1, 2, 3}) produces gsl::span +// +// We can also do std::array{1, 2, 3} that can be automatically converted to span +// without memory allocation. +// +// If type conversion is not required, then for C++17 std::array template parameters are +// auto-deduced. Example: std::array{1, 2, 3}. +// We are aiming at not allocating memory dynamically. + +namespace details { +template +constexpr auto AsSpanImpl(P* p, size_t s) { + return gsl::span
<P>
(p, s); +} +} // namespace details + +template +constexpr auto AsSpan(C& c) { + return details::AsSpanImpl(c.data(), c.size()); +} + +template +constexpr auto AsSpan(const C& c) { + return details::AsSpanImpl(c.data(), c.size()); +} + +template +constexpr auto AsSpan(C&& c) { + return details::AsSpanImpl(c.data(), c.size()); +} + +template +constexpr auto AsSpan(std::initializer_list c) { + return details::AsSpanImpl(c.begin(), c.size()); +} + +template +constexpr auto AsSpan(T (&arr)[N]) { + return details::AsSpanImpl(arr, N); +} + +template +constexpr auto AsSpan(const T (&arr)[N]) { + return details::AsSpanImpl(arr, N); +} + +template +inline gsl::span EmptySpan() { return gsl::span(); } + +template +[[nodiscard]] inline gsl::span ReinterpretAsSpan(gsl::span src) { + // adapted from gsl-lite span::as_span(): + // https://github.com/gsl-lite/gsl-lite/blob/4720a2980a30da085b4ddb4a0ea2a71af7351a48/include/gsl/gsl-lite.hpp#L4102-L4108 + Expects(src.size_bytes() % sizeof(U) == 0); + return gsl::span(reinterpret_cast(src.data()), src.size_bytes() / sizeof(U)); +} + +[[nodiscard]] inline gsl::span AsByteSpan(const void* data, size_t length) { + return gsl::span(reinterpret_cast(data), length); +} + +template +[[nodiscard]] inline bool SpanEq(gsl::span a, gsl::span b) { + static_assert(std::is_same_v, std::remove_const_t>, + "T1 and T2 should be the same type except for const qualification"); + return std::equal(a.begin(), a.end(), b.begin(), b.end()); +} + +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/spin_pause.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/spin_pause.h new file mode 100644 index 00000000000000..49b71e5567d3e2 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/spin_pause.h @@ -0,0 +1,28 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#if defined(_M_AMD64) +#include +#endif + +#if defined(__x86_64__) +#include +#endif + +namespace onnxruntime { + +namespace concurrency { + +// Intrinsic to use in spin-loops + +inline void SpinPause() { +#if defined(_M_AMD64) || defined(__x86_64__) + _mm_pause(); +#endif +} + +} // namespace concurrency + +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/status.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/status.h new file mode 100644 index 00000000000000..8f171daabbb1ea --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/status.h @@ -0,0 +1,192 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Modifications Copyright (c) Microsoft. 
+ +#pragma once + +#include +#include +#include +#ifdef _WIN32 +#include +#endif +namespace onnxruntime { +namespace common { + +enum StatusCategory { + NONE = 0, + SYSTEM = 1, + ONNXRUNTIME = 2, +}; + +/** + Error code for ONNXRuntime. +*/ +enum StatusCode { + OK = 0, + FAIL = 1, + INVALID_ARGUMENT = 2, + NO_SUCHFILE = 3, + NO_MODEL = 4, + ENGINE_ERROR = 5, + RUNTIME_EXCEPTION = 6, + INVALID_PROTOBUF = 7, + MODEL_LOADED = 8, + NOT_IMPLEMENTED = 9, + INVALID_GRAPH = 10, + EP_FAIL = 11 +}; + +constexpr const char* StatusCodeToString(StatusCode status) noexcept { + switch (status) { + case StatusCode::OK: + return "SUCCESS"; + case StatusCode::FAIL: + return "FAIL"; + case StatusCode::INVALID_ARGUMENT: + return "INVALID_ARGUMENT"; + case StatusCode::NO_SUCHFILE: + return "NO_SUCHFILE"; + case StatusCode::NO_MODEL: + return "NO_MODEL"; + case StatusCode::ENGINE_ERROR: + return "ENGINE_ERROR"; + case StatusCode::RUNTIME_EXCEPTION: + return "RUNTIME_EXCEPTION"; + case StatusCode::INVALID_PROTOBUF: + return "INVALID_PROTOBUF"; + case StatusCode::MODEL_LOADED: + return "MODEL_LOADED"; + case StatusCode::NOT_IMPLEMENTED: + return "NOT_IMPLEMENTED"; + case StatusCode::INVALID_GRAPH: + return "INVALID_GRAPH"; + case StatusCode::EP_FAIL: + return "EP_FAIL"; + default: + return "GENERAL ERROR"; + } +} + +#ifdef _WIN32 +constexpr HRESULT StatusCodeToHRESULT(StatusCode status) noexcept { + switch (status) { + case StatusCode::OK: + return S_OK; + case StatusCode::FAIL: + return E_FAIL; + case StatusCode::INVALID_ARGUMENT: + return E_INVALIDARG; + case StatusCode::NO_SUCHFILE: + return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); + case StatusCode::NO_MODEL: + return HRESULT_FROM_WIN32(ERROR_FILE_NOT_FOUND); + case StatusCode::ENGINE_ERROR: + return E_FAIL; + case StatusCode::RUNTIME_EXCEPTION: + return E_FAIL; + case StatusCode::INVALID_PROTOBUF: + return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT); + case StatusCode::MODEL_LOADED: + return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR); + case StatusCode::NOT_IMPLEMENTED: + return E_NOTIMPL; + case StatusCode::INVALID_GRAPH: + return HRESULT_FROM_WIN32(ERROR_FILE_CORRUPT); + case StatusCode::EP_FAIL: + return HRESULT_FROM_WIN32(ERROR_INTERNAL_ERROR); + default: + return E_FAIL; + } +} +#endif + +class [[nodiscard]] Status { + public: + Status() noexcept = default; + + Status(StatusCategory category, int code, const std::string& msg); + + Status(StatusCategory category, int code, const char* msg); + + Status(StatusCategory category, int code); + + Status(const Status& other) + : state_((other.state_ == nullptr) ? 
nullptr : new State(*other.state_)) {} + Status& operator=(const Status& other) { + if (state_ != other.state_) { + if (other.state_ == nullptr) { + state_.reset(); + } else { + state_.reset(new State(*other.state_)); + } + } + return *this; + } + + Status(Status&&) = default; + Status& operator=(Status&&) = default; + ~Status() = default; + + bool IsOK() const { + return (state_ == nullptr); + } + + int Code() const noexcept; + + StatusCategory Category() const noexcept; + + const std::string& ErrorMessage() const noexcept; + + std::string ToString() const; + + bool operator==(const Status& other) const { + return (this->state_ == other.state_) || (ToString() == other.ToString()); + } + + bool operator!=(const Status& other) const { + return !(*this == other); + } + + static Status OK() { + return Status(); + } + + private: + static const std::string& EmptyString() noexcept; + + struct State { + State(StatusCategory cat0, int code0, const std::string& msg0) + : category(cat0), code(code0), msg(msg0) {} + + State(StatusCategory cat0, int code0, const char* msg0) + : category(cat0), code(code0), msg(msg0) {} + + const StatusCategory category; + const int code; + const std::string msg; + }; + + // As long as Code() is OK, state_ == nullptr. + std::unique_ptr state_; +}; + +inline std::ostream& operator<<(std::ostream& out, const Status& status) { + return out << status.ToString(); +} + +} // namespace common + +// make Status directly available in the onnxruntime namespace as it is widely used +using common::Status; + +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/string_helper.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/string_helper.h new file mode 100644 index 00000000000000..1304303132d5a4 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/common/string_helper.h @@ -0,0 +1,11 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include + +// forward declaration +struct OrtAllocator; +namespace onnxruntime { +char* StrDup(const std::string& str, OrtAllocator* allocator); +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/eager/ort_kernel_invoker.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/eager/ort_kernel_invoker.h new file mode 100644 index 00000000000000..fcf92de2ee39a9 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/eager/ort_kernel_invoker.h @@ -0,0 +1,59 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include + +#include "core/common/common.h" +#include "core/framework/allocator.h" +#include "core/framework/tensor.h" +#include "core/framework/execution_provider.h" +#include "core/graph/constants.h" +#include "core/session/environment.h" +#include "core/graph/basic_types.h" +#include "core/graph/model.h" + +namespace onnxruntime { +#ifdef __GNUC__ +#pragma GCC diagnostic push +#endif + +class ORTInvoker { + public: + ORTInvoker(std::shared_ptr execution_provider, + const logging::Logger& logger, + const IOnnxRuntimeOpSchemaRegistryList& custom_op_registries) + : execution_provider_(std::move(execution_provider)), + logger_(logger), + custom_op_registries_(custom_op_registries) { + if (!execution_provider_) { + ORT_THROW("Execution provider is nullptr"); + } + } + + IExecutionProvider& GetCurrentExecutionProvider() { + return *execution_provider_; + } + + common::Status Invoke(const std::string& op_name, + // optional inputs / outputs? + const std::vector& inputs, + std::vector& outputs, + const NodeAttributes* attributes, + const std::string& domain = kOnnxDomain, + const int version = -1); + + private: + std::shared_ptr execution_provider_; + const logging::Logger& logger_; + // custom ops for current execution provider + // we need the op schema to resolve the output type during invoke + const IOnnxRuntimeOpSchemaRegistryList& custom_op_registries_; +}; + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/alloc_kind.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/alloc_kind.h new file mode 100644 index 00000000000000..c7a953a44b872e --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/alloc_kind.h @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include + +namespace onnxruntime { +// The ml-Values fall into the following categories with respect to their +// memory management: +// - inference inputs: owned (allocated and freed) by caller, and is by +// default read-only by the runtime. +// - inference outputs: allocated by runtime, ownership transferred to +// caller. TODO: Make sure this semantics is clear in InferenceSession API. +// - weights (constant tensors): can be allocated once (statically), and +// reused by all inference calls within an InferenceSession. +// - tensor values: The lifetimes of these tensor-values are statically +// determined, which is used for memory reuse/sharing optimizations. The +// runtime allocates/frees these values at the right time (as determined +// by the static allocation plan). Note that this is simplified since we +// do not try to optimize for "slice" like ops, where we may be able to +// conditionally reuse memory/data in some cases but not others. +// Generalizing this is future work. 
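+// For example (an illustrative reading of the comment above, not upstream
+// text): an inference output maps to kAllocateOutput below, while a weight
+// that is allocated once and reused across calls maps to kAllocateStatically.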
+ +enum class AllocKind { + kNotSet = -1, + kAllocate = 0, + kReuse = 1, + kPreExisting = 2, + kAllocateStatically = 3, + kAllocateOutput = 4, + kShare = 5, + kAllocatedExternally = 6 +}; + +std::ostream& operator<<(std::ostream& out, AllocKind alloc_kind); +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/allocator.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/allocator.h new file mode 100644 index 00000000000000..57b332ce65b93b --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/allocator.h @@ -0,0 +1,268 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include "core/common/common.h" +#include "core/framework/allocator_stats.h" +// some enums are defined in session/onnxruntime_c_api.h but used in ortdevice.h/ortmemory.h +#include "core/session/onnxruntime_c_api.h" +#include "core/framework/ortdevice.h" +#include "core/framework/ortmemoryinfo.h" + +// This configures the arena based allocator used by ORT +// See docs/C_API.md for details on what these mean and how to choose these values +struct OrtArenaCfg { + OrtArenaCfg() : max_mem(0), + arena_extend_strategy(-1), + initial_chunk_size_bytes(-1), + max_dead_bytes_per_chunk(-1), + initial_growth_chunk_size_bytes(-1), + max_power_of_two_extend_bytes(-1) {} + OrtArenaCfg(size_t max_mem, int arena_extend_strategy, int initial_chunk_size_bytes, + int max_dead_bytes_per_chunk, int initial_growth_chunk_size_bytes, + int64_t max_power_of_two_extend_bytes) + : max_mem(max_mem), + arena_extend_strategy(arena_extend_strategy), + initial_chunk_size_bytes(initial_chunk_size_bytes), + max_dead_bytes_per_chunk(max_dead_bytes_per_chunk), + initial_growth_chunk_size_bytes(initial_growth_chunk_size_bytes), + max_power_of_two_extend_bytes(max_power_of_two_extend_bytes) {} + + size_t max_mem; // use 0 to allow ORT to choose the default + int arena_extend_strategy; // use -1 to allow ORT to choose the default, 0 = kNextPowerOfTwo, 1 = kSameAsRequested + int initial_chunk_size_bytes; // use -1 to allow ORT to choose the default + int max_dead_bytes_per_chunk; // use -1 to allow ORT to choose the default + int initial_growth_chunk_size_bytes; // use -1 to allow ORT to choose the default + int64_t max_power_of_two_extend_bytes; // use -1 to allow ORT to choose the default +}; + +namespace onnxruntime { +constexpr const char* CPU = "Cpu"; +constexpr const char* CUDA = "Cuda"; +constexpr const char* CUDA_PINNED = "CudaPinned"; +constexpr const char* CANN = "Cann"; +constexpr const char* CANN_PINNED = "CannPinned"; +constexpr const char* DML = "DML"; +constexpr const char* HIP = "Hip"; +constexpr const char* HIP_PINNED = "HipPinned"; +constexpr const char* OpenVINO_CPU = "OpenVINO_CPU"; +constexpr const char* OpenVINO_GPU = "OpenVINO_GPU"; +constexpr const char* OpenVINO_RT = "OpenVINO_RT"; +constexpr const char* OpenVINO_RT_NPU = "OpenVINO_RT_NPU"; +constexpr const char* WEBGPU_BUFFER = "WebGPU_Buffer"; +constexpr const char* WEBNN_TENSOR = "WebNN_Tensor"; + +constexpr size_t kAllocAlignment = 256; + +class IAllocator; +class Stream; +namespace synchronize { +class Notification; +} +using WaitNotificationFn = std::function; +void* AllocateBufferWithOptions(IAllocator& allocator, size_t size, bool use_reserve, Stream* stream, WaitNotificationFn wait_fn); + +template +using IAllocatorUniquePtr = std::unique_ptr>; + +class IAllocator { + public: + 
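+  // Usage sketch (illustrative, not part of the upstream header): the common
+  // consumption pattern is a typed, self-freeing buffer --
+  //   AllocatorPtr alloc = std::make_shared<CPUAllocator>();
+  //   auto data = IAllocator::MakeUniquePtr<float>(alloc, /*count*/ 16);
+  // MakeUniquePtr is declared further down in this class; CPUAllocator is
+  // declared near the end of this header.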
IAllocator(const OrtMemoryInfo& info) : memory_info_(info) {} + virtual ~IAllocator() = default; + /** + * Allocate memory of the specified size. + * If size is 0, nullptr is returned. + * If allocation fails, an exception is thrown. + * + * @remarks Use SafeInt when calculating the size of memory to allocate using Alloc. + */ + virtual void* Alloc(size_t size) = 0; + + virtual void Free(void* p) = 0; + + // Reserve() is an interface exposed for an implementation of IAllocator + // to optionally implement some allocation logic that by-passes any arena-based + // logic that may be housed in the Alloc() implementation. + // There are SessionOptions config(s) that allow users to allocate some memory + // by-passing arena-based logic. + // By default, the base implementation just calls Alloc(). + virtual void* Reserve(size_t size) { return Alloc(size); } + + const OrtMemoryInfo& Info() const { return memory_info_; }; + + // Each implementation of IAllocator can override and provide their own implementation + virtual void GetStats(AllocatorStats* /*stats*/) { return; } + + static bool CalcMemSizeForArray(size_t nmemb, size_t size, size_t* out) noexcept { + return CalcMemSizeForArrayWithAlignment(nmemb, size, 0, out); + } + + /** + * Calculate the memory size for an array. The size is bounds checked using SafeInt. + * \tparam alignment must be power of 2 + * \param nmemb Number of members or elements in the array + * \param size Size of each element + * \param out Total size required after any alignment is applied + * \return true, successful. false, overflow + */ + [[nodiscard]] static bool CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, size_t alignment, + size_t* out) noexcept; + + /** + * https://cwe.mitre.org/data/definitions/190.html + * \param alignment must be power of 2 + * \param nmemb Number of members or elements in the array + * \param size Size of each element + * \param out Total size required after any alignment is applied + * \return true, successful. false, overflow + * \remarks This was the original API and was implemented in the header. Replaced with the above version + * implemented in the .cc file so that the SafeInt dependency is internal. + */ + template + [[nodiscard]] static bool CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, size_t* out) noexcept; + + /** + * allocate memory for an array which has nmemb items of data, each size bytes long + */ + void* AllocArray(size_t nmemb, size_t size) { + size_t len; + if (!CalcMemSizeForArray(nmemb, size, &len)) { + ORT_THROW("Invalid size requested for allocation: ", nmemb, " * ", size); + } + + return Alloc(len); + } + + /** + * allocate memory for an array which has nmemb items of data, each size bytes long + */ + template + void* AllocArrayWithAlignment(size_t nmemb, size_t size) { + size_t len; + if (!CalcMemSizeForArrayWithAlignment(nmemb, size, alignment, &len)) { + ORT_THROW("Invalid size requested for allocation: ", nmemb, " * ", size, " with alignment ", alignment); + } + + return Alloc(len); + } + + /** + Create a std::unique_ptr that is allocated and freed by the provided IAllocator. + @param allocator The allocator. + @param count_or_bytes The exact bytes to allocate if T is void, otherwise the number of elements to allocate. + @param use_reserve If true, call Reserve() instead of Alloc() to allocate memory. + @param stream Which stream instance allocated chunk will be used with. 
+ @param wait_fn If the allocator want to dynamic reuse a chunk from another stream, use this wait_fn to sync on + the target stream to make the reuse safe. + @returns std::unique_ptr with allocated memory and deleter. Throws if it cannot allocate memory. + */ + template + static IAllocatorUniquePtr MakeUniquePtr(std::shared_ptr allocator, size_t count_or_bytes, + bool use_reserve = false, + Stream* stream = nullptr, WaitNotificationFn wait_fn = nullptr) { + ValidateAllocator(allocator); + + // for now limit to fundamental types. we could support others, but to do so either we or the caller + // needs to call the dtor for the objects, for buffers allocated on device we don't have destructor + // static_assert(std::is_fundamental::value, "Fundamental type required as no destructors are called."); + + size_t alloc_size = count_or_bytes; + + // if T is not void, 'count_or_bytes' == number of items so allow for that + if constexpr (!std::is_void::value) { + // sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't + // reachable if T is void. use std::conditional to 'use' void* in the sizeof call + constexpr auto size = sizeof(typename std::conditional::value, void*, T>::type); + alloc_size = ValidatedCalcMemSizeForArray(count_or_bytes, size); + } + + // allocate + T* p = static_cast(AllocateBufferWithOptions(*allocator, alloc_size, use_reserve, stream, std::move(wait_fn))); + ValidateAllocation(p, alloc_size); + + return IAllocatorUniquePtr{p, + [allocator = std::move(allocator)](T* p) { + allocator->Free(p); + }}; + } + + /** + Create a std::unique_ptr that is allocated and freed by the provided OrtAllocator. + @param ort_allocator The allocator. + @param count_or_bytes The exact bytes to allocate if T is void, otherwise the number of elements to allocate. + @returns std::unique_ptr with allocated memory and deleter. Throws if it cannot allocate memory. + */ + template + static IAllocatorUniquePtr MakeUniquePtrFromOrtAllocator(OrtAllocator* ort_allocator, size_t count_or_bytes) { + ValidateAllocator(ort_allocator); + + size_t alloc_size = count_or_bytes; + // if T is not void, 'count_or_bytes' == number of items so allow for that + if constexpr (!std::is_void::value) { + // sizeof(void) isn't valid, but the compiler isn't smart enough to ignore that this line isn't + // reachable if T is void. use std::conditional to 'use' void* in the sizeof call + constexpr auto size = sizeof(typename std::conditional::value, void*, T>::type); + alloc_size = ValidatedCalcMemSizeForArray(count_or_bytes, size); + } + + T* p = static_cast(ort_allocator->Alloc(ort_allocator, alloc_size)); + ValidateAllocation(p, alloc_size); + + return IAllocatorUniquePtr{p, + [ort_allocator](T* p) { + ort_allocator->Free(ort_allocator, p); + }}; + } + + private: + // + // validation functions. split out from methods that are templatized on the data type to minimize binary size. + // + + template + static void ValidateAllocator(const T& allocator) { + ORT_ENFORCE(allocator != nullptr); + } + + static size_t ValidatedCalcMemSizeForArray(size_t count, size_t size) { + size_t alloc_size = 0; + if (!CalcMemSizeForArray(count, size, &alloc_size)) { + ORT_THROW("Invalid size requested for allocation: ", count, " * ", size); + } + + return alloc_size; + } + + static void ValidateAllocation(void* p, size_t size) { + // allocator should throw directly but in case it didn't ensure we do here so that calling code doesn't + // need to check for nullptr when an actual allocation was expected. 
+ ORT_ENFORCE(p != nullptr || size == 0, "Memory allocation failed. Size=", size); + }; + + OrtMemoryInfo memory_info_; +}; + +template +bool IAllocator::CalcMemSizeForArrayWithAlignment(size_t nmemb, size_t size, size_t* out) noexcept { + return CalcMemSizeForArrayWithAlignment(nmemb, size, alignment, out); +} + +class CPUAllocator : public IAllocator { + public: + explicit CPUAllocator(const OrtMemoryInfo& memory_info) : IAllocator(memory_info) {} + + CPUAllocator() : IAllocator(OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator)) {} + + void* Alloc(size_t size) override; + void Free(void* p) override; +}; + +using AllocatorPtr = std::shared_ptr; +using AllocatorMap = std::map; + +void* AllocatorDefaultAlloc(size_t size); +void AllocatorDefaultFree(void* p); +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/buffer_deleter.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/buffer_deleter.h new file mode 100644 index 00000000000000..961eb443ee1c7a --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/buffer_deleter.h @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/framework/allocator.h" + +namespace onnxruntime { + +// TODO: Do we need this class or is IAllocator::MakeUniquePtr sufficient/better +class BufferDeleter { + public: + BufferDeleter() = default; + explicit BufferDeleter(AllocatorPtr alloc) + : alloc_(std::move(alloc)) {} + + void operator()(void* p) const { + if (alloc_) + alloc_->Free(p); + } + + private: + // TODO: we may need consider the lifetime of alloc carefully + // The alloc_ here is the allocator that used to allocate the buffer + // And need go with the unique_ptr together. If it is using our internal + // allocator, it is ok as our allocators are global managed. But if it + // is provide by user, user need to be very careful about it. + // A weak_ptr may be a choice to reduce the impact, but that require to + // change our current allocator mgr to use shared_ptr. Will revisit it + // later. + AllocatorPtr alloc_{nullptr}; +}; + +using BufferUniquePtr = std::unique_ptr; +using BufferNakedPtr = void*; +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/customregistry.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/customregistry.h new file mode 100644 index 00000000000000..52f6169e2e8294 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/customregistry.h @@ -0,0 +1,60 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/common/status.h" +#include "core/common/logging/logging.h" +#include "core/framework/op_kernel.h" +#include "core/framework/kernel_def_builder.h" +#include "core/framework/kernel_registry.h" + +#if !defined(ORT_MINIMAL_BUILD) +#include "core/graph/schema_registry.h" +#endif + +namespace onnxruntime { + +/** + Represents a registry that contains both custom kernels and custom schemas. +*/ +class CustomRegistry final { + public: + CustomRegistry() + : kernel_registry_(std::make_shared()) +#if !defined(ORT_MINIMAL_BUILD) + , + opschema_registry_(std::make_shared()) +#endif + { + } + + /** + * Register a kernel definition together with kernel factory method to this session. 
+ * If any conflict happened between registered kernel def and built-in kernel def, + * registered kernel will have higher priority. + * Call this before invoking Initialize(). + * @return OK if success. + */ + common::Status RegisterCustomKernel(KernelDefBuilder& kernel_def_builder, const KernelCreateFn& kernel_creator); + + common::Status RegisterCustomKernel(KernelCreateInfo&); + + const std::shared_ptr& GetKernelRegistry(); + +#if !defined(ORT_MINIMAL_BUILD) + common::Status RegisterOpSet(std::vector& schemas, const std::string& domain, + int baseline_opset_version, int opset_version); + + const std::shared_ptr& GetOpschemaRegistry(); +#endif + + private: + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CustomRegistry); + std::shared_ptr kernel_registry_; +#if !defined(ORT_MINIMAL_BUILD) + std::shared_ptr opschema_registry_; +#endif +}; + +} // namespace onnxruntime diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/data_types.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/data_types.h new file mode 100644 index 00000000000000..87feefa10ca4a0 --- /dev/null +++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/data_types.h @@ -0,0 +1,1125 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "core/common/common.h" +#include "core/common/exceptions.h" +#include "core/framework/endian.h" +#include "core/framework/float8.h" +#include "core/framework/float16.h" +#include "core/framework/int4.h" +#include "core/graph/onnx_protobuf.h" +#include "core/framework/to_tensor_proto_element_type.h" + +struct OrtValue; + +namespace ONNX_NAMESPACE { +class TypeProto; +} // namespace ONNX_NAMESPACE + +namespace onnxruntime { +/// Predefined registered types + +#if !defined(DISABLE_ML_OPS) + +// maps (only used by ML ops) +using MapStringToString = std::map; +using MapStringToInt64 = std::map; +using MapStringToFloat = std::map; +using MapStringToDouble = std::map; +using MapInt64ToString = std::map; +using MapInt64ToInt64 = std::map; +using MapInt64ToFloat = std::map; +using MapInt64ToDouble = std::map; + +// vectors/sequences +using VectorMapStringToFloat = std::vector; +using VectorMapInt64ToFloat = std::vector; + +#endif + +using VectorString = std::vector; +using VectorInt64 = std::vector; + +// Forward declarations +class DataTypeImpl; +class TensorTypeBase; +#if !defined(DISABLE_SPARSE_TENSORS) +class SparseTensorTypeBase; +#endif +class SequenceTensorTypeBase; +class NonTensorTypeBase; +#if !defined(DISABLE_OPTIONAL_TYPE) +class OptionalTypeBase; +#endif +class PrimitiveDataTypeBase; +class Tensor; +class TensorSeq; + +// DataTypeImpl pointer as unique DataTypeImpl identifier. 
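+// Since the pointer itself is the identity, runtime type checks are pointer
+// comparisons, e.g. (illustrative):
+//   if (dtype == DataTypeImpl::GetType<float>()) { /* float element type */ }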
+using MLDataType = const DataTypeImpl*; +// be used with class MLValue +using DeleteFunc = void (*)(void*); +using CreateFunc = void* (*)(); + +/** + * \brief Base class for MLDataType + * + */ +class DataTypeImpl { + public: + enum class GeneralType { + kInvalid = 0, + kNonTensor = 1, + kTensor = 2, + kTensorSequence = 3, + kSparseTensor = 4, + kOptional = 5, + kPrimitive = 6, + }; + + const GeneralType type_; + const size_t size_; + + protected: + DataTypeImpl(GeneralType type, size_t size) : type_{type}, size_{size} {} + + public: + virtual ~DataTypeImpl() = default; + + /** + * \brief this API will be used to check type compatibility at runtime + * + * \param type_proto a TypeProto instance that is constructed for a specific type + * will be checked against a TypeProto instance contained within a corresponding + * MLDataType instance. + */ + virtual bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const = 0; + + size_t Size() const { return size_; } + + virtual DeleteFunc GetDeleteFunc() const = 0; + + /** + * \brief Retrieves an instance of TypeProto for + * a given MLDataType + * \returns optional TypeProto. Only ONNX types + has type proto, non-ONNX types will return nullptr. + */ + virtual const ONNX_NAMESPACE::TypeProto* GetTypeProto() const = 0; + + bool IsTensorType() const { + return type_ == GeneralType::kTensor; + } + + bool IsTensorSequenceType() const { + return type_ == GeneralType::kTensorSequence; + } + + bool IsSparseTensorType() const { + return type_ == GeneralType::kSparseTensor; + } + + bool IsOptionalType() const { + return type_ == GeneralType::kOptional; + } + + bool IsNonTensorType() const { + return type_ == GeneralType::kNonTensor; + } + + bool IsPrimitiveDataType() const { + return type_ == GeneralType::kPrimitive; + } + + // Returns this if this is of tensor-type and null otherwise + const TensorTypeBase* AsTensorType() const; + + const SequenceTensorTypeBase* AsSequenceTensorType() const; + +#if !defined(DISABLE_SPARSE_TENSORS) + // Returns this if this is of sparse-tensor-type and null otherwise + const SparseTensorTypeBase* AsSparseTensorType() const; +#endif + +#if !defined(DISABLE_OPTIONAL_TYPE) + const OptionalTypeBase* AsOptionalType() const; +#endif + + const NonTensorTypeBase* AsNonTensorType() const; + + // Returns this if this is one of the primitive data types (specialization of PrimitiveDataTypeBase) + // and null otherwise + const PrimitiveDataTypeBase* AsPrimitiveDataType() const; + + // Return the type meta that we are using in the runtime. + template + static MLDataType GetType(); + + // Return the types for a concrete tensor type, like Tensor_Float + template + static MLDataType GetTensorType(); + + template + static MLDataType GetSequenceTensorType(); + +#if !defined(DISABLE_SPARSE_TENSORS) + // Return the MLDataType for a concrete sparse tensor type. + template + static MLDataType GetSparseTensorType(); +#endif + + template + static MLDataType GetOptionalType(); + + /** + * Convert an ONNX TypeProto to onnxruntime DataTypeImpl. + * However, this conversion is lossy. Don't try to use 'this->GetTypeProto()' converting it back. + * Even though GetTypeProto() will not have the original information, it will still have enough to correctly + * map to MLDataType. 
+ * \param proto + */ + static MLDataType TypeFromProto(const ONNX_NAMESPACE::TypeProto& proto); + + static const TensorTypeBase* TensorTypeFromONNXEnum(int type); + static const SequenceTensorTypeBase* SequenceTensorTypeFromONNXEnum(int type); +#if !defined(DISABLE_SPARSE_TENSORS) + static const SparseTensorTypeBase* SparseTensorTypeFromONNXEnum(int type); +#endif + + static const char* ToString(MLDataType type); + static std::vector ToString(const std::vector& types); + // Registers ONNX_NAMESPACE::DataType (internalized string) with + // MLDataType. DataType is produced by internalizing an instance of + // TypeProto contained within MLDataType + static void RegisterDataType(MLDataType); + static MLDataType GetDataType(const std::string&); + + // IR4: includes all float types, includes float16, bfloat16 + // IR9: includes float 8 types as well + static const std::vector& AllTensorTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllTensorTypesIRv4(); + static const std::vector& AllTensorTypesIRv9(); + + static const std::vector& AllFixedSizeTensorTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllFixedSizeTensorTypesIRv4(); + static const std::vector& AllFixedSizeTensorTypesIRv9(); + + static const std::vector& AllSequenceTensorTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllSequenceTensorTypesIRv4(); + static const std::vector& AllSequenceTensorTypesIRv9(); + + static const std::vector& AllFixedSizeSequenceTensorTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllFixedSizeSequenceTensorTypesIRv4(); + static const std::vector& AllFixedSizeSequenceTensorTypesIRv9(); + + static const std::vector& AllNumericTensorTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllNumericTensorTypesIRv4(); + static const std::vector& AllNumericTensorTypesIRv9(); + + static const std::vector& AllIEEEFloatTensorTypes(); // float16, float, double + + static const std::vector& AllTensorAndSequenceTensorTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllTensorAndSequenceTensorTypesIRv4(); + static const std::vector& AllTensorAndSequenceTensorTypesIRv9(); + + static const std::vector& AllOptionalAndTensorAndSequenceTensorTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllOptionalAndTensorAndSequenceTensorTypesIRv4(); + static const std::vector& AllOptionalAndTensorAndSequenceTensorTypesIRv9(); + + static const std::vector& AllFixedSizeTensorAndSequenceTensorTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllFixedSizeTensorAndSequenceTensorTypesIRv4(); + static const std::vector& AllFixedSizeTensorAndSequenceTensorTypesIRv9(); + + static const std::vector& AllOptionalTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllOptionalTypesIRv4(); + static const std::vector& AllOptionalTypesIRv9(); + + static const std::vector& AllTensorAndSequenceTensorAndOptionalTypes(); // up to IR4 (no float 8), deprecated + static const std::vector& AllTensorAndSequenceTensorAndOptionalTypesIRv4(); + static const std::vector& AllTensorAndSequenceTensorAndOptionalTypesIRv9(); +}; + +std::ostream& operator<<(std::ostream& out, MLDataType data_type); + +/* + * Type registration helpers + */ +namespace data_types_internal { +/// TensorType helpers +/// + +/// Is a given type on the list of types? 
+/// Accepts a list of types and the first argument is the type +/// We are checking if it is listed among those that follow +template +struct IsAnyOf; + +/// Two types remaining, end of the list +template +struct IsAnyOf : public std::is_same { +}; + +template +struct IsAnyOf { + static constexpr bool value = (std::is_same::value || + IsAnyOf::value); +}; + +/// Tells if the specified type is one of fundamental types +/// that can be contained within a tensor. +/// We do not have raw fundamental types, rather a subset +/// of fundamental types is contained within tensors. +template +struct IsTensorContainedType : public IsAnyOf { +}; + +#if !defined(DISABLE_SPARSE_TENSORS) +/// Use "IsSparseTensorContainedType::value" to test if a type T +/// is permitted as the element-type of a sparse-tensor. + +template +struct IsSparseTensorContainedType : public IsAnyOf { +}; +#endif + +#if !defined(DISABLE_OPTIONAL_TYPE) +/// Tells if the specified type is one of ORT types +/// that can be contained within an optional struct. +template +struct IsOptionalOrtType : public IsAnyOf { +}; +#endif + +/// This template's Get() returns a corresponding MLDataType +/// It dispatches the call to either GetTensorType<>() or +/// GetType<>() +template +struct GetMLDataType; + +template +struct GetMLDataType { + static MLDataType Get() { + return DataTypeImpl::GetTensorType(); + } +}; + +template +struct GetMLDataType { + static MLDataType Get() { + return DataTypeImpl::GetType(); + } +}; + +struct TensorTypeHelper { + static void Set(ONNX_NAMESPACE::TensorProto_DataType element_type, + ONNX_NAMESPACE::TypeProto& proto) { + proto.mutable_tensor_type()->set_elem_type(element_type); + } +}; + +#if !defined(DISABLE_SPARSE_TENSORS) +struct SparseTensorTypeHelper { + static void Set(ONNX_NAMESPACE::TensorProto_DataType element_type, + ONNX_NAMESPACE::TypeProto& proto) { + proto.mutable_sparse_tensor_type()->set_elem_type(element_type); + } +}; +#endif // !defined(DISABLE_SPARSE_TENSORS) + +#if !defined(DISABLE_ML_OPS) +/// Map helpers + +void CopyMutableMapValue(const ONNX_NAMESPACE::TypeProto&, + ONNX_NAMESPACE::TypeProto&); + +struct MapTypeHelper { + // V can be either a primitive type (in which case it is a tensor) + // or other preregistered types + template + static MLDataType GetValueType() { + return GetMLDataType::value>::Get(); + } + + static void Set(ONNX_NAMESPACE::TensorProto_DataType key_type, const ONNX_NAMESPACE::TypeProto* value_proto, + ONNX_NAMESPACE::TypeProto& proto) { + ORT_ENFORCE(value_proto != nullptr, "expected a registered ONNX type"); + proto.mutable_map_type()->set_key_type(key_type); + CopyMutableMapValue(*value_proto, proto); + } +}; +#endif + +/// Sequence helpers + +// Element type is a primitive type so we set it to a tensor +void CopyMutableSeqElement(const ONNX_NAMESPACE::TypeProto&, + ONNX_NAMESPACE::TypeProto&); + +// helper to create TypeProto with minimal binary size impact +struct SequenceTypeHelper { + template + static MLDataType GetElemType() { + return GetMLDataType::value>::Get(); + } + + static void Set(const ONNX_NAMESPACE::TypeProto* elem_proto, + ONNX_NAMESPACE::TypeProto& proto) { + ORT_ENFORCE(elem_proto != nullptr, "expected a registered ONNX type"); + CopyMutableSeqElement(*elem_proto, proto); + } +}; + +/// Optional helpers + +void CopyMutableOptionalElement(const ONNX_NAMESPACE::TypeProto&, + ONNX_NAMESPACE::TypeProto&); + +// helper to create TypeProto with minimal binary size impact +struct OptionalTypeHelper { + template + static MLDataType GetElemType() { + 
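+    // Compile-time dispatch: optional<Tensor> resolves to the tensor
+    // MLDataType and optional<TensorSeq> to the sequence-of-tensors
+    // MLDataType; any other T trips the static_assert below.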
if constexpr (std::is_same::value) { + return DataTypeImpl::GetTensorType(); + } else { + static_assert(std::is_same::value, "Unsupported element type for optional type"); + return DataTypeImpl::GetSequenceTensorType(); + } + } + + static void Set(const onnx::TypeProto* elem_proto, ONNX_NAMESPACE::TypeProto& proto) { + ORT_ENFORCE(elem_proto != nullptr, "expected a registered ONNX type"); + CopyMutableOptionalElement(*elem_proto, proto); + } +}; + +/// OpaqueTypes helpers + +void AssignOpaqueDomainName(const char* domain, const char* name, + ONNX_NAMESPACE::TypeProto& proto); + +} // namespace data_types_internal + +// The suppressed warning is: "The type with a virtual function needs either public virtual or protected nonvirtual destructor." +// However, we do not allocate this type on heap. +#if defined(_MSC_VER) && !defined(__clang__) +#pragma warning(push) +#pragma warning(disable : 26436) +#endif +/// All tensors base +class TensorTypeBase : public DataTypeImpl { + public: + static MLDataType Type(); + + /// We first compare type_proto pointers and then + /// if they do not match try to account for the case + /// where TypeProto was created ad-hoc and not queried from MLDataType + bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override; + + DeleteFunc GetDeleteFunc() const override; + + const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override; + + virtual MLDataType GetElementType() const { + // should never reach here. + ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented"); + } + + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TensorTypeBase); + + protected: + ONNX_NAMESPACE::TypeProto& MutableTypeProto(); + + TensorTypeBase(); + ~TensorTypeBase() override; + + private: + struct Impl; + Impl* impl_; +}; + +/** + * \brief Tensor type. This type does not have a C++ type associated with + * it at registration time except the element type. One of the types mentioned + * above at IsTensorContainedType<> list is acceptable. + * + * \details + * Usage: + * ORT_REGISTER_TENSOR(ELEMENT_TYPE) + * Currently all of the Tensors irrespective of the dimensions are mapped to Tensor + * type. IsCompatible() currently ignores shape. + */ + +template +class TensorType : public TensorTypeBase { + public: + static_assert(data_types_internal::IsTensorContainedType::value, + "Requires one of the tensor fundamental types"); + + static MLDataType Type(); + + /// Tensors only can contain basic data types + /// that have been previously registered with ONNXRuntime + MLDataType GetElementType() const override { + return DataTypeImpl::GetType(); + } + + private: + TensorType() { + using namespace data_types_internal; + TensorTypeHelper::Set(utils::ToTensorProtoElementType(), MutableTypeProto()); + } +}; + +#if defined(DISABLE_OPTIONAL_TYPE) + +// TODO is this still needed after removing kernel def hashes? +/// Common base-class for all disabled types. We need DataTypeImpl::ToString to work in a minimal build +/// with disabled types to keep the ORT format model kernel hashes stable. +class DisabledTypeBase : public DataTypeImpl { + public: + static MLDataType Type(); + + bool IsCompatible(const ONNX_NAMESPACE::TypeProto&) const override { + // We always want to return false for the IsCompatible() for a disabled type + // because this will ensure that no kernel supporting the disabled type will + // be matched to a model node requiring that type and the model load will + // result in failure. 
+#if defined(DISABLE_OPTIONAL_TYPE)
+
+// TODO is this still needed after removing kernel def hashes?
+/// Common base-class for all disabled types. We need DataTypeImpl::ToString to work in a minimal build
+/// with disabled types to keep the ORT format model kernel hashes stable.
+class DisabledTypeBase : public DataTypeImpl {
+ public:
+  static MLDataType Type();
+
+  bool IsCompatible(const ONNX_NAMESPACE::TypeProto&) const override {
+    // We always want to return false for the IsCompatible() for a disabled type
+    // because this will ensure that no kernel supporting the disabled type will
+    // be matched to a model node requiring that type and the model load will
+    // result in failure.
+    return false;
+  }
+
+  DeleteFunc GetDeleteFunc() const override {
+    ORT_THROW("Type is disabled in this build.");
+  }
+
+  // This must work
+  const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
+
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(DisabledTypeBase);
+
+ protected:
+  // This must work
+  ONNX_NAMESPACE::TypeProto& MutableTypeProto();
+
+  DisabledTypeBase(DataTypeImpl::GeneralType type, size_t size);
+  ~DisabledTypeBase() override;
+
+ private:
+  struct Impl;
+  Impl* impl_;
+};
+
+#endif
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+/// Common base-class for all sparse-tensors (with different element types).
+class SparseTensorTypeBase : public DataTypeImpl {
+ public:
+  static MLDataType Type();
+
+  bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override;
+
+  DeleteFunc GetDeleteFunc() const override;
+
+  const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
+
+  virtual MLDataType GetElementType() const {
+    // should never reach here.
+    ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
+  }
+
+  ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SparseTensorTypeBase);
+
+ protected:
+  ONNX_NAMESPACE::TypeProto& MutableTypeProto();
+
+  SparseTensorTypeBase();
+  ~SparseTensorTypeBase() override;
+
+ private:
+  struct Impl;
+  Impl* impl_;
+};
+
+template <typename elemT>
+class SparseTensorType : public SparseTensorTypeBase {
+ public:
+  static_assert(data_types_internal::IsSparseTensorContainedType<elemT>::value,
+                "Requires one of the sparse-tensor fundamental types");
+
+  static MLDataType Type();
+
+  /// Return a MLDataType representing the element-type
+  MLDataType GetElementType() const override {
+    return DataTypeImpl::GetType<elemT>();
+  }
+
+ private:
+  SparseTensorType() {
+    using namespace data_types_internal;
+    SparseTensorTypeHelper::Set(utils::ToTensorProtoElementType<elemT>(), MutableTypeProto());
+  }
+};
+
+#endif  // !defined(DISABLE_SPARSE_TENSORS)
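(Aside: the sparse path mirrors the dense sketch earlier; a brief illustration, assuming float was registered via ORT_REGISTER_SPARSE_TENSOR_TYPE.)

    MLDataType sdt = DataTypeImpl::GetSparseTensorType<float>();
    const SparseTensorTypeBase* sparse = sdt->AsSparseTensorType();
    MLDataType elem = sparse->GetElementType();  // PrimitiveDataType<float>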
+/// Common base-class for all optional types.
+
+#if !defined(DISABLE_OPTIONAL_TYPE)
+class OptionalTypeBase : public DataTypeImpl {
+ public:
+  static MLDataType Type();
+
+  bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override;
+
+  DeleteFunc GetDeleteFunc() const override {
+    // should never reach here.
+    ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
+  }
+
+  const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
+
+  virtual MLDataType GetElementType() const {
+    // should never reach here.
+    ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
+  }
+
+  OptionalTypeBase(const OptionalTypeBase&) = delete;
+  OptionalTypeBase& operator=(const OptionalTypeBase&) = delete;
+
+ protected:
+  ONNX_NAMESPACE::TypeProto& MutableTypeProto();
+
+  OptionalTypeBase();
+  ~OptionalTypeBase() override;
+
+ private:
+  struct Impl;
+  Impl* impl_;
+};
+#endif
+
+// Derive from OptionalTypeBase if the Optional type support is enabled,
+// else derive from DisabledTypeBase
+template <typename T, typename elemT>
+class OptionalType :
+#if !defined(DISABLE_OPTIONAL_TYPE)
+    public OptionalTypeBase
+#else
+    public DisabledTypeBase
+#endif
+{
+ public:
+  static MLDataType Type();
+
+#if !defined(DISABLE_OPTIONAL_TYPE)
+  static_assert(data_types_internal::IsOptionalOrtType<T>::value,
+                "Requires one of the supported types: Tensor or TensorSeq");
+
+  static_assert(data_types_internal::IsTensorContainedType<elemT>::value,
+                "Requires one of the tensor fundamental types");
+
+  MLDataType GetElementType() const override {
+    return data_types_internal::OptionalTypeHelper::GetElemType<T, elemT>();
+  }
+#endif
+
+ private:
+#if !defined(DISABLE_OPTIONAL_TYPE)
+  OptionalType()
+#else
+  OptionalType() : DisabledTypeBase{DataTypeImpl::GeneralType::kOptional, 0}
+#endif
+  {
+    using namespace data_types_internal;
+    OptionalTypeHelper::Set(OptionalTypeHelper::GetElemType<T, elemT>()->GetTypeProto(), MutableTypeProto());
+  }
+};
+
+/**
+ * \brief Provide a specialization for your C++ Non-tensor type
+ * so your implementation FromDataTypeContainer/ToDataTypeContainer
+ * functions correctly. Otherwise you get a default implementation
+ * which may not be what you need/want.
+ *
+ * This class is used to create OrtValue, fetch data from OrtValue via
+ * C/C++ APIs
+ */
+template <class T>
+struct NonTensorTypeConverter {
+  static void FromContainer(MLDataType /*dtype*/, const void* /*data*/, size_t /*data_size*/, OrtValue& /*output*/) {
+    ORT_THROW("Not implemented");
+  }
+  static void ToContainer(const OrtValue& /*input*/, size_t /*data_size*/, void* /*data*/) {
+    ORT_THROW("Not implemented");
+  }
+};
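(Aside: a hedged sketch of how a custom non-tensor type could plug into the C API through this hook. MyState is hypothetical, and the OrtValue::Init/Get calls are assumptions about the runtime-side API, not part of this header.)

    struct MyState {
      int64_t seed;
    };

    template <>
    struct NonTensorTypeConverter<MyState> {
      static void FromContainer(MLDataType dtype, const void* data, size_t data_size, OrtValue& output) {
        ORT_ENFORCE(data_size == sizeof(MyState), "size mismatch");
        // Copy the POD payload into a fresh MyState owned by the OrtValue.
        auto state = std::make_unique<MyState>(*static_cast<const MyState*>(data));
        output.Init(state.release(), dtype, dtype->GetDeleteFunc());
      }
      static void ToContainer(const OrtValue& input, size_t data_size, void* data) {
        ORT_ENFORCE(data_size == sizeof(MyState), "size mismatch");
        *static_cast<MyState*>(data) = input.Get<MyState>();
      }
    };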
+/**
+ * \brief Base type for all non-tensors, maps, sequences and opaques
+ */
+class NonTensorTypeBase : public DataTypeImpl {
+ public:
+  DeleteFunc GetDeleteFunc() const override = 0;
+
+  virtual CreateFunc GetCreateFunc() const = 0;
+
+  const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
+
+  // \brief Override for Non-tensor types to initialize non-tensor CPP
+  // data representation from data. The caller of the interface
+  // should have a shared definition of the data which is used to initialize
+  // CPP data representation. This is used from C API.
+  //
+  // \param data - pointer to a data container structure non_tensor type specific
+  // \param data_size - size of the data container structure, used for rudimentary checks
+  // \param output - reference to a default constructed non-tensor type
+  // \returns OrtValue
+  // \throw if there is an error
+  virtual void FromDataContainer(const void* data, size_t data_size, OrtValue& output) const;
+
+  // \brief Override for Non-tensor types to fetch data from the internal CPP data representation
+  // The caller of the interface should have a shared definition of the data which is used to initialize
+  // CPP data representation. This is used from C API.
+  //
+  // \param input - OrtValue containing data
+  // \param data_size - size of the structure that is being passed for receiving data, used for
+  //                    validation
+  // \param data - pointer to receiving data structure
+  virtual void ToDataContainer(const OrtValue& input, size_t data_size, void* data) const;
+
+  NonTensorTypeBase(const NonTensorTypeBase&) = delete;
+  NonTensorTypeBase& operator=(const NonTensorTypeBase&) = delete;
+
+ protected:
+  NonTensorTypeBase(size_t size);
+  ~NonTensorTypeBase() override;
+
+  ONNX_NAMESPACE::TypeProto& MutableTypeProto();
+
+  bool IsMapCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const;
+
+  bool IsSequenceCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const;
+
+  bool IsOpaqueCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const;
+
+ private:
+  struct Impl;
+  Impl* impl_;
+};
+
+// This is where T is the actual CPPRuntimeType
+template <typename T>
+class NonTensorType : public NonTensorTypeBase {
+ private:
+  static void Delete(void* p) {
+    delete static_cast<T*>(p);
+  }
+
+ public:
+  DeleteFunc GetDeleteFunc() const override {
+    return &Delete;
+  }
+
+  CreateFunc GetCreateFunc() const override {
+    return []() -> void* { return new T(); };
+  }
+
+ protected:
+  NonTensorType() : NonTensorTypeBase(sizeof(T)) {}
+};
+
+#if !defined(DISABLE_ML_OPS)
+/**
+ * \brief MapType. Use this type to register
+ * mapping types.
+ *
+ * \param T - cpp type that you wish to register as runtime MapType
+ *
+ * \details Usage: ORT_REGISTER_MAP(C++Type)
+ *          The type is required to have mapped_type and
+ *          key_type defined
+ */
+template <typename CPPType>
+class MapType : public NonTensorType<CPPType> {
+ public:
+  static_assert(data_types_internal::IsTensorContainedType<typename CPPType::key_type>::value,
+                "Requires one of the tensor fundamental types as key");
+
+  static MLDataType Type();
+
+  bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override {
+    return this->IsMapCompatible(type_proto);
+  }
+
+ private:
+  MapType() {
+    using namespace data_types_internal;
+    MapTypeHelper::Set(utils::ToTensorProtoElementType<typename CPPType::key_type>(),
+                       MapTypeHelper::GetValueType<typename CPPType::mapped_type>()->GetTypeProto(),
+                       this->MutableTypeProto());
+  }
+};
+#endif
+
+/**
+ * \brief SequenceType. Use to register sequence for non-tensor types.
+ *
+ * \param T - CPP type that you wish to register as Sequence
+ *            runtime type.
+ *
+ * \details Usage: ORT_REGISTER_SEQ(C++Type)
+ *          The type is required to have value_type defined
+ */
+template <typename CPPType>
+class SequenceType : public NonTensorType<CPPType> {
+ public:
+  static MLDataType Type();
+
+  bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override {
+    return this->IsSequenceCompatible(type_proto);
+  }
+
+ private:
+  SequenceType() {
+    using namespace data_types_internal;
+    SequenceTypeHelper::Set(SequenceTypeHelper::GetElemType<typename CPPType::value_type>()->GetTypeProto(),
+                            this->MutableTypeProto());
+  }
+};
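(Aside: as the usage notes above say, registration keys off nested typedefs; a hedged sketch with illustrative aliases, using the ORT_REGISTER_MAP/ORT_REGISTER_SEQ macros defined later in this header.)

    // MapType requires key_type and mapped_type; SequenceType requires value_type.
    using MyMap = std::map<int64_t, float>;  // key_type = int64_t, mapped_type = float
    using MyFloatSeq = std::vector<float>;   // value_type = float
    ORT_REGISTER_MAP(MyMap);                 // enables DataTypeImpl::GetType<MyMap>()
    ORT_REGISTER_SEQ(MyFloatSeq);            // enables DataTypeImpl::GetType<MyFloatSeq>()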
+/**
+ * \brief SequenceTensorTypeBase serves as a base type class for
+ *        Tensor sequences. Akin to TensorTypeBase.
+ *        Runtime representation is always TensorSeq.
+ */
+class SequenceTensorTypeBase : public DataTypeImpl {
+ public:
+  static MLDataType Type();
+
+  bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override;
+
+  virtual MLDataType GetElementType() const {
+    // should never reach here.
+    ORT_NOT_IMPLEMENTED(__FUNCTION__, " is not implemented");
+  }
+
+  DeleteFunc GetDeleteFunc() const override;
+
+  const ONNX_NAMESPACE::TypeProto* GetTypeProto() const override;
+
+  SequenceTensorTypeBase(const SequenceTensorTypeBase&) = delete;
+  SequenceTensorTypeBase& operator=(const SequenceTensorTypeBase&) = delete;
+
+ protected:
+  SequenceTensorTypeBase();
+  ~SequenceTensorTypeBase();
+
+  ONNX_NAMESPACE::TypeProto& MutableTypeProto();
+
+ private:
+  struct Impl;
+  Impl* impl_;
+};
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(pop)
+#endif
+/**
+ * \brief SequenceTensorType. Use to register sequence for non-tensor types.
+ *
+ * \param CPPRuntime - We always use TensorSeq
+ *
+ * \param TensorElemType - one of the primitive types
+ *
+ * \details Usage: ORT_REGISTER_SEQ_TENSOR_TYPE()
+ *          The type is required to have value_type defined
+ */
+template <typename TensorElemType>
+class SequenceTensorType : public SequenceTensorTypeBase {
+ public:
+  static_assert(data_types_internal::IsTensorContainedType<TensorElemType>::value,
+                "Requires one of the tensor fundamental types");
+
+  static MLDataType Type();
+
+  /// Return a MLDataType representing the element-type
+  MLDataType GetElementType() const override {
+    return DataTypeImpl::GetType<TensorElemType>();
+  }
+
+ private:
+  SequenceTensorType() {
+    using namespace data_types_internal;
+    SequenceTypeHelper::Set(SequenceTypeHelper::GetElemType<TensorElemType>()->GetTypeProto(),
+                            MutableTypeProto());
+  }
+};
+
+/**
+ * \brief OpaqueType
+ *
+ * \tparam T - cpp runtime type that implements the Opaque type
+ *
+ * \tparam const char D[] - domain must be extern to be unique
+ *
+ * \tparam const char N[] - name must be extern to be unique
+ *
+ * \details Only one CPP type can be associated with a particular
+ *          OpaqueType registration
+ *
+ */
+template <typename T, const char D[], const char N[]>
+class OpaqueType : public NonTensorType<T> {
+ public:
+  static MLDataType Type();
+
+  bool IsCompatible(const ONNX_NAMESPACE::TypeProto& type_proto) const override {
+    return this->IsOpaqueCompatible(type_proto);
+  }
+
+  void FromDataContainer(const void* data, size_t data_size, OrtValue& output) const override {
+    NonTensorTypeConverter<T>::FromContainer(this, data, data_size, output);
+  }
+
+  void ToDataContainer(const OrtValue& input, size_t data_size, void* data) const override {
+    NonTensorTypeConverter<T>::ToContainer(input, data_size, data);
+  }
+
+ private:
+  OpaqueType() {
+    data_types_internal::AssignOpaqueDomainName(D, N, this->MutableTypeProto());
+  }
+};
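(Aside: a hedged sketch of an opaque registration; the domain/name constants and the state type are hypothetical.)

    // Domain and name must have external linkage so their pointers are unique.
    extern const char kMyDomain[] = "my.domain";
    extern const char kMyName[] = "KalmanFilterState";
    struct KalmanFilterState { /* opaque payload */ };
    // Later, in a .cc file: ORT_REGISTER_OPAQUE_TYPE(KalmanFilterState, kMyDomain, kMyName);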
+/**
+ * \brief PrimitiveDataTypeBase
+ *        Base class for primitive Tensor contained types
+ *
+ * \details This class contains an integer constant that can be
+ *          used for input data type dispatching. This class also stores the
+ *          number of subelements per size unit.
+ *          Example: For int4, the size unit is 1 byte and the number of subelements is 2.
+ *
+ */
+class PrimitiveDataTypeBase : public DataTypeImpl {
+ public:
+  bool IsCompatible(const ONNX_NAMESPACE::TypeProto&) const override {
+    return false;
+  }
+
+  const ONNX_NAMESPACE::TypeProto* GetTypeProto() const final {
+    return nullptr;
+  }
+
+  int32_t GetDataType() const {
+    return data_type_;
+  }
+
+  int32_t GetNumSubElems() const {
+    return num_sub_elems_;
+  }
+
+  bool HasSubElems() const {
+    return num_sub_elems_ > 1;
+  }
+
+ protected:
+  PrimitiveDataTypeBase(size_t size, int32_t data_type, int32_t num_sub_elems)
+      : DataTypeImpl{GeneralType::kPrimitive, size}, data_type_{data_type}, num_sub_elems_{num_sub_elems} {}
+
+ private:
+  const int32_t data_type_;
+  const int32_t num_sub_elems_;  // > 1 for subbyte primitives, 1 for normal primitives.
+};
+
+/**
+ * \brief PrimitiveDataType
+ *        Typed specialization for primitive types.
+ *        Concrete instances of this class are used by Tensor.
+ *
+ * \param T - primitive data type
+ *
+ */
+template <typename T>
+class PrimitiveDataType : public PrimitiveDataTypeBase {
+ private:
+  static void Delete(void* p) {
+    delete static_cast<T*>(p);
+  }
+
+ public:
+  static MLDataType Type();
+
+  DeleteFunc GetDeleteFunc() const override {
+    return &Delete;
+  }
+
+ private:
+  explicit PrimitiveDataType(int32_t num_sub_elems)
+      : PrimitiveDataTypeBase{sizeof(T),
+                              utils::ToTensorProtoElementType<T>(), num_sub_elems} {
+  }
+};
+
+inline const TensorTypeBase* DataTypeImpl::AsTensorType() const {
+  return IsTensorType() ? static_cast<const TensorTypeBase*>(this) : nullptr;
+}
+
+inline const SequenceTensorTypeBase* DataTypeImpl::AsSequenceTensorType() const {
+  return IsTensorSequenceType() ? static_cast<const SequenceTensorTypeBase*>(this) : nullptr;
+}
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+inline const SparseTensorTypeBase* DataTypeImpl::AsSparseTensorType() const {
+  return IsSparseTensorType() ? static_cast<const SparseTensorTypeBase*>(this) : nullptr;
+}
+#endif
+
+#if !defined(DISABLE_OPTIONAL_TYPE)
+inline const OptionalTypeBase* DataTypeImpl::AsOptionalType() const {
+  return IsOptionalType() ? static_cast<const OptionalTypeBase*>(this) : nullptr;
+}
+#endif
+
+inline const NonTensorTypeBase* DataTypeImpl::AsNonTensorType() const {
+  return IsNonTensorType() ? static_cast<const NonTensorTypeBase*>(this) : nullptr;
+}
+
+inline const PrimitiveDataTypeBase* DataTypeImpl::AsPrimitiveDataType() const {
+  return IsPrimitiveDataType() ? static_cast<const PrimitiveDataTypeBase*>(this) : nullptr;
+}
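(Aside: the sub-element bookkeeping above is what makes packed sub-byte types work; a hedged arithmetic sketch for Int4x2, where two 4-bit values share one 1-byte size unit.)

    size_t num_elements = 5;  // logical int4 element count
    MLDataType dt = DataTypeImpl::GetType<Int4x2>();
    const PrimitiveDataTypeBase* prim = dt->AsPrimitiveDataType();
    // GetNumSubElems() is 2 for Int4x2, so N logical elements need ceil(N / 2) bytes.
    size_t bytes = (num_elements + prim->GetNumSubElems() - 1) / prim->GetNumSubElems();  // 3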
+// Explicit specialization of base class template function
+// is only possible within the enclosing namespace scope,
+// thus a simple way to pre-instantiate a given template
+// at a registration time does not currently work and the macro
+// is needed.
+#define ORT_REGISTER_TENSOR_TYPE(ELEM_TYPE)             \
+  template <>                                           \
+  MLDataType TensorType<ELEM_TYPE>::Type() {            \
+    static TensorType<ELEM_TYPE> tensor_type;           \
+    return &tensor_type;                                 \
+  }                                                     \
+  template <>                                           \
+  MLDataType DataTypeImpl::GetTensorType<ELEM_TYPE>() { \
+    return TensorType<ELEM_TYPE>::Type();               \
+  }
+
+#if !defined(DISABLE_SPARSE_TENSORS)
+#define ORT_REGISTER_SPARSE_TENSOR_TYPE(ELEM_TYPE)            \
+  template <>                                                 \
+  MLDataType SparseTensorType<ELEM_TYPE>::Type() {            \
+    static SparseTensorType<ELEM_TYPE> tensor_type;           \
+    return &tensor_type;                                      \
+  }                                                           \
+  template <>                                                 \
+  MLDataType DataTypeImpl::GetSparseTensorType<ELEM_TYPE>() { \
+    return SparseTensorType<ELEM_TYPE>::Type();               \
+  }
+#endif
+
+#define ORT_REGISTER_OPTIONAL_TYPE(ORT_TYPE, TYPE)             \
+  template <>                                                  \
+  MLDataType OptionalType<ORT_TYPE, TYPE>::Type() {            \
+    static OptionalType<ORT_TYPE, TYPE> optional_type;         \
+    return &optional_type;                                     \
+  }                                                            \
+  template <>                                                  \
+  MLDataType DataTypeImpl::GetOptionalType<ORT_TYPE, TYPE>() { \
+    return OptionalType<ORT_TYPE, TYPE>::Type();               \
+  }
+
+#if !defined(DISABLE_ML_OPS)
+#define ORT_REGISTER_MAP(TYPE)               \
+  template <>                                \
+  MLDataType MapType<TYPE>::Type() {         \
+    static MapType<TYPE> map_type;           \
+    return &map_type;                        \
+  }                                          \
+  template <>                                \
+  MLDataType DataTypeImpl::GetType<TYPE>() { \
+    return MapType<TYPE>::Type();            \
+  }
+#endif
+
+#define ORT_REGISTER_SEQ(TYPE)               \
+  template <>                                \
+  MLDataType SequenceType<TYPE>::Type() {    \
+    static SequenceType<TYPE> sequence_type; \
+    return &sequence_type;                   \
+  }                                          \
+  template <>                                \
+  MLDataType DataTypeImpl::GetType<TYPE>() { \
+    return SequenceType<TYPE>::Type();       \
+  }
+
+#define ORT_REGISTER_SEQ_TENSOR_TYPE(ELEM_TYPE)                 \
+  template <>                                                   \
+  MLDataType SequenceTensorType<ELEM_TYPE>::Type() {            \
+    static SequenceTensorType<ELEM_TYPE> sequence_tensor_type;  \
+    return &sequence_tensor_type;                               \
+  }                                                             \
+  template <>                                                   \
+  MLDataType DataTypeImpl::GetSequenceTensorType<ELEM_TYPE>() { \
+    return SequenceTensorType<ELEM_TYPE>::Type();               \
+  }
+
+#define ORT_REGISTER_PRIM_TYPE(TYPE)                  \
+  template <>                                         \
+  MLDataType PrimitiveDataType<TYPE>::Type() {        \
+    static PrimitiveDataType<TYPE> prim_data_type(1); \
+    return &prim_data_type;                           \
+  }                                                   \
+  template <>                                         \
+  MLDataType DataTypeImpl::GetType<TYPE>() {          \
+    return PrimitiveDataType<TYPE>::Type();           \
+  }
+
+// Registers a subbyte primitive.
+// Examples:
+//   - Int4x2 stores 2 packed 4-bit elements in 1 byte: ORT_*_SUBBYTE_TYPE(Int4x2, 2)
+//   - [not supported] Int3x8 could store 8 packed 3-bit elements in 3 bytes: ORT_*_SUBBYTE_TYPE(Int3x8, 8)
+#define ORT_REGISTER_PRIM_SUBBYTE_TYPE(TYPE, NUM_SUB_ELEMS)       \
+  template <>                                                     \
+  MLDataType PrimitiveDataType<TYPE>::Type() {                    \
+    static PrimitiveDataType<TYPE> prim_data_type(NUM_SUB_ELEMS); \
+    return &prim_data_type;                                       \
+  }                                                               \
+  template <>                                                     \
+  MLDataType DataTypeImpl::GetType<TYPE>() {                      \
+    return PrimitiveDataType<TYPE>::Type();                       \
+  }
+
+#define ORT_REGISTER_OPAQUE_TYPE(CPPType, Domain, Name)   \
+  template <>                                             \
+  MLDataType OpaqueType<CPPType, Domain, Name>::Type() {  \
+    static OpaqueType<CPPType, Domain, Name> opaque_type; \
+    return &opaque_type;                                  \
+  }                                                       \
+  template <>                                             \
+  MLDataType DataTypeImpl::GetType<CPPType>() {           \
+    return OpaqueType<CPPType, Domain, Name>::Type();     \
+  }
+}  // namespace onnxruntime
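(Aside: for reference, substituting ELEM_TYPE = float into ORT_REGISTER_TENSOR_TYPE above yields the following, reformatted for readability.)

    template <>
    MLDataType TensorType<float>::Type() {
      static TensorType<float> tensor_type;  // function-local singleton
      return &tensor_type;
    }
    template <>
    MLDataType DataTypeImpl::GetTensorType<float>() {
      return TensorType<float>::Type();
    }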
diff --git a/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/data_types_internal.h b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/data_types_internal.h
new file mode 100644
index 00000000000000..05f4c10995ef2a
--- /dev/null
+++ b/third_party/onnxruntime_headers/src/include/onnxruntime/core/framework/data_types_internal.h
@@ -0,0 +1,712 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <cstdint>
+#include <algorithm>
+#include <array>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "boost/mp11.hpp"
+
+#include "core/common/common.h"
+#include "core/framework/to_tensor_proto_element_type.h"
+#ifndef SHARED_PROVIDER
+#include "core/common/type_list.h"
+#include "core/framework/data_types.h"
+#include "core/graph/onnx_protobuf.h"
+#endif
+
+namespace onnxruntime {
+namespace utils {
+
+// The following primitives are strongly recommended for switching on tensor input datatypes for
+// kernel implementations.
+//
+//  1) If you need to handle all of the primitive tensor contained datatypes, the best choice would be macros
+//     DispatchOnTensorType or DispatchOnTensorTypeWithReturn. Use inline wrappers so your function can be invoked as function<T>().
+//  2) If you have a few types, use Tensor.IsDataType<T>()/IsDataTypeString() or use utils::IsPrimitiveDataType<T>()
+//     if you have a standalone MLDataType with a sequence of if/else statements.
+//  3) For something in between, we suggest to use the CallDispatcher pattern.
+//
+// Invoking DataTypeImpl::GetType<T>() for switching on input types is discouraged and should be avoided.
+// Every primitive type carries with it an integer constant that can be used for quick switching on types.
+
+#if !defined(DISABLE_FLOAT8_TYPES)
+
+#define DispatchOnTensorType(tensor_type, function, ...)            \
+  switch (tensor_type->AsPrimitiveDataType()->GetDataType()) {      \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:                \
+      function<float>(__VA_ARGS__);                                 \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_BOOL:                 \
+      function<bool>(__VA_ARGS__);                                  \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:               \
+      function<double>(__VA_ARGS__);                                \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_STRING:               \
+      function<std::string>(__VA_ARGS__);                           \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8:                 \
+      function<int8_t>(__VA_ARGS__);                                \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:                \
+      function<uint8_t>(__VA_ARGS__);                               \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT16:                \
+      function<int16_t>(__VA_ARGS__);                               \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT16:               \
+      function<uint16_t>(__VA_ARGS__);                              \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32:                \
+      function<int32_t>(__VA_ARGS__);                               \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT32:               \
+      function<uint32_t>(__VA_ARGS__);                              \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:                \
+      function<int64_t>(__VA_ARGS__);                               \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT64:               \
+      function<uint64_t>(__VA_ARGS__);                              \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:              \
+      function<MLFloat16>(__VA_ARGS__);                             \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16:             \
+      function<BFloat16>(__VA_ARGS__);                              \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FN:         \
+      function<Float8E4M3FN>(__VA_ARGS__);                          \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FNUZ:       \
+      function<Float8E4M3FNUZ>(__VA_ARGS__);                        \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2:           \
+      function<Float8E5M2>(__VA_ARGS__);                            \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ:       \
+      function<Float8E5M2FNUZ>(__VA_ARGS__);                        \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:                 \
+      function<Int4x2>(__VA_ARGS__);                                \
+      break;                                                        \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                \
+      function<UInt4x2>(__VA_ARGS__);                               \
+      break;                                                        \
+    default:                                                        \
+      ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);   \
+  }
+#define DispatchOnTensorTypeWithReturn(tensor_type, retval, function, ...) \
+  switch (tensor_type->AsPrimitiveDataType()->GetDataType()) {             \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:                       \
+      retval = function<float>(__VA_ARGS__);                               \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_BOOL:                        \
+      retval = function<bool>(__VA_ARGS__);                                \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:                      \
+      retval = function<double>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_STRING:                      \
+      retval = function<std::string>(__VA_ARGS__);                         \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8:                        \
+      retval = function<int8_t>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:                       \
+      retval = function<uint8_t>(__VA_ARGS__);                             \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT16:                      \
+      retval = function<uint16_t>(__VA_ARGS__);                            \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT16:                       \
+      retval = function<int16_t>(__VA_ARGS__);                             \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32:                       \
+      retval = function<int32_t>(__VA_ARGS__);                             \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT32:                      \
+      retval = function<uint32_t>(__VA_ARGS__);                            \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:                       \
+      retval = function<int64_t>(__VA_ARGS__);                             \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT64:                      \
+      retval = function<uint64_t>(__VA_ARGS__);                            \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:                     \
+      retval = function<MLFloat16>(__VA_ARGS__);                           \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16:                    \
+      retval = function<BFloat16>(__VA_ARGS__);                            \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FN:                \
+      retval = function<Float8E4M3FN>(__VA_ARGS__);                        \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E4M3FNUZ:              \
+      retval = function<Float8E4M3FNUZ>(__VA_ARGS__);                      \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2:                  \
+      retval = function<Float8E5M2>(__VA_ARGS__);                          \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT8E5M2FNUZ:              \
+      retval = function<Float8E5M2FNUZ>(__VA_ARGS__);                      \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:                        \
+      retval = function<Int4x2>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
+      retval = function<UInt4x2>(__VA_ARGS__);                             \
+      break;                                                               \
+    default:                                                               \
+      ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
+  }
+
+#else
+
+#define DispatchOnTensorType(tensor_type, function, ...)           \
+  switch (tensor_type->AsPrimitiveDataType()->GetDataType()) {     \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:               \
+      function<float>(__VA_ARGS__);                                \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_BOOL:                \
+      function<bool>(__VA_ARGS__);                                 \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:              \
+      function<double>(__VA_ARGS__);                               \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_STRING:              \
+      function<std::string>(__VA_ARGS__);                          \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8:                \
+      function<int8_t>(__VA_ARGS__);                               \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:               \
+      function<uint8_t>(__VA_ARGS__);                              \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT16:               \
+      function<int16_t>(__VA_ARGS__);                              \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT16:              \
+      function<uint16_t>(__VA_ARGS__);                             \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32:               \
+      function<int32_t>(__VA_ARGS__);                              \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT32:              \
+      function<uint32_t>(__VA_ARGS__);                             \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:               \
+      function<int64_t>(__VA_ARGS__);                              \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT64:              \
+      function<uint64_t>(__VA_ARGS__);                             \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:             \
+      function<MLFloat16>(__VA_ARGS__);                            \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16:            \
+      function<BFloat16>(__VA_ARGS__);                             \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:                \
+      function<Int4x2>(__VA_ARGS__);                               \
+      break;                                                       \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:               \
+      function<UInt4x2>(__VA_ARGS__);                              \
+      break;                                                       \
+    default:                                                       \
+      ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);  \
+  }
+
+#define DispatchOnTensorTypeWithReturn(tensor_type, retval, function, ...) \
+  switch (tensor_type->AsPrimitiveDataType()->GetDataType()) {             \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:                       \
+      retval = function<float>(__VA_ARGS__);                               \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_BOOL:                        \
+      retval = function<bool>(__VA_ARGS__);                                \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE:                      \
+      retval = function<double>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_STRING:                      \
+      retval = function<std::string>(__VA_ARGS__);                         \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT8:                        \
+      retval = function<int8_t>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT8:                       \
+      retval = function<uint8_t>(__VA_ARGS__);                             \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT16:                      \
+      retval = function<uint16_t>(__VA_ARGS__);                            \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT16:                       \
+      retval = function<int16_t>(__VA_ARGS__);                             \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT32:                       \
+      retval = function<int32_t>(__VA_ARGS__);                             \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT32:                      \
+      retval = function<uint32_t>(__VA_ARGS__);                            \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT64:                       \
+      retval = function<int64_t>(__VA_ARGS__);                             \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT64:                      \
+      retval = function<uint64_t>(__VA_ARGS__);                            \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16:                     \
+      retval = function<MLFloat16>(__VA_ARGS__);                           \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16:                    \
+      retval = function<BFloat16>(__VA_ARGS__);                            \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_INT4:                        \
+      retval = function<Int4x2>(__VA_ARGS__);                              \
+      break;                                                               \
+    case ONNX_NAMESPACE::TensorProto_DataType_UINT4:                       \
+      retval = function<UInt4x2>(__VA_ARGS__);                             \
+      break;                                                               \
+    default:                                                               \
+      ORT_ENFORCE(false, "Unknown tensor type of ", tensor_type);          \
+  }
+
+#endif
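(Aside: a hedged usage sketch for the dispatch macros above. FillZero is a hypothetical function template; the macro instantiates it as function<T>(...) for the single case matching the runtime element type. Assumes <algorithm> and the Tensor span accessor are available.)

    template <typename T>
    void FillZero(Tensor& t) {
      auto span = t.MutableDataAsSpan<T>();
      std::fill(span.begin(), span.end(), T{});
    }

    void FillZeroAnyType(Tensor& t) {
      MLDataType dt = t.DataType();
      DispatchOnTensorType(dt, FillZero, t);  // selects the FillZero<T> matching dt at runtime
    }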
+
+////////////////////////////////////////////////////////////////////////////////
+/// Use the following primitives if you have a few types to switch on so you
+//  can write a short sequence of if/else statements.
+
+// This is a frequently used check so we make a separate utility function.
+inline bool IsDataTypeString(MLDataType dt_type) {
+  auto prim_type = dt_type->AsPrimitiveDataType();
+  return (prim_type != nullptr && prim_type->GetDataType() == ONNX_NAMESPACE::TensorProto_DataType_STRING);
+}
+
+// Test if MLDataType is a concrete type of PrimitiveDataTypeBase
+// and it is T
+template <class T>
+inline bool IsPrimitiveDataType(MLDataType dt_type) {
+  auto prim_type = dt_type->AsPrimitiveDataType();
+  return (prim_type != nullptr && prim_type->GetDataType() == ToTensorProtoElementType<T>());
+}
+
+// Use after AsPrimitiveDataType() is successful
+// Check if PrimitiveDataTypeBase is of type T
+template <class T>
+inline bool IsPrimitiveDataType(const PrimitiveDataTypeBase* prim_type) {
+  assert(prim_type != nullptr);
+  return prim_type->GetDataType() == ToTensorProtoElementType<T>();
+}
+
+// This implementation contains a workaround for GCC bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47226
+// GCC until very recently did not support template parameter pack expansion within a lambda context.
+namespace mltype_dispatcher_internal {
+
+// T - type handled by this helper
+class CallableDispatchableHelper {
+  int32_t dt_type_;  // Type currently dispatched
+  size_t called_;
+
+ public:
+  explicit CallableDispatchableHelper(int32_t dt_type) noexcept : dt_type_(dt_type), called_(0) {}
+
+  // Must return integer to be in an expandable context
+  template <class T, class Fn, class... Args>
+  int Invoke(Fn&& fn, Args&&... args) {
+    if (utils::ToTensorProtoElementType<T>() == dt_type_) {
+      std::forward<Fn>(fn)(std::forward<Args>(args)...);
+      ++called_;
+    }
+    return 0;
+  }
+
+  void CheckCalledOnce() const {
+    ORT_ENFORCE(called_ == 1, "Unsupported data type: ", dt_type_);
+  }
+};
+
+// Default policy is to throw an exception.
+// Other policies may set the second result argument accordingly.
+template <class Ret>
+struct UnsupportedTypeDefaultPolicy {
+  void operator()(int32_t dt_type, Ret& /*result*/) const {
+    ORT_THROW("Unsupported data type: ", dt_type);
+  }
+};
+
+// Helper with the result type
+template <class Ret, class UnsupportedPolicy>
+class CallableDispatchableRetHelper {
+  int32_t dt_type_;  // Type currently dispatched
+  size_t called_;
+  Ret result_;
+
+ public:
+  explicit CallableDispatchableRetHelper(int32_t dt_type) noexcept : dt_type_(dt_type), called_(0), result_() {}
+
+  Ret Get() {
+    // No type was invoked
+    if (called_ == 0) {
+      UnsupportedPolicy()(dt_type_, result_);
+    }
+    return result_;
+  }
+
+  // Must return integer to be in an expandable context
+  template <class T, class Fn, class... Args>
+  int Invoke(Fn&& fn, Args&&... args) {
+    if (utils::ToTensorProtoElementType<T>() == dt_type_) {
+      result_ = std::forward<Fn>(fn)(std::forward<Args>(args)...);
+      ++called_;
+    }
+    return 0;
+  }
+};
+
+template <typename T>
+using TensorProtoElementTypeConstant =
+    std::integral_constant<ONNX_NAMESPACE::TensorProto_DataType, ToTensorProtoElementType<T>()>;
+
+using UndefinedTensorProtoElementTypeConstant =
+    std::integral_constant<ONNX_NAMESPACE::TensorProto_DataType, ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED>;
+
+}  // namespace mltype_dispatcher_internal
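(Aside: these helpers exist so that one Invoke<T> per supported type can be expanded inside a brace-initialized array, sidestepping the GCC lambda/pack-expansion bug cited above. Roughly how a dispatcher uses them; this is an illustrative sketch, not the literal implementation.)

    template <typename... Types, typename Fn, typename... Args>
    void DispatchSketch(int32_t dt_type, Fn fn, Args&... args) {
      mltype_dispatcher_internal::CallableDispatchableHelper helper(dt_type);
      // One Invoke<T> per supported type; only the T matching dt_type calls fn.
      int expander[] = {0, helper.Invoke<Types>(fn, args...)...};
      (void)expander;
      helper.CheckCalledOnce();  // enforce exactly one match
    }
    // Usage: DispatchSketch<float, double>(dt, MyFn{});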
+/**
+ * This class helps to efficiently dispatch calls to implementation function
+ * objects with a tensor element type template argument.
+ *
+ * The constructor accepts a value corresponding to a tensor element type.
+ * For example, it can be obtained from:
+ *   input_tensor->GetElementType()
+ *
+ * The Invoke member functions will instantiate and invoke the provided
+ * function object template, Fn. Fn must be default constructible. Fn must also
+ * have a tensor element type template argument. This type template argument
+ * will be the type that corresponds to the value given in the constructor.
+ * These functions accept and forward arbitrary function arguments. They ensure
+ * that Fn is called once with the type specified in the constructor.
+ *
+ * @tparam Types The types supported by the implementation. This should be a
+ *         set of ONNX tensor element types that are supported by ORT.
+ */
+template <typename... Types>
+class MLTypeCallDispatcher {
+  using SupportedTypeList = TypeList<Types...>;
+  using SupportedTensorProtoElementTypeList =
+      boost::mp11::mp_transform<
+          mltype_dispatcher_internal::TensorProtoElementTypeConstant, SupportedTypeList>;
+
+  static_assert(
+      boost::mp11::mp_and<
+          boost::mp11::mp_is_set<SupportedTensorProtoElementTypeList>,
+          boost::mp11::mp_not<
+              boost::mp11::mp_set_contains<
+                  SupportedTensorProtoElementTypeList,
+                  mltype_dispatcher_internal::UndefinedTensorProtoElementTypeConstant>>>::value,
+      "Types must map to a unique set of ONNX tensor element data types supported by ORT.");
+
+  int32_t dt_type_;
+
+ public:
+  /**
+   * Constructor.
+   * @param dt_type The value corresponding to the tensor element type to be
+   *        dispatched to. This can be obtained from
+   *        input_tensor->GetElementType() or
+   *        utils::ToTensorProtoElementType<T>().
+   */
+  explicit MLTypeCallDispatcher(int32_t dt_type) noexcept : dt_type_(dt_type) {}
+
+  /**
+   * Invokes Fn<T> with the specified arguments.
+   *
+   * @tparam Fn The function object template.
+   * @tparam Args The argument types.
+   */
+  template <template <typename...> class Fn, typename... Args>