From 89ad9186e831a6ae765583242ca065da6ce3330e Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 27 May 2024 04:35:00 +0000
Subject: [PATCH 01/79] torch wip

---
 python/ark/data_type.py  | 33 +++++++++++++++++++++++----------
 python/ark/torch_mock.py | 11 +++++++++++
 2 files changed, 34 insertions(+), 10 deletions(-)
 create mode 100644 python/ark/torch_mock.py

diff --git a/python/ark/data_type.py b/python/ark/data_type.py
index fe95d0d88..de64c1d7d 100644
--- a/python/ark/data_type.py
+++ b/python/ark/data_type.py
@@ -3,26 +3,29 @@
 
 import numpy
 from . import _ark_core
-
+try:
+    import torch
+except ImportError:
+    from . import torch_mock as torch
 
 _REGISTRY_DATA_TYPE = {
-    "fp32": {"np": numpy.float32},
-    "fp16": {"np": numpy.float16},
-    "bf16": {"np": None},
-    "int32": {"np": numpy.int32},
-    "uint32": {"np": numpy.uint32},
-    "int8": {"np": numpy.int8},
-    "uint8": {"np": numpy.uint8},
-    "byte": {"np": numpy.ubyte},
+    "fp32": {"np": numpy.float32, "torch": torch.float32},
+    "fp16": {"np": numpy.float16, "torch": torch.float16},
+    "bf16": {"np": None, "torch": torch.bfloat16},
+    "int32": {"np": numpy.int32, "torch": torch.int32},
+    "uint32": {"np": numpy.uint32, "torch": None},
+    "int8": {"np": numpy.int8, "torch": torch.int8},
+    "uint8": {"np": numpy.uint8, "torch": torch.uint8},
+    "byte": {"np": numpy.ubyte, "torch": torch.uint8},
 }
 
-
 class MetaDataType(type):
     def __new__(cls, name, bases, attrs):
         new_class = super().__new__(cls, name, bases, attrs)
         if name in _REGISTRY_DATA_TYPE:
             reg = _REGISTRY_DATA_TYPE[name]
             new_class.to_numpy = staticmethod(lambda: reg["np"])
+            new_class.to_torch = staticmethod(lambda: reg["torch"])
             new_class.ctype = staticmethod(
                 lambda: getattr(_ark_core, name.upper())
             )
@@ -104,6 +107,16 @@ def to_numpy() -> numpy.dtype:
         """
         ...
 
+    @staticmethod
+    def to_torch() -> torch.dtype:
+        """
+        Return the corresponding torch data type.
+
+        Returns:
+            torch.dtype: The corresponding torch data type.
+        """
+        ...
+
     @staticmethod
     def ctype() -> _ark_core._DataType:
         """
diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py
new file mode 100644
index 000000000..e58a3eda8
--- /dev/null
+++ b/python/ark/torch_mock.py
@@ -0,0 +1,11 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+class dtype: ...
+class float32: ...
+class float16: ...
+class bfloat16: ...
+class int32: ...
+class int8: ...
+class uint8: ...
+class ubyte: ...

From ab1998ecef18116bd92f4ea91b14c69becc66655 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 26 May 2024 21:43:10 -0700
Subject: [PATCH 02/79] Update ut-cuda.yml

---
 .github/workflows/ut-cuda.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml
index e938ca877..5a78818ff 100644
--- a/.github/workflows/ut-cuda.yml
+++ b/.github/workflows/ut-cuda.yml
@@ -7,6 +7,8 @@ on:
   pull_request:
     branches:
       - main
+    types:
+      - ready_for_review
 
 jobs:
   UnitTest:

From ece4f553f62dc2da591321be3f7d5e34bff2c80d Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 27 May 2024 07:24:41 +0000
Subject: [PATCH 03/79] torch wip

---
 python/ark/data_type.py  |  2 ++
 python/ark/module.py     | 33 ++++++++++++++++++++++++++++-----
 python/ark/tensor.py     | 35 +++++++++++++++++++++++++++++++++++
 python/ark/torch_mock.py | 18 ++++++++++++++++++
 4 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/python/ark/data_type.py b/python/ark/data_type.py
index de64c1d7d..f5ccd9e5b 100644
--- a/python/ark/data_type.py
+++ b/python/ark/data_type.py
@@ -3,6 +3,7 @@
 
 import numpy
 from . import _ark_core
+
 try:
     import torch
 except ImportError:
@@ -19,6 +20,7 @@
     "byte": {"np": numpy.ubyte, "torch": torch.uint8},
 }
 
+
 class MetaDataType(type):
     def __new__(cls, name, bases, attrs):
         new_class = super().__new__(cls, name, bases, attrs)
diff --git a/python/ark/module.py b/python/ark/module.py
index 62b941281..459beeda6 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -3,9 +3,14 @@
 
 import logging
 import numpy as np
-from typing import Any, Dict
+from typing import Any, Dict, Union
 from .tensor import Parameter
 
+try:
+    import torch
+except ImportError:
+    from . import torch_mock as torch
+
 
 class Module:
     """
@@ -57,7 +62,9 @@ def params_dict(self, prefix="") -> Dict[str, Parameter]:
         return params_dict
 
     def load_state_dict(
-        self, state_dict: Dict[str, np.ndarray], prefix: str = ""
+        self,
+        state_dict: Dict[str, Union[np.ndarray, torch.Tensor]],
+        prefix: str = "",
     ):
         """
         Loads a model from a state_dict and copy the parameters to the device GPU.
@@ -68,20 +75,36 @@ def load_state_dict(
         all_keys = set(state_dict.keys())
         pd = self.params_dict(prefix)
         for name, param in pd.items():
-            param.from_numpy(state_dict[name])
+            data = state_dict.get(name, None)
+            if isinstance(data, np.ndarray):
+                param.from_numpy(data)
+            elif isinstance(data, torch.Tensor):
+                param.from_torch(data)
+            else:
+                continue
             all_keys.remove(name)
         if all_keys:
             logging.warning(
                 f"{len(all_keys)} unused parameter(s) in state_dict"
             )
 
-    def state_dict(self, prefix: str = "") -> Dict[str, np.ndarray]:
+    def state_dict(
+        self, prefix: str = "", mode: str = "numpy"
+    ) -> Dict[str, Union[np.ndarray, torch.Tensor]]:
         """
         Copies the parameters from the device GPU to the host and saves the
         model to a state_dict.
         Must be called after the executor is launched.
         """
-        return {k: v.to_numpy() for k, v in self.params_dict(prefix).items()}
+        if mode == "numpy":
+            return {
+                k: v.to_numpy() for k, v in self.params_dict(prefix).items()
+            }
+        elif mode == "torch":
+            return {
+                k: v.to_torch() for k, v in self.params_dict(prefix).items()
+            }
+        raise ValueError(f"Unsupported mode: {mode}")
 
     def forward(self, *args: Any, **kwargs: Any) -> Any: ...
 
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 316d18566..625f82bce 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -8,6 +8,15 @@
 from .data_type import DataType
 from .runtime import Runtime
 
+try:
+    import torch
+
+    _no_torch = False
+except ImportError:
+    from . import torch_mock as torch
+
+    _no_torch = True
+
 NullTensor = _NullTensor
 
 
@@ -89,6 +98,32 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
         rt.executor.tensor_write(self._tensor, ndarray)
         return self
 
+    def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor:
+        """ """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        torch_type = self.dtype().to_torch()
+        if tensor is None:
+            return torch.from_numpy(self.to_numpy())
+        elif tensor.shape != self.shape():
+            raise ValueError("torch tensor shape does not match the tensor")
+        elif tensor.dtype != torch_type:
+            raise ValueError("torch tensor dtype does not match the tensor")
+        elif not tensor.is_contiguous():
+            raise ValueError("torch tensor is not contiguous in memory")
+        elif tensor.numel() != self.nelems():
+            raise ValueError("torch tensor size does not match the tensor")
+        tensor.copy_(torch.from_numpy(self.to_numpy()))
+        return tensor
+
+    def from_torch(self, tensor: torch.Tensor) -> "Tensor":
+        """ """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        if tensor.is_cuda:
+            tensor = tensor.cpu()
+        return self.from_numpy(tensor.numpy())
+
 
 class Parameter(Tensor):
     """
diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py
index e58a3eda8..68333e431 100644
--- a/python/ark/torch_mock.py
+++ b/python/ark/torch_mock.py
@@ -1,11 +1,29 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+
 class dtype: ...
+
+
 class float32: ...
+
+
 class float16: ...
+
+
 class bfloat16: ...
+
+
 class int32: ...
+
+
 class int8: ...
+
+
 class uint8: ...
+
+
 class ubyte: ...
+
+
+class Tensor: ...

From 952b7610c31288cc8851aa6466461f2ba7a2393f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 27 May 2024 23:14:40 +0000
Subject: [PATCH 04/79] runtime module

---
 ark/api/planner.cpp                 |  4 +-
 examples/tutorial/torch_tutorial.py | 23 ++++++++
 python/ark/__init__.py              |  2 +-
 python/ark/data_type.py             | 22 +++++++
 python/ark/module.py                | 71 +++++++++++++++++++++-
 python/ark/tensor.py                | 91 ++++++++++++++++++++---------
 6 files changed, 181 insertions(+), 32 deletions(-)
 create mode 100644 examples/tutorial/torch_tutorial.py

diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index ad5048c0e..5c9d09f2e 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -56,8 +56,8 @@ static void check_config_field(const ModelOpRef op, const Json &config,
 std::string DefaultPlanner::Impl::plan(bool pretty) const {
     const auto gpu_info = GpuManager::get_instance(gpu_id_)->info();
     size_t num_sm = gpu_info.num_sm;
-    Json task_infos;
-    Json processor_groups;
+    Json task_infos = Json::array();
+    Json processor_groups = Json::array();
     size_t max_num_warps = 1;
     size_t max_num_processors = 1;
     size_t next_node_id = 0;
diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py
new file mode 100644
index 000000000..5677d41cd
--- /dev/null
+++ b/examples/tutorial/torch_tutorial.py
@@ -0,0 +1,23 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import ark
+import torch
+
+
+class ArkAddModule(ark.RuntimeModule):
+    def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor:
+        return ark.add(x, y)
+
+# ARK module for addition
+module = ArkAddModule()
+
+# Define two torch arrays
+x = torch.ones(64) * 2
+y = torch.ones(64) * 3
+
+# Run the ARK module
+z = module(x, y)
+
+# Print the result
+print(z)
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index 92e9c39c3..2a4d164e4 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -38,7 +38,7 @@ def set_world_size(world_size):
 
 from .init import init
 from .tensor import Dims, Tensor, Parameter
-from .module import Module
+from .module import Module, RuntimeModule
 from .runtime import Runtime, DefaultPlanner
 from .serialize import save, load
 from .data_type import (
diff --git a/python/ark/data_type.py b/python/ark/data_type.py
index f5ccd9e5b..8ab982106 100644
--- a/python/ark/data_type.py
+++ b/python/ark/data_type.py
@@ -64,6 +64,28 @@ def from_numpy(np_type: numpy.dtype) -> "DataType":
             f" to ark data type."
         )
 
+    @staticmethod
+    def from_torch(torch_type: torch.dtype) -> "DataType":
+        """
+        Return the corresponding ark data type.
+
+        Parameters:
+            torch_type (torch.dtype): The torch data type.
+
+        Returns:
+            DataType: The corresponding ark data type.
+
+        Raises:
+            ValueError: If there is no defined conversion from torch data type to ark data type.
+        """
+        for type_name, reg in _REGISTRY_DATA_TYPE.items():
+            if reg["torch"] == torch_type:
+                return DataType.from_name(type_name)
+        raise ValueError(
+            f"Undefined conversion from torch data type {torch_type}"
+            f" to ark data type."
+        )
+
     @staticmethod
     def from_name(type_name: str) -> "DataType":
         """
diff --git a/python/ark/module.py b/python/ark/module.py
index 459beeda6..b7919d2cd 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -3,14 +3,19 @@
 
 import logging
 import numpy as np
-from typing import Any, Dict, Union
-from .tensor import Parameter
+from typing import Any, Dict, List, Union
+from .tensor import Tensor, Parameter
+from .runtime import Runtime, DefaultPlanner
 
 try:
     import torch
+
+    _no_torch = False
 except ImportError:
     from . import torch_mock as torch
 
+    _no_torch = True
+
 
 class Module:
     """
@@ -109,3 +114,65 @@ def state_dict(
     def forward(self, *args: Any, **kwargs: Any) -> Any: ...
 
     def backward(self, *args: Any, **kwargs: Any) -> Any: ...
+
+    def initialize(self):
+        for param in self.parameters.values():
+            param.initialize()
+        for module in self.sub_modules.values():
+            module.initialize()
+
+
+def _recursive_ark_to_torch(object):
+    if isinstance(object, Tensor):
+        return object.to_torch()
+    if isinstance(object, dict):
+        return {k: _recursive_ark_to_torch(v) for k, v in object.items()}
+    if isinstance(object, list):
+        return [_recursive_ark_to_torch(v) for v in object]
+    return object
+
+
+class RuntimeModule(Module):
+    def __init__(self):
+        if _no_torch:
+            raise ImportError("torch is not available")
+        super().__init__()
+        self.built_forward = False
+        self.built_backward = False
+        self.forward_input_tensor_args: List[Tensor] = []
+        self.forward_input_tensor_kwargs: Dict[str, Tensor] = {}
+        self.forward_output = None
+        self.backward_tensor_args = []
+        self.backward_tensor_kwargs = {}
+
+    def build_forward(self, *args: Any, **kwargs: Any) -> Any: ...
+
+    def build_backward(self, *args: Any, **kwargs: Any) -> Any: ...
+
+    def forward(self, *args: Any, **kwargs: Any) -> Any:
+        if not self.built_forward:
+            for arg in args:
+                if isinstance(arg, torch.Tensor):
+                    self.forward_input_tensor_args.append(
+                        Tensor.from_torch(arg)
+                    )
+            for key, value in kwargs.items():
+                if isinstance(value, torch.Tensor):
+                    self.forward_input_tensor_kwargs[key] = Tensor.from_torch(
+                        value
+                    )
+            self.forward_output = self.build_forward(
+                *self.forward_input_tensor_args,
+                **self.forward_input_tensor_kwargs,
+            )
+            self.built_forward = True
+
+        with Runtime.get_runtime() as rt:
+            rt.launch(plan=DefaultPlanner().plan())
+            for arg in self.forward_input_tensor_args:
+                arg.initialize()
+            for value in self.forward_input_tensor_kwargs.values():
+                value.initialize()
+
+            rt.run()
+            return _recursive_ark_to_torch(self.forward_output)
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 625f82bce..f264bb440 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -2,11 +2,12 @@
 # Licensed under the MIT license.
 
 import numpy as np
-from typing import List
+from typing import Callable, List, Union, Type
 
 from _ark_core import _Dims, _Tensor, _NullTensor
 from .data_type import DataType
 from .runtime import Runtime
+from .model import Model
 
 try:
     import torch
@@ -24,14 +25,19 @@ class Dims(_Dims):
     pass
 
 
+Initializer = Type[Callable[[], Union[torch.Tensor, np.ndarray]]]
+
+
 class Tensor:
-    def __init__(self, _tensor: _Tensor):
+    def __init__(self, _tensor: _Tensor, initializer: Initializer = None):
         """
         Initializes a new instance of the Tensor class.
         Args:
             _tensor (_ark_core._Tensor): The underlying _Tensor object.
         """
         self._tensor = _tensor
+        self.initializer: Initializer = initializer
+        Model.get_model().add_tensor(self)
 
     def shape(self) -> List[int]:
         """
@@ -80,24 +86,6 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray:
         rt.executor.tensor_read(self._tensor, ndarray)
         return ndarray
 
-    def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
-        """
-        Copies the tensor from a host numpy array to the device.
-        """
-        rt = Runtime.get_runtime()
-        if not rt.launched():
-            raise RuntimeError(
-                "Tensor is not allocated yet. `Tensor.from_numpy()` is "
-                "usable only after you call `Runtime.launch()`."
-            )
-        ndarray = ndarray.astype(self.dtype().to_numpy())
-        if not ndarray.flags["C_CONTIGUOUS"]:
-            ndarray = np.ascontiguousarray(ndarray)
-        if ndarray.nbytes != self.nelems() * self.dtype().element_size():
-            raise ValueError("ndarray size does not match the tensor")
-        rt.executor.tensor_write(self._tensor, ndarray)
-        return self
-
     def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor:
         """ """
         if _no_torch:
@@ -116,13 +104,62 @@ def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor:
         tensor.copy_(torch.from_numpy(self.to_numpy()))
         return tensor
 
-    def from_torch(self, tensor: torch.Tensor) -> "Tensor":
-        """ """
-        if _no_torch:
-            raise ImportError("torch is not available")
-        if tensor.is_cuda:
-            tensor = tensor.cpu()
-        return self.from_numpy(tensor.numpy())
+    @staticmethod
+    def from_numpy(ndarray: np.ndarray):
+        return Tensor(
+            Model.get_model().tensor(
+                Dims(list(ndarray.shape)),
+                DataType.from_numpy(ndarray.dtype).ctype(),
+                Dims(),
+                Dims(),
+                Dims(),
+                "",
+            ),
+            lambda: ndarray,
+        )
+
+    @staticmethod
+    def from_torch(tensor: torch.Tensor):
+        return Tensor(
+            Model.get_model().tensor(
+                Dims(list(tensor.shape)),
+                DataType.from_torch(tensor.dtype).ctype(),
+                Dims(),
+                Dims(),
+                Dims(),
+                "",
+            ),
+            lambda: tensor,
+        )
+
+    def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
+        """
+        Copies the tensor from a host numpy array to the device.
+        """
+        rt = Runtime.get_runtime()
+        if not rt.launched():
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.from_numpy()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
+        if isinstance(data, torch.Tensor):
+            data = data.cpu().numpy()
+        data = data.astype(self.dtype().to_numpy())
+        if not data.flags["C_CONTIGUOUS"]:
+            data = np.ascontiguousarray(data)
+        if data.nbytes != self.nelems() * self.dtype().element_size():
+            raise ValueError("data size does not match the tensor")
+        rt.executor.tensor_write(self._tensor, data)
+        return self
+
+    def initialize(self) -> "Tensor":
+        """
+        Initializes the tensor.
+        """
+        if self.initializer is not None:
+            data = self.initializer()
+            self.copy(data)
+        return self
 
 
 class Parameter(Tensor):

From a40926812f7b02f02e1e48a981c65e21c4dadfaa Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 27 May 2024 23:20:44 +0000
Subject: [PATCH 05/79] fix

---
 python/ark/tensor.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index f264bb440..5168791a8 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -37,7 +37,6 @@ def __init__(self, _tensor: _Tensor, initializer: Initializer = None):
         """
         self._tensor = _tensor
         self.initializer: Initializer = initializer
-        Model.get_model().add_tensor(self)
 
     def shape(self) -> List[int]:
         """

From 8e4622707b34cd4a71579bd65d7ba484e2424969 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 27 May 2024 23:52:16 +0000
Subject: [PATCH 06/79] fix

---
 ark/include/kernels/kernel_template.in |  5 ++++-
 examples/tutorial/torch_tutorial.py    |  6 +++++-
 python/ark/module.py                   | 20 +++++++++++++-------
 3 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in
index bc842ea4a..5bba320a5 100644
--- a/ark/include/kernels/kernel_template.in
+++ b/ark/include/kernels/kernel_template.in
@@ -59,9 +59,12 @@ void @NAME@(int *_iter) {
       sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
       ark_loop_body(_buf, _i);
     }
+    if (threadIdx.x == 0) {
+      __threadfence_system();
+    }
+    sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
     if (threadIdx.x == 0 && blockIdx.x == 0) {
       atomicStoreRelaxed(_iter, 0);
     }
-    sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
   }
 }
diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py
index 5677d41cd..e9482a7cc 100644
--- a/examples/tutorial/torch_tutorial.py
+++ b/examples/tutorial/torch_tutorial.py
@@ -9,6 +9,7 @@ class ArkAddModule(ark.RuntimeModule):
     def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor:
         return ark.add(x, y)
 
+
 # ARK module for addition
 module = ArkAddModule()
 
@@ -19,5 +20,8 @@ def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor:
 # Run the ARK module
 z = module(x, y)
 
+w = module(x, z)
+
 # Print the result
-print(z)
+print(z)  # 5
+print(w)  # 7
diff --git a/python/ark/module.py b/python/ark/module.py
index b7919d2cd..a266f522d 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -6,6 +6,8 @@
 from typing import Any, Dict, List, Union
 from .tensor import Tensor, Parameter
 from .runtime import Runtime, DefaultPlanner
+from .ops import tensor
+from .data_type import DataType
 
 try:
     import torch
@@ -154,12 +156,16 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
             for arg in args:
                 if isinstance(arg, torch.Tensor):
                     self.forward_input_tensor_args.append(
-                        Tensor.from_torch(arg)
+                        tensor(
+                            list(arg.shape),
+                            DataType.from_torch(arg.dtype),
+                        )
                     )
             for key, value in kwargs.items():
                 if isinstance(value, torch.Tensor):
-                    self.forward_input_tensor_kwargs[key] = Tensor.from_torch(
-                        value
+                    self.forward_input_tensor_kwargs[key] = tensor(
+                        list(value.shape),
+                        DataType.from_torch(value.dtype),
                     )
             self.forward_output = self.build_forward(
                 *self.forward_input_tensor_args,
@@ -169,10 +175,10 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
 
         with Runtime.get_runtime() as rt:
             rt.launch(plan=DefaultPlanner().plan())
-            for arg in self.forward_input_tensor_args:
-                arg.initialize()
-            for value in self.forward_input_tensor_kwargs.values():
-                value.initialize()
+            for tns, arg in zip(self.forward_input_tensor_args, args):
+                tns.copy(arg)
+            for key, value in self.forward_input_tensor_kwargs.items():
+                value.copy(kwargs[key])
 
             rt.run()
             return _recursive_ark_to_torch(self.forward_output)

From eee7ec2b4bb1cde335e99d780657c70e497542c9 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 28 May 2024 19:00:09 +0000
Subject: [PATCH 07/79] some fixes

---
 python/ark/module.py   | 23 ++++++++++++++++-------
 python/ark/tensor.py   | 28 +++++++++++++++++++++-------
 python/executor_py.cpp | 15 ++++++++++++++-
 3 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/python/ark/module.py b/python/ark/module.py
index a266f522d..faeeea40d 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -83,12 +83,9 @@ def load_state_dict(
         pd = self.params_dict(prefix)
         for name, param in pd.items():
             data = state_dict.get(name, None)
-            if isinstance(data, np.ndarray):
-                param.from_numpy(data)
-            elif isinstance(data, torch.Tensor):
-                param.from_torch(data)
-            else:
+            if data is None:
                 continue
+            param.copy(data)
             all_keys.remove(name)
         if all_keys:
             logging.warning(
@@ -143,6 +140,8 @@ def __init__(self):
         self.built_backward = False
         self.forward_input_tensor_args: List[Tensor] = []
         self.forward_input_tensor_kwargs: Dict[str, Tensor] = {}
+        self.forward_input_args = []
+        self.forward_input_kwargs = {}
         self.forward_output = None
         self.backward_tensor_args = []
         self.backward_tensor_kwargs = {}
@@ -161,15 +160,25 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
                             DataType.from_torch(arg.dtype),
                         )
                     )
+                    self.forward_input_args.append(
+                        self.forward_input_tensor_args[-1]
+                    )
+                else:
+                    self.forward_input_args.append(arg)
             for key, value in kwargs.items():
                 if isinstance(value, torch.Tensor):
                     self.forward_input_tensor_kwargs[key] = tensor(
                         list(value.shape),
                         DataType.from_torch(value.dtype),
                     )
+                    self.forward_input_kwargs[key] = (
+                        self.forward_input_tensor_kwargs[key]
+                    )
+                else:
+                    self.forward_input_kwargs[key] = value
             self.forward_output = self.build_forward(
-                *self.forward_input_tensor_args,
-                **self.forward_input_tensor_kwargs,
+                *self.forward_input_args,
+                **self.forward_input_kwargs,
             )
             self.built_forward = True
 
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 5168791a8..a567264d5 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -142,13 +142,27 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
                 "usable only after you call `Runtime.launch()`."
             )
         if isinstance(data, torch.Tensor):
-            data = data.cpu().numpy()
-        data = data.astype(self.dtype().to_numpy())
-        if not data.flags["C_CONTIGUOUS"]:
-            data = np.ascontiguousarray(data)
-        if data.nbytes != self.nelems() * self.dtype().element_size():
-            raise ValueError("data size does not match the tensor")
-        rt.executor.tensor_write(self._tensor, data)
+            if data.dtype != self.dtype().to_torch():
+                raise ValueError("data dtype does not match the tensor")
+            if not data.is_contiguous():
+                data = data.contiguous()
+            if data.numel() != self.nelems():
+                raise ValueError("data size does not match the tensor")
+            rt.executor.tensor_write(
+                self._tensor,
+                data.data_ptr(),
+                data.numel() * data.element_size(),
+            )
+        elif isinstance(data, np.ndarray):
+            if data.dtype != self.dtype().to_numpy():
+                raise ValueError("data dtype does not match the tensor")
+            if not data.flags["C_CONTIGUOUS"]:
+                data = np.ascontiguousarray(data)
+            if data.nbytes != self.nelems() * self.dtype().element_size():
+                raise ValueError("data size does not match the tensor")
+            rt.executor.tensor_write(self._tensor, data)
+        else:
+            raise ValueError("data must be a numpy array or a torch tensor")
         return self
 
     def initialize(self) -> "Tensor":
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index dc2840329..13a81608e 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -17,6 +17,11 @@ static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
                       info.size * info.itemsize);
 }
 
+static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
+                         size_t host_address, size_t bytes) {
+    exe->tensor_write(tensor, reinterpret_cast<void *>(host_address), bytes);
+}
+
 static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
                         py::buffer host_buffer) {
     py::buffer_info info = host_buffer.request();
@@ -39,5 +44,13 @@ void register_executor(py::module &m) {
         .def("destroy", &ark::Executor::destroy)
         .def("destroyed", &ark::Executor::destroyed)
         .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data"))
-        .def("tensor_write", &tensor_write, py::arg("tensor"), py::arg("data"));
+        .def(
+            "tensor_write",
+            py::overload_cast<ark::Executor *, const ark::Tensor &, py::buffer>(
+                &tensor_write),
+            py::arg("tensor"), py::arg("data"))
+        .def("tensor_write",
+             py::overload_cast<ark::Executor *, const ark::Tensor &, size_t,
+                               size_t>(&tensor_write),
+             py::arg("tensor"), py::arg("address"), py::arg("bytes"));
 }

From 87b9b0127de668f810847d04d4c2a08178439ee0 Mon Sep 17 00:00:00 2001
From: Noli Gerawork <86308445+naturalcandy@users.noreply.github.com>
Date: Tue, 18 Jun 2024 11:20:45 -0400
Subject: [PATCH 08/79] Python API Multiple Runtime Support (#216)

- Introduced support for multiple Runtime instances
- Added utility functions for multi-runtime management
- Ensured backward compatibility with existing usage patterns of Runtime
- Added unit tests for multi-runtime functionality

---------

Co-authored-by: noli <t-ngerawork@microsoft.com>
---
 ark/api/executor.cpp               | 101 +++++++++++++++++++++
 ark/include/ark/executor.hpp       |   6 ++
 python/ark/init.py                 |   5 +-
 python/ark/ops.py                  | 138 ++++++++++++++++++++++------
 python/ark/runtime.py              | 139 +++++++++++++++++++++++------
 python/ark/tensor.py               |  69 ++++++++++----
 python/executor_py.cpp             |  30 ++++++-
 python/unittest/test.py            |   1 +
 python/unittest/test_conversion.py |  93 +++++++++++++++++++
 python/unittest/test_runtime.py    | 121 ++++++++++++++++++++++---
 10 files changed, 610 insertions(+), 93 deletions(-)
 create mode 100644 python/unittest/test_conversion.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 198d22e51..a0711bfe8 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -3,12 +3,15 @@
 
 #include "ark/executor.hpp"
 
+#include <dlpack/dlpack.h>
+
 #include <cmath>
 #include <memory>
 #include <mscclpp/core.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
 
+#include "ark/data_type.hpp"
 #include "ark/model.hpp"
 #include "ark/planner.hpp"
 #include "codegen.hpp"
@@ -154,6 +157,8 @@ class Executor::Impl {
     void tensor_read(const Tensor tensor, void *data, size_t bytes) const;
     void tensor_write(const Tensor tensor, const void *data,
                       size_t bytes) const;
+    DLDeviceType get_device_type() const;
+    DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
 
    private:
     void init_communicator();
@@ -783,6 +788,94 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
     copy_stream_->sync();
 }
 
+DLDeviceType Executor::Impl::get_device_type() const {
+#if defined(ARK_CUDA)
+    return kDLCUDA;
+#elif defined(ARK_ROCM)
+    return kDLROCM;
+#else
+    return kDLCPU;
+#endif
+}
+
+DLDataType get_dl_dtype(const DataType &ark_data_type) {
+    DLDataType dl_data_type;
+    dl_data_type.lanes = 1;
+    if (ark_data_type == FP32) {
+        dl_data_type.code = kDLFloat;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == FP16) {
+        dl_data_type.code = kDLFloat;
+        dl_data_type.bits = 16;
+    } else if (ark_data_type == BF16) {
+        dl_data_type.code = kDLBfloat;
+        dl_data_type.bits = 16;
+    } else if (ark_data_type == INT32) {
+        dl_data_type.code = kDLInt;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == UINT32) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == INT8) {
+        dl_data_type.code = kDLInt;
+        dl_data_type.bits = 8;
+    } else if (ark_data_type == UINT8) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 8;
+    } else if (ark_data_type == BYTE) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 8;
+    } else {
+        ERR(InvalidUsageError, "Unsupported data type");
+    }
+    return dl_data_type;
+}
+
+DLManagedTensor *Executor::Impl::get_dl_tensor(const Tensor &tensor) const {
+    DLTensor dl_tensor;
+    dl_tensor.data =
+        buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id()));
+    size_t offset_in_elements =
+        tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0];
+    dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes();
+    dl_tensor.device.device_type = get_device_type();
+    dl_tensor.device.device_id = static_cast<int32_t>(gpu_id_);
+    dl_tensor.ndim = static_cast<int32_t>(tensor.shape().ndims());
+    dl_tensor.dtype = get_dl_dtype(tensor.data_type());
+
+    dl_tensor.shape =
+        tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
+    dl_tensor.strides =
+        tensor.strides().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
+    auto shape = tensor.shape();
+    if (dl_tensor.shape) {
+        for (int i = 0; i < dl_tensor.ndim; ++i) {
+            dl_tensor.shape[i] = shape[i];
+        }
+    }
+    if (dl_tensor.strides) {
+        dl_tensor.strides[dl_tensor.ndim - 1] = 1;
+        for (int i = dl_tensor.ndim - 2; i >= 0; --i) {
+            dl_tensor.strides[i] =
+                dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1];
+        }
+    }
+    DLManagedTensor *dl_managed_tensor = new DLManagedTensor();
+    dl_managed_tensor->dl_tensor = dl_tensor;
+    dl_managed_tensor->manager_ctx = nullptr;
+    dl_managed_tensor->deleter = [](DLManagedTensor *self) {
+        if (self->dl_tensor.shape) {
+            delete[] self->dl_tensor.shape;
+            self->dl_tensor.shape = nullptr;
+        }
+        if (self->dl_tensor.strides) {
+            delete[] self->dl_tensor.strides;
+            self->dl_tensor.strides = nullptr;
+        }
+    };
+    return dl_managed_tensor;
+}
+
 Executor::Executor(int rank, int world_size, int gpu_id,
                    const std::string &name, const std::string &plan)
     : impl_(std::make_unique<Executor::Impl>(rank, world_size, gpu_id, name,
@@ -818,6 +911,14 @@ void Executor::tensor_write(const Tensor tensor, const void *data,
     impl_->tensor_write(tensor, data, bytes);
 }
 
+DLDeviceType Executor::get_device_type() const {
+    return impl_->get_device_type();
+}
+
+DLManagedTensor *Executor::get_dl_tensor(const Tensor &tensor) const {
+    return impl_->get_dl_tensor(tensor);
+}
+
 DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id,
                                  const std::string &name)
     : Executor(
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 4682af7d0..54c49cd29 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -4,6 +4,8 @@
 #ifndef ARK_EXECUTOR_HPP
 #define ARK_EXECUTOR_HPP
 
+#include <dlpack/dlpack.h>
+
 #include <ark/model_ref.hpp>
 #include <ark/tensor.hpp>
 #include <memory>
@@ -62,6 +64,10 @@ class Executor {
     void tensor_write(const Tensor tensor, const void *data,
                       size_t bytes) const;
 
+    DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
+
+    DLDeviceType get_device_type() const;
+
    private:
     class Impl;
     std::unique_ptr<Impl> impl_;
diff --git a/python/ark/init.py b/python/ark/init.py
index be71e8e02..dbf7c1569 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -9,7 +9,6 @@
 def init():
     """Initializes ARK."""
     Model.reset()
-    if _RuntimeState.executor is not None:
-        if not _RuntimeState.executor.destroyed():
-            _RuntimeState.executor.destroy()
+    if _RuntimeState.runtime:
+        _RuntimeState.delete_all()
     _ark_core.init()
diff --git a/python/ark/ops.py b/python/ark/ops.py
index bc1c3ed13..86b021aef 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -59,6 +59,8 @@ def add(
     tensor_add = ark.add(tensor1, tensor2)
     """
     if isinstance(input, Tensor) and isinstance(other, Tensor):
+        if input.runtime_id != other.runtime_id:
+            raise ValueError("Tensors must be on the same runtime")
         a = input._tensor
         b = other._tensor
     elif isinstance(input, Tensor):
@@ -75,7 +77,9 @@ def add(
         )
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().add(a, b, output, name))
+    return Tensor(
+        Model.get_model().add(a, b, output, name), runtime_id=input.runtime_id
+    )
 
 
 def cast(
@@ -88,7 +92,8 @@ def cast(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().cast(input._tensor, dtype.ctype(), output, name)
+        Model.get_model().cast(input._tensor, dtype.ctype(), output, name),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -97,10 +102,12 @@ def constant(
     shape: Iterable[int],
     dtype: DataType = fp32,
     name: str = "constant",
+    runtime_id: int = -1,
 ) -> Tensor:
     """Constant."""
     return Tensor(
-        Model.get_model().constant(value, Dims(shape), dtype.ctype(), name)
+        Model.get_model().constant(value, Dims(shape), dtype.ctype(), name),
+        runtime_id=runtime_id,
     )
 
 
@@ -112,7 +119,10 @@ def copy(
         output = output._tensor
     if isinstance(input, Tensor):
         intput = intput._tensor
-    return Tensor(Model.get_model().copy(intput, output, name))
+    return Tensor(
+        Model.get_model().copy(intput, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def div(
@@ -130,8 +140,13 @@ def div(
     if output is not NullTensor:
         output = output._tensor
     if isinstance(other, Tensor):
+        if input.runtime_id != other.runtime_id:
+            raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
-    return Tensor(Model.get_model().div(input._tensor, other, output, name))
+    return Tensor(
+        Model.get_model().div(input._tensor, other, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def embedding(
@@ -141,10 +156,15 @@ def embedding(
     name: str = "embedding",
 ) -> Tensor:
     """Embedding layer."""
+    if input.runtime_id != weight.runtime_id:
+        raise ValueError("Tensors must be on the same runtime")
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().embedding(input._tensor, weight._tensor, output, name)
+        Model.get_model().embedding(
+            input._tensor, weight._tensor, output, name
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -158,7 +178,10 @@ def exp(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().exp(input._tensor, output, name))
+    return Tensor(
+        Model.get_model().exp(input._tensor, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def gelu(
@@ -174,7 +197,10 @@ def gelu(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().gelu(input._tensor, output, name))
+    return Tensor(
+        Model.get_model().gelu(input._tensor, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def identity(
@@ -189,8 +215,13 @@ def identity(
     for dep in deps:
         if not isinstance(dep, Tensor):
             raise TypeError("All dependencies should be a tensor")
+        if input.runtime_id != dep.runtime_id:
+            raise ValueError("All tensors must be on the same runtime")
         dep_tensors.append(dep._tensor)
-    return Tensor(Model.get_model().identity(input._tensor, dep_tensors, name))
+    return Tensor(
+        Model.get_model().identity(input._tensor, dep_tensors, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def matmul(
@@ -210,6 +241,8 @@ def matmul(
     Usage:
     tensor_matmul = ark.matmul(tensor1, tensor2)
     """
+    if input.runtime_id != other.runtime_id:
+        raise ValueError("Tensors must be on the same runtime")
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
@@ -220,7 +253,8 @@ def matmul(
             transpose_input,
             transpose_other,
             name,
-        )
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -239,8 +273,13 @@ def mul(
     if output is not NullTensor:
         output = output._tensor
     if isinstance(other, Tensor):
+        if input.runtime_id != other.runtime_id:
+            raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
-    return Tensor(Model.get_model().mul(input._tensor, other, output, name))
+    return Tensor(
+        Model.get_model().mul(input._tensor, other, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def noop(input: Tensor, name: str = "noop"):
@@ -268,7 +307,8 @@ def reduce_max(
     return Tensor(
         Model.get_model().reduce_max(
             input._tensor, axis, keepdims, output, name
-        )
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -290,7 +330,8 @@ def reduce_mean(
     return Tensor(
         Model.get_model().reduce_mean(
             input._tensor, axis, keepdims, output, name
-        )
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -314,7 +355,8 @@ def reduce_sum(
     return Tensor(
         Model.get_model().reduce_sum(
             input._tensor, axis, keepdims, output, name
-        )
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -329,7 +371,10 @@ def relu(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().relu(input._tensor, output, name))
+    return Tensor(
+        Model.get_model().relu(input._tensor, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def reshape(
@@ -357,7 +402,8 @@ def reshape(
     if len(shape) > 4:
         raise ValueError("Only support tensors with up to 4 dimensions")
     return Tensor(
-        Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name)
+        Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -374,8 +420,11 @@ def rope(
     """
     if output is not NullTensor:
         output = output._tensor
+    if input.runtime_id != other.runtime_id:
+        raise ValueError("Tensors must be on the same runtime")
     return Tensor(
-        Model.get_model().rope(input._tensor, other._tensor, output, name)
+        Model.get_model().rope(input._tensor, other._tensor, output, name),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -389,7 +438,10 @@ def rsqrt(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().rsqrt(input._tensor, output, name))
+    return Tensor(
+        Model.get_model().rsqrt(input._tensor, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def sharding(
@@ -407,7 +459,9 @@ def sharding(
     _tensor_list = Model.get_model().sharding(
         input._tensor, axis, dim_per_shard, name
     )
-    return [Tensor(_tensor) for _tensor in _tensor_list]
+    return [
+        Tensor(_tensor, runtime_id=input.runtime_id) for _tensor in _tensor_list
+    ]
 
 
 def sigmoid(
@@ -421,7 +475,10 @@ def sigmoid(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().sigmoid(input._tensor, output, name))
+    return Tensor(
+        Model.get_model().sigmoid(input._tensor, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def sqrt(
@@ -434,7 +491,10 @@ def sqrt(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().sqrt(input._tensor, output, name))
+    return Tensor(
+        Model.get_model().sqrt(input._tensor, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def sub(
@@ -452,8 +512,13 @@ def sub(
     if output is not NullTensor:
         output = output._tensor
     if isinstance(other, Tensor):
+        if input.runtime_id != other.runtime_id:
+            raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
-    return Tensor(Model.get_model().sub(input._tensor, other, output, name))
+    return Tensor(
+        Model.get_model().sub(input._tensor, other, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def tensor(
@@ -463,6 +528,7 @@ def tensor(
     offsets: Iterable[int] = [],
     padded_shape: Iterable[int] = [],
     name: str = "",
+    runtime_id: int = -1,
 ) -> Tensor:
     """
     Construct a tensor with given shape and data type.
@@ -470,7 +536,10 @@ def tensor(
     tensor = ark.tensor([1, 2, 3, 4], dtype=ark.fp32)
     tensor = ark.tensor([1, 2], dtype=ark.fp16)
     """
-    return Tensor(_tensor(shape, dtype, strides, offsets, padded_shape, name))
+    return Tensor(
+        _tensor(shape, dtype, strides, offsets, padded_shape, name),
+        runtime_id=runtime_id,
+    )
 
 
 def transpose(
@@ -496,7 +565,8 @@ def transpose(
     if len(perm) > 4:
         raise ValueError("Only support perm up to 4 dimensions")
     return Tensor(
-        Model.get_model().transpose(input._tensor, perm, output, name)
+        Model.get_model().transpose(input._tensor, perm, output, name),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -515,11 +585,15 @@ def mean(
 
 
 def ones(
-    shape: Iterable[int], dtype: DataType = fp32, name: str = "ones"
+    shape: Iterable[int],
+    dtype: DataType = fp32,
+    name: str = "ones",
+    runtime_id: int = -1,
 ) -> Tensor:
     """Ones."""
     return Tensor(
-        Model.get_model().constant(1, Dims(shape), dtype.ctype(), name)
+        Model.get_model().constant(1, Dims(shape), dtype.ctype(), name),
+        runtime_id=runtime_id,
     )
 
 
@@ -530,12 +604,14 @@ def parameter(
     offsets: Iterable[int] = [],
     padded_shape: Iterable[int] = [],
     name: str = "",
+    runtime_id: int = -1,
 ) -> Parameter:
     """
     Construct a parameter with given shape and data type.
     """
     return Parameter(
-        _tensor(shape, dtype, strides, offsets, padded_shape, name)
+        _tensor(shape, dtype, strides, offsets, padded_shape, name),
+        runtime_id=runtime_id,
     )
 
 
@@ -569,11 +645,15 @@ def layernorm(
 
 
 def zeros(
-    shape: Iterable[int], dtype: DataType = fp32, name: str = "zeros"
+    shape: Iterable[int],
+    dtype: DataType = fp32,
+    name: str = "zeros",
+    runtime_id: int = -1,
 ) -> Tensor:
     """Zeros."""
     return Tensor(
-        Model.get_model().constant(0, Dims(shape), dtype.ctype(), name)
+        Model.get_model().constant(0, Dims(shape), dtype.ctype(), name),
+        runtime_id=runtime_id,
     )
 
 
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 7480ce7da..798eaf9d5 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -3,7 +3,7 @@
 
 import logging
 from enum import Enum
-from typing import Callable
+from typing import Callable, Dict, List
 
 from _ark_core import _Executor, _DefaultPlanner
 from .model import Model
@@ -14,8 +14,36 @@ class _RuntimeState:
     The _RuntimeState class is used to store the state of the model.
     """
 
-    runtime = None
-    executor = None
+    runtime: Dict[int, "Runtime"] = {}
+
+    @staticmethod
+    def reset_all():
+        """
+        Resets all runtimes.
+        """
+        runtime_ids = list(_RuntimeState.runtime.keys())
+        for runtime_id in runtime_ids:
+            _RuntimeState.runtime[runtime_id].reset()
+
+    @staticmethod
+    def delete_all():
+        """
+        Deletes all runtimes.
+        """
+        runtime_ids = list(_RuntimeState.runtime.keys())
+        for runtime_id in runtime_ids:
+            _RuntimeState.runtime[runtime_id].reset(delete=True)
+
+    @staticmethod
+    def print_runtime_states():
+        """
+        Print runtimes and their corresponding states.
+        """
+        print(f"{'Runtime ID':<12} | {'Status':<20}")
+        print(f"{'-'*12} | {'-'*20}")
+        for runtime_id, runtime in _RuntimeState.runtime.items():
+            runtime_id = "-1(Default)" if runtime_id == -1 else runtime_id
+            print(f"{runtime_id:<12} | {runtime.state:<20}")
 
 
 class DefaultPlanner(_DefaultPlanner):
@@ -61,22 +89,48 @@ class State(Enum):
         LaunchedNotRunning = 1
         Running = 2
 
+    def __init__(self, runtime_id: int = -1):
+        self.runtime_id = runtime_id
+        self.executor: Executor = None
+        self.state: Runtime.State = Runtime.State.Init
+        _RuntimeState.runtime[runtime_id] = self
+
+    def get_state(self) -> "Runtime.State":
+        """
+        Get the runtime state.
+        """
+        return self.state
+
     @staticmethod
-    def get_runtime() -> "Runtime":
+    def exists(runtime_id: int) -> bool:
         """
-        Get the runtime.
+        Check if a runtime exists with the given ID.
         """
-        if _RuntimeState.runtime is None:
-            _RuntimeState.runtime = Runtime()
-        return _RuntimeState.runtime
+        return runtime_id in _RuntimeState.runtime
 
-    def __init__(self):
-        self.executor: Executor = None
-        self.state: Runtime.State = Runtime.State.Init
-        _RuntimeState.runtime = self
+    @staticmethod
+    def get_all_ids() -> List[int]:
+        """
+        Get a list of all existing runtime IDs.
+        """
+        return list(_RuntimeState.runtime.keys())
 
-    def __del__(self):
-        self.reset()
+    @staticmethod
+    def get_runtime(runtime_id=-1) -> "Runtime":
+        """
+        Get the runtime by ID. If runtime_id is not provided, use a default ID of -1.
+        If the runtime does not exist, create a new runtime with the given ID.
+        """
+        if runtime_id not in _RuntimeState.runtime:
+            _RuntimeState.runtime[runtime_id] = Runtime(runtime_id)
+        return _RuntimeState.runtime[runtime_id]
+
+    @staticmethod
+    def see_runtime_statuses() -> "Dict[int, Runtime]":
+        """
+        Returns the runtime dictionary containing all of the runtimes.
+        """
+        return _RuntimeState.runtime
 
     def __enter__(self):
         return self
@@ -113,7 +167,9 @@ def launch(
         initialized. The executor will compile the cuda kernels and launch the ARK runtime.
         """
         if self.launched():
-            logging.warn("Runtime is already launched, skip launching")
+            logging.warn(
+                f"Runtime {self.runtime_id} is already launched, skip launching"
+            )
             return
         if not plan:
             if not plan_path:
@@ -124,19 +180,19 @@ def launch(
         # If the RuntimeState is init, we need to create a new executor and
         # compile the kernels
         if self.state == Runtime.State.Init:
-            if _RuntimeState.executor is not None:
-                if not _RuntimeState.executor.destroyed():
-                    logging.warn("Destroying an old executor")
-                    _RuntimeState.executor.destroy()
-
-            _RuntimeState.executor = Executor(
+            if self.executor is not None:
+                if not self.executor.destroyed():
+                    logging.warn(
+                        f"Runtime {self.runtime_id}, has already been launched. Destroying the old executor"
+                    )
+                    self.executor.destroy()
+            self.executor = Executor(
                 rank,
                 world_size,
                 gpu_id,
                 "ArkRuntime",
                 plan,
             )
-            self.executor = _RuntimeState.executor
             self.executor.compile()
         self.executor.launch()
         self.state = Runtime.State.LaunchedNotRunning
@@ -146,8 +202,8 @@ def run(self, iter=1, non_blocking=False):
         Run the ARK program for iter iterations and wait for the kernel to finish.
         """
         if self.state != Runtime.State.LaunchedNotRunning:
-            logging.error("ARK runtime is not launched")
-            raise RuntimeError("ARK runtime is not launched")
+            logging.error(f"ARK runtime {self.runtime_id} is not launched")
+            raise RuntimeError(f"ARK runtime {self.runtime_id} is not launched")
         self.state = Runtime.State.Running
         self.executor.run(iter)
         if not non_blocking:
@@ -158,7 +214,9 @@ def wait(self):
         Wait for the kernel to finish.
         """
         if self.state != Runtime.State.Running:
-            logging.warn("ARK runtime is not running, skip waiting")
+            logging.warn(
+                f"ARK runtime {self.runtime_id} is not running, skip waiting"
+            )
             return
         self.executor.wait()
         self.state = Runtime.State.LaunchedNotRunning
@@ -169,15 +227,17 @@ def stop(self) -> float:
         Once this is called, we need to call `launch()` again to run the model again.
         """
         if not self.launched():
-            logging.warn("ARK runtime is never launched, skip stopping")
+            logging.warn(
+                f"ARK runtime {self.runtime_id} is never launched, skip stopping"
+            )
             return
         elapsed = self.executor.stop()
         self.state = Runtime.State.LaunchedNotRunning
         return elapsed
 
-    def reset(self):
+    def reset(self, delete=False):
         """
-        Reset the runtime.
+        Reset the runtime. If delete is True, delete the runtime associated with the runtime_id.
         """
         if self.launched():
             self.stop()
@@ -186,3 +246,26 @@ def reset(self):
                 self.executor.destroy()
             self.executor = None
         self.state = Runtime.State.Init
+        if delete:
+            del _RuntimeState.runtime[self.runtime_id]
+
+    @staticmethod
+    def reset_all_runtimes():
+        """
+        Reset all runtimes.
+        """
+        _RuntimeState.reset_all()
+
+    @staticmethod
+    def delete_all_runtimes():
+        """
+        Delete all runtimes.
+        """
+        _RuntimeState.delete_all()
+
+    @staticmethod
+    def print_runtime_states():
+        """
+        Print runtimes and their corresponding states.
+        """
+        _RuntimeState.print_runtime_states()
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index a567264d5..00e266929 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -29,14 +29,22 @@ class Dims(_Dims):
 
 
 class Tensor:
-    def __init__(self, _tensor: _Tensor, initializer: Initializer = None):
+    def __init__(
+        self,
+        _tensor: _Tensor,
+        initializer: Initializer = None,
+        runtime_id: int = -1,
+    ):
         """
         Initializes a new instance of the Tensor class.
         Args:
             _tensor (_ark_core._Tensor): The underlying _Tensor object.
+            intializer (Initializer): The initializer for the Tensor.
+            runtime_id (int): The ID of the Runtime to use. Defaults to -1, which is the default Runtime.
         """
         self._tensor = _tensor
         self.initializer: Initializer = initializer
+        self.runtime_id = runtime_id
 
     def shape(self) -> List[int]:
         """
@@ -69,7 +77,7 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray:
         an empty numpy array without the data buffer will be returned.
         """
         np_type = self.dtype().to_numpy()
-        rt = Runtime.get_runtime()
+        rt = Runtime.get_runtime(self.runtime_id)
         if not rt.launched():
             return np.ndarray(self.shape(), dtype=np_type, buffer=None)
         if ndarray is None:
@@ -85,7 +93,9 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray:
         rt.executor.tensor_read(self._tensor, ndarray)
         return ndarray
 
-    def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor:
+    def to_torch(
+        self, tensor: torch.Tensor = None, runtime_id: int = -1
+    ) -> torch.Tensor:
         """ """
         if _no_torch:
             raise ImportError("torch is not available")
@@ -100,22 +110,42 @@ def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor:
             raise ValueError("torch tensor is not contiguous in memory")
         elif tensor.numel() != self.nelems():
             raise ValueError("torch tensor size does not match the tensor")
-        tensor.copy_(torch.from_numpy(self.to_numpy()))
+        tensor.copy_(torch.from_numpy(self.to_numpy(self.runtime_id)))
         return tensor
 
-    @staticmethod
-    def from_numpy(ndarray: np.ndarray):
-        return Tensor(
-            Model.get_model().tensor(
-                Dims(list(ndarray.shape)),
-                DataType.from_numpy(ndarray.dtype).ctype(),
-                Dims(),
-                Dims(),
-                Dims(),
-                "",
-            ),
-            lambda: ndarray,
-        )
+    def get_torch_view(self) -> torch.Tensor:
+        """
+        Returns a torch tensor that shares the same memory with the device tensor.
+        """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        rt = Runtime.get_runtime(self.runtime_id)
+        if not rt.launched():
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.get_torch_view()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
+        dl_tensor = rt.executor.get_dl_tensor(self._tensor)
+        torch_view = torch.utils.dlpack.from_dlpack(dl_tensor)
+        return torch_view
+
+    def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
+        """
+        Copies the tensor from a host numpy array to the device.
+        """
+        rt = Runtime.get_runtime(self.runtime_id)
+        if not rt.launched():
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.from_numpy()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
+        ndarray = ndarray.astype(self.dtype().to_numpy())
+        if not ndarray.flags["C_CONTIGUOUS"]:
+            ndarray = np.ascontiguousarray(ndarray)
+        if ndarray.nbytes != self.nelems() * self.dtype().element_size():
+            raise ValueError("ndarray size does not match the tensor")
+        rt.executor.tensor_write(self._tensor, ndarray)
+        return self
 
     @staticmethod
     def from_torch(tensor: torch.Tensor):
@@ -135,7 +165,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
         """
         Copies the tensor from a host numpy array to the device.
         """
-        rt = Runtime.get_runtime()
+        rt = Runtime.get_runtime(self.runtime_id)
         if not rt.launched():
             raise RuntimeError(
                 "Tensor is not allocated yet. `Tensor.from_numpy()` is "
@@ -180,8 +210,9 @@ class Parameter(Tensor):
     A tensor as a parameter.
     """
 
-    def __init__(self, _tensor: _Tensor):
+    def __init__(self, _tensor: _Tensor, runtime_id: int = -1):
         """
         Initializes a new instance of the Parameter class.
         """
         super().__init__(_tensor)
+        self.runtime_id = runtime_id
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 13a81608e..59bee5a9b 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -1,13 +1,14 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
+#include <dlpack/dlpack.h>
 #include <pybind11/operators.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
 #include <ark/executor.hpp>
 #include <ark/model.hpp>
-
+#include <iostream>
 namespace py = pybind11;
 
 static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
@@ -29,6 +30,29 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
                      info.size * info.itemsize);
 }
 
+DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) {
+    DLManagedTensor *dl_tensor = exe.get_dl_tensor(tensor);
+    return dl_tensor;
+}
+
+void free_capsule(PyObject *capsule) {
+    const char *name = PyCapsule_GetName(capsule);
+    auto *dl_managed_tensor =
+        static_cast<DLManagedTensor *>(PyCapsule_GetPointer(capsule, name));
+    if (dl_managed_tensor) {
+        dl_managed_tensor->deleter(dl_managed_tensor);
+        dl_managed_tensor = nullptr;
+    }
+}
+
+py::capsule to_dlpack_capsule(ark::Executor &self, const ark::Tensor &tensor) {
+    DLManagedTensor *dl_managed_tensor = to_dlpack(self, tensor);
+    const char *capsule_name = "dltensor";
+    PyObject *dl_capsule = PyCapsule_New(static_cast<void *>(dl_managed_tensor),
+                                         capsule_name, free_capsule);
+    return py::reinterpret_steal<py::capsule>(dl_capsule);
+}
+
 void register_executor(py::module &m) {
     py::class_<ark::Executor>(m, "_Executor")
         .def(
@@ -52,5 +76,7 @@ void register_executor(py::module &m) {
         .def("tensor_write",
              py::overload_cast<ark::Executor *, const ark::Tensor &, size_t,
                                size_t>(&tensor_write),
-             py::arg("tensor"), py::arg("address"), py::arg("bytes"));
+             py::arg("tensor"), py::arg("address"), py::arg("bytes"))
+        .def("get_dl_tensor", &to_dlpack_capsule),
+        py::arg("tensor");
 }
diff --git a/python/unittest/test.py b/python/unittest/test.py
index f6f9b97af..e43ff11e2 100644
--- a/python/unittest/test.py
+++ b/python/unittest/test.py
@@ -9,3 +9,4 @@
 
 from test_model import *
 from test_runtime import *
+from test_conversion import *
diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py
new file mode 100644
index 000000000..8f941a033
--- /dev/null
+++ b/python/unittest/test_conversion.py
@@ -0,0 +1,93 @@
+import torch
+import numpy as np
+import ark
+
+
+def initialize_tensor(dimensions, dtype):
+    tensor = ark.tensor(dimensions, dtype)
+    tensor_host = np.random.rand(*dimensions).astype(dtype.to_numpy())
+    return tensor, tensor_host
+
+
+# Test function to validate the integrity of the PyTorch view of the ARK tensor,
+# including its data and attributes such as shape and data type.
+def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType):
+    ark.init()
+    dimensions = [size] * num_dims
+
+    input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype)
+    other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype)
+    output_tensor = ark.add(input_tensor, other_tensor)
+
+    runtime = ark.Runtime()
+    runtime.launch()
+
+    input_tensor.from_numpy(input_tensor_host)
+    other_tensor.from_numpy(other_tensor_host)
+
+    input_view = input_tensor.get_torch_view()
+    other_view = other_tensor.get_torch_view()
+    output_view = output_tensor.get_torch_view()
+
+    runtime.run()
+
+    input_view_numpy = input_view.cpu().numpy()
+    other_view_numpy = other_view.cpu().numpy()
+    output_view_numpy = output_view.cpu().numpy()
+
+    output_tensor_host = output_tensor.to_numpy()
+
+    runtime.stop()
+    runtime.delete_all_runtimes()
+
+    assert np.allclose(input_tensor_host, input_view_numpy)
+    assert np.allclose(other_tensor_host, other_view_numpy)
+    assert np.allclose(output_tensor_host, output_view_numpy)
+
+
+# Function to check if there is a difference between two arrays at a specific index
+def check_diff(input_tensor_host, input_view_numpy, value, index):
+    mask = np.ones(input_tensor_host.shape, dtype=bool)
+    mask[index] = False
+    if not np.allclose(input_tensor_host[mask], input_view_numpy[mask]):
+        print("Difference found at index: ", index)
+        return False
+    if input_view_numpy[index] != value:
+        print(input_view_numpy[index], value)
+        return False
+    return True
+
+
+# Test function to check if changes to the torch views are reflected in the original tensors
+def test_aliasing(dtype: ark.DataType):
+    ark.init()
+    dimensions = [4, 4]
+    input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype)
+    other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype)
+    output_tensor = ark.mul(input_tensor, other_tensor)
+    runtime = ark.Runtime()
+    runtime.launch()
+    input_tensor.from_numpy(input_tensor_host)
+    other_tensor.from_numpy(other_tensor_host)
+
+    input_view = input_tensor.get_torch_view()
+    other_view = other_tensor.get_torch_view()
+    output_view = output_tensor.get_torch_view()
+    # make changes to the views
+    input_view[1, 1] = 20
+    other_view[0, 0] = 30
+    runtime.run()
+    output_view[3, 0] = 40
+
+    output_tensor_host = output_tensor.to_numpy()
+    input_view_numpy = input_view.cpu().numpy()
+    other_view_numpy = other_view.cpu().numpy()
+    output_view_numpy = output_view.cpu().numpy()
+    # Check if changes to the views are reflected in the original tensors
+    print(input_view_numpy)
+    assert check_diff(input_tensor_host, input_view_numpy, 20, (1, 1))
+    assert check_diff(other_tensor_host, other_view_numpy, 30, (0, 0))
+    assert check_diff(output_tensor_host, output_view_numpy, 40, (3, 0))
+
+    runtime.stop()
+    runtime.reset()
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index bd9098fe8..fd34bb96b 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -4,21 +4,20 @@
 import ark
 import json
 
+empty_plan = json.dumps(
+    {
+        "Rank": 0,
+        "WorldSize": 1,
+        "NumProcessors": 1,
+        "NumWarpsPerProcessor": 1,
+        "TaskInfos": [],
+        "ProcessorGroups": [],
+    }
+)
+
 
 def test_runtime_relaunch():
     ark.init()
-
-    empty_plan = json.dumps(
-        {
-            "Rank": 0,
-            "WorldSize": 1,
-            "NumProcessors": 1,
-            "NumWarpsPerProcessor": 1,
-            "TaskInfos": [],
-            "ProcessorGroups": [],
-        }
-    )
-
     with ark.Runtime.get_runtime() as rt:
         assert rt.launched() == False
         rt.launch(plan=empty_plan)
@@ -28,3 +27,101 @@ def test_runtime_relaunch():
         assert rt.launched() == False
         rt.launch(plan=empty_plan)
         assert rt.launched() == True
+
+
+def test_multiple_runtime_launch():
+    ark.init()
+    num_runtimes = 5
+    for i in range(num_runtimes):
+        rt = ark.Runtime.get_runtime(i)
+        assert rt.launched() == False
+        rt.launch(gpu_id=i, plan=empty_plan)
+        assert rt.launched() == True
+    for i in range(num_runtimes):
+        rt = ark.Runtime.get_runtime(i)
+        assert rt.launched() == True
+    ark.Runtime.delete_all_runtimes()
+
+
+def test_stop_runtime():
+    ark.init()
+    rt1 = ark.Runtime.get_runtime(1)
+    rt1.launch(plan=empty_plan, gpu_id=1)
+    rt2 = ark.Runtime.get_runtime(2)
+    rt2.launch(plan=empty_plan, gpu_id=2)
+    rt1.stop()
+    rt1.reset()
+    assert rt1.state == ark.Runtime.State.Init
+    assert rt2.state == ark.Runtime.State.LaunchedNotRunning
+    ark.Runtime.delete_all_runtimes()
+
+
+def test_reset_runtime():
+    ark.init()
+    rt1 = ark.Runtime.get_runtime(0)
+    rt1.launch(plan=empty_plan, gpu_id=1)
+    rt2 = ark.Runtime.get_runtime(1)
+    rt2.launch(plan=empty_plan, gpu_id=2)
+    rt1.reset()
+    assert rt1.launched() == False
+    assert rt2.launched() == True
+    rt1.launch(plan=empty_plan)
+    assert rt1.launched() == True
+    ark.Runtime.delete_all_runtimes()
+
+
+def test_multiple_runtimes_complex():
+    ark.init()
+    num_runtimes = 3
+    runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)]
+    default_runtime = ark.Runtime.get_runtime()
+    runtime_list.append(default_runtime)
+    for i, rt in enumerate(runtime_list):
+        rt.launch(plan=empty_plan, gpu_id=i)
+        assert rt.launched() == True
+    runtime_list[0].stop()
+    assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
+    for rt in runtime_list[1:]:
+        assert rt.launched() == True
+    runtime_list[1].reset()
+    assert runtime_list[1].state == ark.Runtime.State.Init
+    assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
+    assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning
+    runtime_list[1].launch(plan=empty_plan, gpu_id=1)
+    for rt in runtime_list:
+        assert rt.launched() == True
+    ark.Runtime.delete_all_runtimes()
+
+
+def test_runtime_state_after_reset():
+    ark.init()
+    rt = ark.Runtime.get_runtime()
+    rt.launch(plan=empty_plan)
+    rt.reset()
+    assert rt.launched() == False
+    assert rt.running() == False
+    ark.Runtime.delete_all_runtimes()
+
+
+def test_see_runtime_statuses():
+    ark.init()
+    num_runtimes = 3
+    runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)]
+    runtime_statuses = ark.Runtime.see_runtime_statuses()
+    assert len(runtime_statuses) == num_runtimes
+    for i in range(num_runtimes):
+        assert i in runtime_statuses
+    for i, rt in enumerate(runtimes):
+        assert runtime_statuses[i] == rt
+    ark.Runtime.delete_all_runtimes()
+
+
+def test_multiple_runtimes_init():
+    ark.init()
+    runtimes = [ark.Runtime.get_runtime(i) for i in range(3)]
+    for rt in runtimes:
+        assert rt.state == ark.Runtime.State.Init
+    ark.init()
+    runtimes = ark.Runtime.see_runtime_statuses()
+    assert len(runtimes) == 0
+    ark.Runtime.delete_all_runtimes()

From 9a0556bde84a4dd6a76f39155d60957c9165ad52 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 18 Jun 2024 21:30:02 +0000
Subject: [PATCH 09/79] cmake dlpack

---
 .gitmodules                |  4 ++++
 ark/CMakeLists.txt         |  1 +
 third_party/CMakeLists.txt | 13 +++++++++++++
 third_party/dlpack         |  1 +
 4 files changed, 19 insertions(+)
 create mode 160000 third_party/dlpack

diff --git a/.gitmodules b/.gitmodules
index ced5dcf94..ec484eb61 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -17,3 +17,7 @@
 [submodule "third_party/json"]
 	path = third_party/json
 	url = https://github.com/nlohmann/json
+
+[submodule "third_party/dlpack"]
+	path = third_party/dlpack
+	url = https://github.com/dmlc/dlpack
diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt
index 4457d3c0b..ce03b65ed 100644
--- a/ark/CMakeLists.txt
+++ b/ark/CMakeLists.txt
@@ -17,6 +17,7 @@ set(COMMON_LIBS ARK::numa ARK::ibverbs pthread rt)
 target_include_directories(ark_obj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
 target_include_directories(ark_obj PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
 target_include_directories(ark_obj SYSTEM PRIVATE
+    ${DLPACK_INCLUDE_DIRS}
     ${JSON_INCLUDE_DIRS}
     ${MSCCLPP_INCLUDE_DIRS}
     ${IBVERBS_INCLUDE_DIRS}
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 75916d962..cc4b5eb5c 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -35,6 +35,19 @@ if (NOT json_POPULATED)
 endif()
 set(JSON_INCLUDE_DIRS ${json_SOURCE_DIR}/include PARENT_SCOPE)
 
+# DLPack
+FetchContent_Declare(
+    dlpack
+    GIT_REPOSITORY https://github.com/dmlc/dlpack
+    GIT_TAG v0.8
+    SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/dlpack
+)
+FetchContent_GetProperties(dlpack)
+if (NOT dlpack_POPULATED)
+    FetchContent_Populate(dlpack)
+endif()
+set(DLPACK_INCLUDE_DIRS ${dlpack_SOURCE_DIR}/include PARENT_SCOPE)
+
 if(USE_CUDA)
     # Configure CUTLASS
     FetchContent_Declare(
diff --git a/third_party/dlpack b/third_party/dlpack
new file mode 160000
index 000000000..365b823ce
--- /dev/null
+++ b/third_party/dlpack
@@ -0,0 +1 @@
+Subproject commit 365b823cedb281cd0240ca601aba9b78771f91a3

From 75f7831b700783e899beaa15f950f125a7520d6c Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 18 Jun 2024 22:38:35 +0000
Subject: [PATCH 10/79] include dlpack for pybind

---
 python/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index efb9aea3e..bd25d01e6 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -20,3 +20,4 @@ file(GLOB_RECURSE BIND_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.c
 pybind11_add_module(ark_py ${BIND_SOURCES})
 set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core)
 target_link_libraries(ark_py PRIVATE ark_static)
+target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS})

From 94b44f20a15c892d5a47e1597d838891ca600553 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 24 Jun 2024 23:51:22 +0000
Subject: [PATCH 11/79] support d2d copy

---
 ark/api/executor.cpp               | 99 ++++++++++++++++++++----------
 ark/include/ark/executor.hpp       | 10 ++-
 python/ark/tensor.py               | 42 +++++++++----
 python/executor_py.cpp             | 33 +++++++---
 python/unittest/test_conversion.py | 37 ++++++++++-
 5 files changed, 162 insertions(+), 59 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index a0711bfe8..96e53c8cf 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -147,6 +147,8 @@ class Executor::Impl {
          const std::string &plan);
     ~Impl() = default;
 
+    int gpu_id() const { return gpu_id_; }
+
     void compile();
     void launch(int64_t max_spin_count);
     void run(int iter);
@@ -154,9 +156,10 @@ class Executor::Impl {
     float stop(int64_t max_spin_count);
     void barrier();
 
-    void tensor_read(const Tensor tensor, void *data, size_t bytes) const;
+    void tensor_read(const Tensor tensor, void *data, size_t bytes,
+                     bool is_d2d) const;
     void tensor_write(const Tensor tensor, const void *data,
-                      size_t bytes) const;
+                      size_t bytes, bool is_d2d) const;
     DLDeviceType get_device_type() const;
     DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
 
@@ -731,57 +734,83 @@ void Executor::Impl::barrier() {
 }
 
 void Executor::Impl::tensor_read(const Tensor tensor, void *data,
-                                 size_t bytes) const {
+                                 size_t bytes, bool is_d2d) const {
     GLOG(gpuSetDevice(gpu_id_));
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
-    if (bytes < tensor_data_bytes) {
-        ERR(InvalidUsageError, "Data buffer (", bytes,
-            ") is smaller than the tensor data (", tensor_data_bytes, ").");
+    if (bytes != tensor_data_bytes) {
+        ERR(InvalidUsageError, "Destination bytes (", bytes,
+            ") mismatches the tensor data bytes (", tensor_data_bytes, ").");
     }
-    size_t tensor_bytes =
-        tensor.strides().nelems() * tensor.data_type().bytes();
-    void *src =
-        buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id()));
+    size_t buffer_id = tensor.ref()->buffer()->id();
+    if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
+        ERR(NotFoundError, "Invalid buffer ID: ", buffer_id);
+    }
+    size_t offset = buffer_id_to_offset_.at(buffer_id);
+    auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost;
+    void *src = buffer_->ref(offset);
     if (tensor.strides() == tensor.shape()) {
-        GLOG(gpuMemcpyAsync(data, src, bytes, gpuMemcpyDeviceToHost,
-                            copy_stream_->get()));
-        copy_stream_->sync();
+        GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_->get()));
     } else {
+        size_t tensor_bytes =
+            tensor.strides().nelems() * tensor.data_type().bytes();
         std::vector<int8_t> tensor_host(tensor_bytes);
         GLOG(gpuMemcpyAsync(tensor_host.data(), src, tensor_bytes,
                             gpuMemcpyDeviceToHost, copy_stream_->get()));
         copy_stream_->sync();
-        tensor_to_data(tensor_host.data(), static_cast<int8_t *>(data),
-                       tensor.shape(), tensor.strides(), tensor.offsets(),
-                       tensor.data_type().bytes());
+        if (!is_d2d) {
+            tensor_to_data(tensor_host.data(), static_cast<int8_t *>(data),
+                    tensor.shape(), tensor.strides(), tensor.offsets(),
+                    tensor.data_type().bytes());
+            return;
+        }
+        // TODO: convert data layout on the device directly
+        std::vector<int8_t> data_host(bytes);
+        tensor_to_data(tensor_host.data(), data_host.data(),
+                tensor.shape(), tensor.strides(), tensor.offsets(),
+                tensor.data_type().bytes());
+        GLOG(gpuMemcpyAsync(data, data_host.data(), bytes,
+                            gpuMemcpyHostToDevice, copy_stream_->get()));
     }
+    copy_stream_->sync();
 }
 
 void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
-                                  size_t bytes) const {
+                                  size_t bytes, bool is_d2d) const {
     GLOG(gpuSetDevice(gpu_id_));
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
-    if (bytes < tensor_data_bytes) {
-        ERR(InvalidUsageError, "Data buffer (", bytes,
-            ") is smaller than the tensor data (", tensor_data_bytes, ").");
+    if (bytes != tensor_data_bytes) {
+        ERR(InvalidUsageError, "Source bytes (", bytes,
+            ") mismatches the tensor data bytes (", tensor_data_bytes, ").");
+    }
+    size_t buffer_id = tensor.ref()->buffer()->id();
+    if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
+        ERR(NotFoundError, "Invalid buffer ID: ", buffer_id);
     }
+    size_t offset = buffer_id_to_offset_.at(buffer_id);
     size_t tensor_bytes =
         tensor.strides().nelems() * tensor.data_type().bytes();
-    void *dst =
-        buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id()));
+    auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice;
+    void *dst = buffer_->ref(offset);
     if (tensor.strides() == tensor.shape()) {
-        GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, gpuMemcpyHostToDevice,
-                            copy_stream_->get()));
+        GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get()));
     } else {
         std::vector<int8_t> tensor_host(tensor_bytes);
-        GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes,
-                            gpuMemcpyDeviceToHost, copy_stream_->get()));
-        copy_stream_->sync();
-        data_to_tensor(tensor_host.data(), static_cast<const int8_t *>(data),
-                       tensor.shape(), tensor.strides(), tensor.offsets(),
-                       tensor.data_type().bytes());
+        if (!is_d2d) {
+            data_to_tensor(tensor_host.data(), static_cast<const int8_t *>(data),
+                        tensor.shape(), tensor.strides(), tensor.offsets(),
+                        tensor.data_type().bytes());
+        } else {
+            // TODO: convert data layout on the device directly
+            std::vector<int8_t> tmp(bytes);
+            GLOG(gpuMemcpyAsync(tmp.data(), data, bytes,
+                                gpuMemcpyDeviceToHost, copy_stream_->get()));
+            copy_stream_->sync();
+            data_to_tensor(tensor_host.data(), tmp.data(),
+                        tensor.shape(), tensor.strides(), tensor.offsets(),
+                        tensor.data_type().bytes());
+        }
         GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes,
                             gpuMemcpyHostToDevice, copy_stream_->get()));
     }
@@ -883,6 +912,8 @@ Executor::Executor(int rank, int world_size, int gpu_id,
 
 Executor::~Executor() = default;
 
+int Executor::gpu_id() const { return impl_->gpu_id(); }
+
 void Executor::compile() { impl_->compile(); }
 
 void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); }
@@ -902,13 +933,13 @@ void Executor::destroy() { impl_.reset(nullptr); }
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
 void Executor::tensor_read(const Tensor tensor, void *data,
-                           size_t bytes) const {
-    impl_->tensor_read(tensor, data, bytes);
+                           size_t bytes, bool is_d2d) const {
+    impl_->tensor_read(tensor, data, bytes, is_d2d);
 }
 
 void Executor::tensor_write(const Tensor tensor, const void *data,
-                            size_t bytes) const {
-    impl_->tensor_write(tensor, data, bytes);
+                            size_t bytes, bool is_d2d) const {
+    impl_->tensor_write(tensor, data, bytes, is_d2d);
 }
 
 DLDeviceType Executor::get_device_type() const {
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 54c49cd29..a5d6f0273 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -23,6 +23,9 @@ class Executor {
 
     ~Executor();
 
+    /// Return the GPU ID.
+    int gpu_id() const;
+
     /// Compile the model. This must be called before `launch()`.
     void compile();
 
@@ -59,10 +62,11 @@ class Executor {
                      data.size() * sizeof(T));
     }
 
-    void tensor_read(const Tensor tensor, void *data, size_t bytes) const;
+    void tensor_read(const Tensor tensor, void *data, size_t bytes,
+                     bool is_d2d = false) const;
 
-    void tensor_write(const Tensor tensor, const void *data,
-                      size_t bytes) const;
+    void tensor_write(const Tensor tensor, const void *data, size_t bytes,
+                      bool is_d2d = false) const;
 
     DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
 
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 00e266929..eff1bf20e 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -77,10 +77,17 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray:
         an empty numpy array without the data buffer will be returned.
         """
         np_type = self.dtype().to_numpy()
+        if np_type is None:
+            raise ValueError(
+                f"Tensor data type {self.dtype().__name__} is not supported by numpy."
+            )
         rt = Runtime.get_runtime(self.runtime_id)
         if not rt.launched():
-            return np.ndarray(self.shape(), dtype=np_type, buffer=None)
-        if ndarray is None:
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.to_numpy()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
+        elif ndarray is None:
             ndarray = np.zeros(self.shape(), dtype=np_type)
         elif not ndarray.flags["C_CONTIGUOUS"]:
             raise ValueError("ndarray is not contiguous in memory")
@@ -99,9 +106,18 @@ def to_torch(
         """ """
         if _no_torch:
             raise ImportError("torch is not available")
+        rt = Runtime.get_runtime(self.runtime_id)
+        if not rt.launched():
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.to_torch()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
         torch_type = self.dtype().to_torch()
         if tensor is None:
-            return torch.from_numpy(self.to_numpy())
+            dev_name = f"cuda:{rt.executor.gpu_id()}"
+            tensor = torch.zeros(
+                self.shape(), dtype=torch_type, device=torch.device(dev_name)
+            )
         elif tensor.shape != self.shape():
             raise ValueError("torch tensor shape does not match the tensor")
         elif tensor.dtype != torch_type:
@@ -110,7 +126,10 @@ def to_torch(
             raise ValueError("torch tensor is not contiguous in memory")
         elif tensor.numel() != self.nelems():
             raise ValueError("torch tensor size does not match the tensor")
-        tensor.copy_(torch.from_numpy(self.to_numpy(self.runtime_id)))
+        tensor_bytes = self.nelems() * self.dtype().element_size()
+        rt.executor.tensor_read(
+            self._tensor, tensor.data_ptr(), tensor_bytes, True
+        )
         return tensor
 
     def get_torch_view(self) -> torch.Tensor:
@@ -163,7 +182,8 @@ def from_torch(tensor: torch.Tensor):
 
     def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
         """
-        Copies the tensor from a host numpy array to the device.
+        Copies data into this tensor. The data type may differ,
+        but the size must match.
         """
         rt = Runtime.get_runtime(self.runtime_id)
         if not rt.launched():
@@ -171,24 +191,22 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
                 "Tensor is not allocated yet. `Tensor.from_numpy()` is "
                 "usable only after you call `Runtime.launch()`."
             )
+        tensor_bytes = self.nelems() * self.dtype().element_size()
         if isinstance(data, torch.Tensor):
-            if data.dtype != self.dtype().to_torch():
-                raise ValueError("data dtype does not match the tensor")
             if not data.is_contiguous():
                 data = data.contiguous()
-            if data.numel() != self.nelems():
+            if data.numel() * data.element_size() != tensor_bytes:
                 raise ValueError("data size does not match the tensor")
             rt.executor.tensor_write(
                 self._tensor,
                 data.data_ptr(),
-                data.numel() * data.element_size(),
+                tensor_bytes,
+                data.device.type == "cuda",
             )
         elif isinstance(data, np.ndarray):
-            if data.dtype != self.dtype().to_numpy():
-                raise ValueError("data dtype does not match the tensor")
             if not data.flags["C_CONTIGUOUS"]:
                 data = np.ascontiguousarray(data)
-            if data.nbytes != self.nelems() * self.dtype().element_size():
+            if data.nbytes != tensor_bytes:
                 raise ValueError("data size does not match the tensor")
             rt.executor.tensor_write(self._tensor, data)
         else:
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 59bee5a9b..b6cf8a7a8 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -15,19 +15,24 @@ static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
                          py::buffer host_buffer) {
     py::buffer_info info = host_buffer.request();
     exe->tensor_write(tensor, reinterpret_cast<void *>(info.ptr),
-                      info.size * info.itemsize);
+                      info.size * info.itemsize, false);
 }
 
 static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
-                         size_t host_address, size_t bytes) {
-    exe->tensor_write(tensor, reinterpret_cast<void *>(host_address), bytes);
+                         size_t address, size_t bytes, bool is_d2d) {
+    exe->tensor_write(tensor, reinterpret_cast<void *>(address), bytes, is_d2d);
 }
 
 static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
                         py::buffer host_buffer) {
     py::buffer_info info = host_buffer.request();
     exe->tensor_read(tensor, reinterpret_cast<void *>(info.ptr),
-                     info.size * info.itemsize);
+                     info.size * info.itemsize, false);
+}
+
+static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
+                        size_t address, size_t bytes, bool is_d2d) {
+    exe->tensor_read(tensor, reinterpret_cast<void *>(address), bytes, is_d2d);
 }
 
 DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) {
@@ -59,6 +64,7 @@ void register_executor(py::module &m) {
             py::init<int, int, int, const std::string &, const std::string &>(),
             py::arg("rank"), py::arg("world_size"), py::arg("gpu_id"),
             py::arg("name"), py::arg("plan"))
+        .def("gpu_id", &ark::Executor::gpu_id)
         .def("compile", &ark::Executor::compile)
         .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1)
         .def("run", &ark::Executor::run, py::arg("iter"))
@@ -67,7 +73,16 @@ void register_executor(py::module &m) {
         .def("barrier", &ark::Executor::barrier)
         .def("destroy", &ark::Executor::destroy)
         .def("destroyed", &ark::Executor::destroyed)
-        .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data"))
+        .def(
+            "tensor_read",
+            py::overload_cast<ark::Executor *, const ark::Tensor &, py::buffer>(
+                &tensor_read),
+            py::arg("tensor"), py::arg("data"))
+        .def("tensor_read",
+             py::overload_cast<ark::Executor *, const ark::Tensor &, size_t,
+                               size_t, bool>(&tensor_read),
+             py::arg("tensor"), py::arg("address"), py::arg("bytes"),
+             py::arg("is_d2d"))
         .def(
             "tensor_write",
             py::overload_cast<ark::Executor *, const ark::Tensor &, py::buffer>(
@@ -75,8 +90,8 @@ void register_executor(py::module &m) {
             py::arg("tensor"), py::arg("data"))
         .def("tensor_write",
              py::overload_cast<ark::Executor *, const ark::Tensor &, size_t,
-                               size_t>(&tensor_write),
-             py::arg("tensor"), py::arg("address"), py::arg("bytes"))
-        .def("get_dl_tensor", &to_dlpack_capsule),
-        py::arg("tensor");
+                               size_t, bool>(&tensor_write),
+             py::arg("tensor"), py::arg("address"), py::arg("bytes"),
+             py::arg("is_d2d"))
+        .def("get_dl_tensor", &to_dlpack_capsule);
 }
diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py
index 8f941a033..5befa1c34 100644
--- a/python/unittest/test_conversion.py
+++ b/python/unittest/test_conversion.py
@@ -1,7 +1,14 @@
-import torch
+import pytest
 import numpy as np
 import ark
 
+try:
+    import torch
+
+    _no_torch = False
+except ImportError:
+    _no_torch = True
+
 
 def initialize_tensor(dimensions, dtype):
     tensor = ark.tensor(dimensions, dtype)
@@ -11,6 +18,8 @@ def initialize_tensor(dimensions, dtype):
 
 # Test function to validate the integrity of the PyTorch view of the ARK tensor,
 # including its data and attributes such as shape and data type.
+@pytest.mark.parametrize("num_dims,size", [(1, 5), (1, 1024), (2, 5), (2, 32)])
+@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32])
 def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType):
     ark.init()
     dimensions = [size] * num_dims
@@ -59,6 +68,7 @@ def check_diff(input_tensor_host, input_view_numpy, value, index):
 
 
 # Test function to check if changes to the torch views are reflected in the original tensors
+@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32])
 def test_aliasing(dtype: ark.DataType):
     ark.init()
     dimensions = [4, 4]
@@ -91,3 +101,28 @@ def test_aliasing(dtype: ark.DataType):
 
     runtime.stop()
     runtime.reset()
+
+
+def test_conversion_torch():
+    if _no_torch:
+        pytest.skip("PyTorch not available")
+
+    dimensions = [4, 4]
+
+    ark.init()
+    t = ark.constant(7, dimensions)
+
+    with ark.Runtime() as rt:
+        rt.launch()
+
+        torch_tensor = t.to_torch()
+
+        assert torch_tensor.shape == (4, 4)
+        assert torch_tensor.dtype == torch.float32
+        assert torch_tensor.device.type == "cuda"
+        assert torch.all(torch_tensor == 0)
+
+        rt.run()
+
+        torch_tensor = t.to_torch()
+        assert torch.all(torch_tensor == 7)

From 20c23f34b17ecfa24d96ffa8799c3c173b468c53 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 24 Jun 2024 23:58:59 +0000
Subject: [PATCH 12/79] lint

---
 ark/api/executor.cpp | 46 +++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 96e53c8cf..ae3e5f499 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -158,8 +158,8 @@ class Executor::Impl {
 
     void tensor_read(const Tensor tensor, void *data, size_t bytes,
                      bool is_d2d) const;
-    void tensor_write(const Tensor tensor, const void *data,
-                      size_t bytes, bool is_d2d) const;
+    void tensor_write(const Tensor tensor, const void *data, size_t bytes,
+                      bool is_d2d) const;
     DLDeviceType get_device_type() const;
     DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
 
@@ -733,8 +733,8 @@ void Executor::Impl::barrier() {
     }
 }
 
-void Executor::Impl::tensor_read(const Tensor tensor, void *data,
-                                 size_t bytes, bool is_d2d) const {
+void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes,
+                                 bool is_d2d) const {
     GLOG(gpuSetDevice(gpu_id_));
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
@@ -760,15 +760,15 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data,
         copy_stream_->sync();
         if (!is_d2d) {
             tensor_to_data(tensor_host.data(), static_cast<int8_t *>(data),
-                    tensor.shape(), tensor.strides(), tensor.offsets(),
-                    tensor.data_type().bytes());
+                           tensor.shape(), tensor.strides(), tensor.offsets(),
+                           tensor.data_type().bytes());
             return;
         }
         // TODO: convert data layout on the device directly
         std::vector<int8_t> data_host(bytes);
-        tensor_to_data(tensor_host.data(), data_host.data(),
-                tensor.shape(), tensor.strides(), tensor.offsets(),
-                tensor.data_type().bytes());
+        tensor_to_data(tensor_host.data(), data_host.data(), tensor.shape(),
+                       tensor.strides(), tensor.offsets(),
+                       tensor.data_type().bytes());
         GLOG(gpuMemcpyAsync(data, data_host.data(), bytes,
                             gpuMemcpyHostToDevice, copy_stream_->get()));
     }
@@ -794,22 +794,24 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
     auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice;
     void *dst = buffer_->ref(offset);
     if (tensor.strides() == tensor.shape()) {
-        GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get()));
+        GLOG(
+            gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get()));
     } else {
         std::vector<int8_t> tensor_host(tensor_bytes);
         if (!is_d2d) {
-            data_to_tensor(tensor_host.data(), static_cast<const int8_t *>(data),
-                        tensor.shape(), tensor.strides(), tensor.offsets(),
-                        tensor.data_type().bytes());
+            data_to_tensor(tensor_host.data(),
+                           static_cast<const int8_t *>(data), tensor.shape(),
+                           tensor.strides(), tensor.offsets(),
+                           tensor.data_type().bytes());
         } else {
             // TODO: convert data layout on the device directly
             std::vector<int8_t> tmp(bytes);
-            GLOG(gpuMemcpyAsync(tmp.data(), data, bytes,
-                                gpuMemcpyDeviceToHost, copy_stream_->get()));
+            GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, gpuMemcpyDeviceToHost,
+                                copy_stream_->get()));
             copy_stream_->sync();
-            data_to_tensor(tensor_host.data(), tmp.data(),
-                        tensor.shape(), tensor.strides(), tensor.offsets(),
-                        tensor.data_type().bytes());
+            data_to_tensor(tensor_host.data(), tmp.data(), tensor.shape(),
+                           tensor.strides(), tensor.offsets(),
+                           tensor.data_type().bytes());
         }
         GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes,
                             gpuMemcpyHostToDevice, copy_stream_->get()));
@@ -932,13 +934,13 @@ void Executor::destroy() { impl_.reset(nullptr); }
 
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
-void Executor::tensor_read(const Tensor tensor, void *data,
-                           size_t bytes, bool is_d2d) const {
+void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes,
+                           bool is_d2d) const {
     impl_->tensor_read(tensor, data, bytes, is_d2d);
 }
 
-void Executor::tensor_write(const Tensor tensor, const void *data,
-                            size_t bytes, bool is_d2d) const {
+void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes,
+                            bool is_d2d) const {
     impl_->tensor_write(tensor, data, bytes, is_d2d);
 }
 

From ebe85604cb7249b4e0d7d6c3eed69758c4c6825f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 25 Jun 2024 01:21:42 +0000
Subject: [PATCH 13/79] Seperate DLPack from C++ interfaces

---
 ark/api/executor.cpp         | 127 +++++------------------------------
 ark/include/ark/executor.hpp |   8 +--
 python/executor_py.cpp       |  90 ++++++++++++++++++++++++-
 3 files changed, 106 insertions(+), 119 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index ae3e5f499..ebfa7016d 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -156,12 +156,12 @@ class Executor::Impl {
     float stop(int64_t max_spin_count);
     void barrier();
 
+    uintptr_t tensor_address(const Tensor tensor) const;
+
     void tensor_read(const Tensor tensor, void *data, size_t bytes,
                      bool is_d2d) const;
     void tensor_write(const Tensor tensor, const void *data, size_t bytes,
                       bool is_d2d) const;
-    DLDeviceType get_device_type() const;
-    DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
 
    private:
     void init_communicator();
@@ -733,6 +733,15 @@ void Executor::Impl::barrier() {
     }
 }
 
+uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const {
+    size_t buffer_id = tensor.ref()->buffer()->id();
+    if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
+        ERR(NotFoundError, "Invalid buffer ID: ", buffer_id);
+    }
+    size_t offset = buffer_id_to_offset_.at(buffer_id);
+    return reinterpret_cast<uintptr_t>(buffer_->ref(offset));
+}
+
 void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes,
                                  bool is_d2d) const {
     GLOG(gpuSetDevice(gpu_id_));
@@ -742,13 +751,8 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes,
         ERR(InvalidUsageError, "Destination bytes (", bytes,
             ") mismatches the tensor data bytes (", tensor_data_bytes, ").");
     }
-    size_t buffer_id = tensor.ref()->buffer()->id();
-    if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
-        ERR(NotFoundError, "Invalid buffer ID: ", buffer_id);
-    }
-    size_t offset = buffer_id_to_offset_.at(buffer_id);
     auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost;
-    void *src = buffer_->ref(offset);
+    void *src = reinterpret_cast<void *>(tensor_address(tensor));
     if (tensor.strides() == tensor.shape()) {
         GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_->get()));
     } else {
@@ -784,15 +788,10 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
         ERR(InvalidUsageError, "Source bytes (", bytes,
             ") mismatches the tensor data bytes (", tensor_data_bytes, ").");
     }
-    size_t buffer_id = tensor.ref()->buffer()->id();
-    if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
-        ERR(NotFoundError, "Invalid buffer ID: ", buffer_id);
-    }
-    size_t offset = buffer_id_to_offset_.at(buffer_id);
     size_t tensor_bytes =
         tensor.strides().nelems() * tensor.data_type().bytes();
     auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice;
-    void *dst = buffer_->ref(offset);
+    void *dst = reinterpret_cast<void *>(tensor_address(tensor));
     if (tensor.strides() == tensor.shape()) {
         GLOG(
             gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get()));
@@ -819,94 +818,6 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
     copy_stream_->sync();
 }
 
-DLDeviceType Executor::Impl::get_device_type() const {
-#if defined(ARK_CUDA)
-    return kDLCUDA;
-#elif defined(ARK_ROCM)
-    return kDLROCM;
-#else
-    return kDLCPU;
-#endif
-}
-
-DLDataType get_dl_dtype(const DataType &ark_data_type) {
-    DLDataType dl_data_type;
-    dl_data_type.lanes = 1;
-    if (ark_data_type == FP32) {
-        dl_data_type.code = kDLFloat;
-        dl_data_type.bits = 32;
-    } else if (ark_data_type == FP16) {
-        dl_data_type.code = kDLFloat;
-        dl_data_type.bits = 16;
-    } else if (ark_data_type == BF16) {
-        dl_data_type.code = kDLBfloat;
-        dl_data_type.bits = 16;
-    } else if (ark_data_type == INT32) {
-        dl_data_type.code = kDLInt;
-        dl_data_type.bits = 32;
-    } else if (ark_data_type == UINT32) {
-        dl_data_type.code = kDLUInt;
-        dl_data_type.bits = 32;
-    } else if (ark_data_type == INT8) {
-        dl_data_type.code = kDLInt;
-        dl_data_type.bits = 8;
-    } else if (ark_data_type == UINT8) {
-        dl_data_type.code = kDLUInt;
-        dl_data_type.bits = 8;
-    } else if (ark_data_type == BYTE) {
-        dl_data_type.code = kDLUInt;
-        dl_data_type.bits = 8;
-    } else {
-        ERR(InvalidUsageError, "Unsupported data type");
-    }
-    return dl_data_type;
-}
-
-DLManagedTensor *Executor::Impl::get_dl_tensor(const Tensor &tensor) const {
-    DLTensor dl_tensor;
-    dl_tensor.data =
-        buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id()));
-    size_t offset_in_elements =
-        tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0];
-    dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes();
-    dl_tensor.device.device_type = get_device_type();
-    dl_tensor.device.device_id = static_cast<int32_t>(gpu_id_);
-    dl_tensor.ndim = static_cast<int32_t>(tensor.shape().ndims());
-    dl_tensor.dtype = get_dl_dtype(tensor.data_type());
-
-    dl_tensor.shape =
-        tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
-    dl_tensor.strides =
-        tensor.strides().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
-    auto shape = tensor.shape();
-    if (dl_tensor.shape) {
-        for (int i = 0; i < dl_tensor.ndim; ++i) {
-            dl_tensor.shape[i] = shape[i];
-        }
-    }
-    if (dl_tensor.strides) {
-        dl_tensor.strides[dl_tensor.ndim - 1] = 1;
-        for (int i = dl_tensor.ndim - 2; i >= 0; --i) {
-            dl_tensor.strides[i] =
-                dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1];
-        }
-    }
-    DLManagedTensor *dl_managed_tensor = new DLManagedTensor();
-    dl_managed_tensor->dl_tensor = dl_tensor;
-    dl_managed_tensor->manager_ctx = nullptr;
-    dl_managed_tensor->deleter = [](DLManagedTensor *self) {
-        if (self->dl_tensor.shape) {
-            delete[] self->dl_tensor.shape;
-            self->dl_tensor.shape = nullptr;
-        }
-        if (self->dl_tensor.strides) {
-            delete[] self->dl_tensor.strides;
-            self->dl_tensor.strides = nullptr;
-        }
-    };
-    return dl_managed_tensor;
-}
-
 Executor::Executor(int rank, int world_size, int gpu_id,
                    const std::string &name, const std::string &plan)
     : impl_(std::make_unique<Executor::Impl>(rank, world_size, gpu_id, name,
@@ -934,6 +845,10 @@ void Executor::destroy() { impl_.reset(nullptr); }
 
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
+uintptr_t Executor::tensor_address(const Tensor tensor) const {
+    return impl_->tensor_address(tensor);
+}
+
 void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes,
                            bool is_d2d) const {
     impl_->tensor_read(tensor, data, bytes, is_d2d);
@@ -944,14 +859,6 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes,
     impl_->tensor_write(tensor, data, bytes, is_d2d);
 }
 
-DLDeviceType Executor::get_device_type() const {
-    return impl_->get_device_type();
-}
-
-DLManagedTensor *Executor::get_dl_tensor(const Tensor &tensor) const {
-    return impl_->get_dl_tensor(tensor);
-}
-
 DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id,
                                  const std::string &name)
     : Executor(
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index a5d6f0273..b8cdaf273 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -4,8 +4,6 @@
 #ifndef ARK_EXECUTOR_HPP
 #define ARK_EXECUTOR_HPP
 
-#include <dlpack/dlpack.h>
-
 #include <ark/model_ref.hpp>
 #include <ark/tensor.hpp>
 #include <memory>
@@ -50,6 +48,8 @@ class Executor {
 
     bool destroyed() const;
 
+    uintptr_t tensor_address(const Tensor tensor) const;
+
     template <typename T>
     void tensor_read(const Tensor tensor, std::vector<T> &data) const {
         tensor_read(tensor, reinterpret_cast<void *>(data.data()),
@@ -68,10 +68,6 @@ class Executor {
     void tensor_write(const Tensor tensor, const void *data, size_t bytes,
                       bool is_d2d = false) const;
 
-    DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
-
-    DLDeviceType get_device_type() const;
-
    private:
     class Impl;
     std::unique_ptr<Impl> impl_;
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index b6cf8a7a8..e5ab4f964 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -9,6 +9,7 @@
 #include <ark/executor.hpp>
 #include <ark/model.hpp>
 #include <iostream>
+#include <stdexcept>
 namespace py = pybind11;
 
 static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
@@ -35,9 +36,92 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
     exe->tensor_read(tensor, reinterpret_cast<void *>(address), bytes, is_d2d);
 }
 
-DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) {
-    DLManagedTensor *dl_tensor = exe.get_dl_tensor(tensor);
-    return dl_tensor;
+static DLDataType get_dl_dtype(const ark::DataType &ark_data_type) {
+    DLDataType dl_data_type;
+    dl_data_type.lanes = 1;
+    if (ark_data_type == ark::FP32) {
+        dl_data_type.code = kDLFloat;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == ark::FP16) {
+        dl_data_type.code = kDLFloat;
+        dl_data_type.bits = 16;
+    } else if (ark_data_type == ark::BF16) {
+        dl_data_type.code = kDLBfloat;
+        dl_data_type.bits = 16;
+    } else if (ark_data_type == ark::INT32) {
+        dl_data_type.code = kDLInt;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == ark::UINT32) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == ark::INT8) {
+        dl_data_type.code = kDLInt;
+        dl_data_type.bits = 8;
+    } else if (ark_data_type == ark::UINT8) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 8;
+    } else if (ark_data_type == ark::BYTE) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 8;
+    } else {
+        throw std::runtime_error("unexpected error");
+    }
+    return dl_data_type;
+}
+
+static DLDeviceType get_device_type() {
+#if defined(ARK_CUDA)
+    return kDLCUDA;
+#elif defined(ARK_ROCM)
+    return kDLROCM;
+#else
+    return kDLCPU;
+#endif
+}
+
+static DLManagedTensor *to_dlpack(ark::Executor &exe,
+                                  const ark::Tensor &tensor) {
+    DLTensor dl_tensor;
+    dl_tensor.data = reinterpret_cast<void *>(exe.tensor_address(tensor));
+    size_t offset_in_elements =
+        tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0];
+    dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes();
+    dl_tensor.device.device_type = get_device_type();
+    dl_tensor.device.device_id = static_cast<int32_t>(exe.gpu_id());
+    dl_tensor.ndim = static_cast<int32_t>(tensor.shape().ndims());
+    dl_tensor.dtype = get_dl_dtype(tensor.data_type());
+
+    dl_tensor.shape =
+        tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
+    dl_tensor.strides =
+        tensor.strides().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
+    auto shape = tensor.shape();
+    if (dl_tensor.shape) {
+        for (int i = 0; i < dl_tensor.ndim; ++i) {
+            dl_tensor.shape[i] = shape[i];
+        }
+    }
+    if (dl_tensor.strides) {
+        dl_tensor.strides[dl_tensor.ndim - 1] = 1;
+        for (int i = dl_tensor.ndim - 2; i >= 0; --i) {
+            dl_tensor.strides[i] =
+                dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1];
+        }
+    }
+    DLManagedTensor *dl_managed_tensor = new DLManagedTensor();
+    dl_managed_tensor->dl_tensor = dl_tensor;
+    dl_managed_tensor->manager_ctx = nullptr;
+    dl_managed_tensor->deleter = [](DLManagedTensor *self) {
+        if (self->dl_tensor.shape) {
+            delete[] self->dl_tensor.shape;
+            self->dl_tensor.shape = nullptr;
+        }
+        if (self->dl_tensor.strides) {
+            delete[] self->dl_tensor.strides;
+            self->dl_tensor.strides = nullptr;
+        }
+    };
+    return dl_managed_tensor;
 }
 
 void free_capsule(PyObject *capsule) {

From 08c9b899c22b759a6f4f194b7932f48d08eeb8f4 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 25 Jun 2024 01:30:50 +0000
Subject: [PATCH 14/79] Update workflow trigger

---
 .github/workflows/ut-cuda.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml
index 5a78818ff..918c1a4a8 100644
--- a/.github/workflows/ut-cuda.yml
+++ b/.github/workflows/ut-cuda.yml
@@ -7,8 +7,7 @@ on:
   pull_request:
     branches:
       - main
-    types:
-      - ready_for_review
+    types: [opened, synchronize, reopened, ready_for_review]
 
 jobs:
   UnitTest:

From 1fa08afa36010116cdcd6d89e64db104f3fa23d1 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 25 Jun 2024 20:53:29 +0000
Subject: [PATCH 15/79] expose exceptions

---
 ark/api/dims.cpp                |  1 -
 ark/include/ark.hpp             |  1 +
 ark/{ => include/ark}/error.hpp | 15 ++++++++++-----
 ark/logging.h                   |  2 +-
 python/ark/__init__.py          | 12 ++++++++++++
 python/ark/error.py             | 12 ++++++++++++
 python/ark_py.cpp               |  2 ++
 python/error_py.cpp             | 25 +++++++++++++++++++++++++
 python/unittest/test_error.py   | 12 ++++++++++++
 9 files changed, 75 insertions(+), 7 deletions(-)
 rename ark/{ => include/ark}/error.hpp (70%)
 create mode 100644 python/ark/error.py
 create mode 100644 python/error_py.cpp
 create mode 100644 python/unittest/test_error.py

diff --git a/ark/api/dims.cpp b/ark/api/dims.cpp
index a2830a060..a1f03b426 100644
--- a/ark/api/dims.cpp
+++ b/ark/api/dims.cpp
@@ -5,7 +5,6 @@
 
 #include <vector>
 
-#include "error.hpp"
 #include "logging.h"
 
 namespace ark {
diff --git a/ark/include/ark.hpp b/ark/include/ark.hpp
index a7b2f7f70..2ca796172 100644
--- a/ark/include/ark.hpp
+++ b/ark/include/ark.hpp
@@ -10,6 +10,7 @@
 
 #include <ark/data_type.hpp>
 #include <ark/dims.hpp>
+#include <ark/error.hpp>
 #include <ark/executor.hpp>
 #include <ark/init.hpp>
 #include <ark/model_graph.hpp>
diff --git a/ark/error.hpp b/ark/include/ark/error.hpp
similarity index 70%
rename from ark/error.hpp
rename to ark/include/ark/error.hpp
index e08acd975..78d02cab3 100644
--- a/ark/error.hpp
+++ b/ark/include/ark/error.hpp
@@ -1,17 +1,21 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_ERROR_HPP_
-#define ARK_ERROR_HPP_
+#ifndef ARK_ERROR_HPP
+#define ARK_ERROR_HPP
 
 #include <stdexcept>
 #include <string>
 
 namespace ark {
 
-class BaseError : public std::runtime_error {
+class BaseError : public std::exception {
+   private:
+    std::string msg_;
+
    public:
-    BaseError(const std::string &msg) : std::runtime_error(msg) {}
+    BaseError(const std::string &msg) : msg_(msg) {}
+    const char *what() const noexcept override { return msg_.c_str(); }
 };
 
 #define REGISTER_ERROR_TYPE(_name)                        \
@@ -20,6 +24,7 @@ class BaseError : public std::runtime_error {
         _name(const std::string &msg) : BaseError(msg) {} \
     };
 
+REGISTER_ERROR_TYPE(InternalError)
 REGISTER_ERROR_TYPE(InvalidUsageError)
 REGISTER_ERROR_TYPE(NotFoundError)
 REGISTER_ERROR_TYPE(ModelError)
@@ -32,4 +37,4 @@ REGISTER_ERROR_TYPE(UnitTestError)
 
 }  // namespace ark
 
-#endif  // ARK_ERROR_HPP_
+#endif  // ARK_ERROR_HPP
diff --git a/ark/logging.h b/ark/logging.h
index d29793ff7..6eb8aaf91 100644
--- a/ark/logging.h
+++ b/ark/logging.h
@@ -8,7 +8,7 @@
 #include <sstream>
 #include <string>
 
-#include "error.hpp"
+#include "ark/error.hpp"
 
 namespace ark {
 
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index 2a4d164e4..3d162c3e4 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -91,3 +91,15 @@ def set_world_size(world_size):
     ones,
     zeros,
 )
+from .error import (
+    InternalError,
+    InvalidUsageError,
+    NotFoundError,
+    ModelError,
+    SchedulerError,
+    ExecutorError,
+    SystemError,
+    GpuError,
+    RuntimeError,
+)
+
diff --git a/python/ark/error.py b/python/ark/error.py
new file mode 100644
index 000000000..d3ac3aee8
--- /dev/null
+++ b/python/ark/error.py
@@ -0,0 +1,12 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from _ark_core import _InternalError as InternalError
+from _ark_core import _InvalidUsageError as InvalidUsageError
+from _ark_core import _NotFoundError as NotFoundError
+from _ark_core import _ModelError as ModelError
+from _ark_core import _SchedulerError as SchedulerError
+from _ark_core import _ExecutorError as ExecutorError
+from _ark_core import _SystemError as SystemError
+from _ark_core import _GpuError as GpuError
+from _ark_core import _RuntimeError as RuntimeError
diff --git a/python/ark_py.cpp b/python/ark_py.cpp
index 35c3b21c3..1bc4255d6 100644
--- a/python/ark_py.cpp
+++ b/python/ark_py.cpp
@@ -9,6 +9,7 @@ namespace py = pybind11;
 
 extern void register_data_type(py::module &m);
 extern void register_dims(py::module &m);
+extern void register_error(py::module &m);
 extern void register_executor(py::module &m);
 extern void register_init(py::module &m);
 extern void register_model_graph(py::module &m);
@@ -23,6 +24,7 @@ PYBIND11_MODULE(_ark_core, m) {
 
     register_data_type(m);
     register_dims(m);
+    register_error(m);
     register_executor(m);
     register_init(m);
     register_model_graph(m);
diff --git a/python/error_py.cpp b/python/error_py.cpp
new file mode 100644
index 000000000..863d8423d
--- /dev/null
+++ b/python/error_py.cpp
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <pybind11/operators.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <ark/error.hpp>
+
+namespace py = pybind11;
+
+#define REGISTER_ERROR_PY(_name) \
+    py::register_exception<ark::_name>(m, "_" #_name)
+
+void register_error(py::module &m) {
+    REGISTER_ERROR_PY(InternalError);
+    REGISTER_ERROR_PY(InvalidUsageError);
+    REGISTER_ERROR_PY(NotFoundError);
+    REGISTER_ERROR_PY(ModelError);
+    REGISTER_ERROR_PY(SchedulerError);
+    REGISTER_ERROR_PY(ExecutorError);
+    REGISTER_ERROR_PY(SystemError);
+    REGISTER_ERROR_PY(GpuError);
+    REGISTER_ERROR_PY(RuntimeError);
+}
diff --git a/python/unittest/test_error.py b/python/unittest/test_error.py
new file mode 100644
index 000000000..c063c05c5
--- /dev/null
+++ b/python/unittest/test_error.py
@@ -0,0 +1,12 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import ark
+
+
+def test_error():
+    ark.init()
+    try:
+        ark.tensor([0])
+    except Exception as e:
+        assert isinstance(e, ark.InvalidUsageError)

From 59caff1eddb0a01c4f7bdf6e082b96d22e10ad6e Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 26 Jun 2024 23:25:35 +0000
Subject: [PATCH 16/79] Build python module by default

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ee1e3566e..9ba2f2c55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -17,6 +17,7 @@ option(USE_CUDA "Use NVIDIA/CUDA." OFF)
 option(USE_ROCM "Use AMD/ROCm." OFF)
 option(BYPASS_GPU_CHECK "Bypass GPU check." OFF)
 option(BUILD_TESTS "Build unit tests." ON)
+option(BUILD_PYTHON "Build Python module." ON)
 
 if(BYPASS_GPU_CHECK)
     if(USE_CUDA)

From efb2c78145cab0832971205911320264bbe74870 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 29 Jun 2024 03:51:19 +0000
Subject: [PATCH 17/79] revert

---
 ark/include/kernels/kernel_template.in | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in
index 876e6a1b4..ea1862920 100644
--- a/ark/include/kernels/kernel_template.in
+++ b/ark/include/kernels/kernel_template.in
@@ -64,5 +64,6 @@ void @NAME@(char *_buf, int *_iter) {
     if (threadIdx.x == 0 && blockIdx.x == 0) {
       atomicStoreRelaxed(_iter, 0);
     }
+    sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
   }
 }

From 8975f9d4a0574f0421e79f6dd49e7443e7244606 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 29 Jun 2024 04:03:20 +0000
Subject: [PATCH 18/79] Do not use `sys.path` for importing `_ark_core`

---
 python/ark/__init__.py |  5 +----
 python/ark/error.py    | 18 +++++++++---------
 python/ark/init.py     |  2 +-
 python/ark/model.py    |  2 +-
 python/ark/runtime.py  |  2 +-
 python/ark/tensor.py   |  2 +-
 6 files changed, 14 insertions(+), 17 deletions(-)

diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index 3d162c3e4..031afc7ba 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -7,9 +7,7 @@
 if os.environ.get("ARK_ROOT", None) is None:
     os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__))
 
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-
-import _ark_core
+from . import _ark_core
 from .model import Model
 
 
@@ -102,4 +100,3 @@ def set_world_size(world_size):
     GpuError,
     RuntimeError,
 )
-
diff --git a/python/ark/error.py b/python/ark/error.py
index d3ac3aee8..40f7391ac 100644
--- a/python/ark/error.py
+++ b/python/ark/error.py
@@ -1,12 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from _ark_core import _InternalError as InternalError
-from _ark_core import _InvalidUsageError as InvalidUsageError
-from _ark_core import _NotFoundError as NotFoundError
-from _ark_core import _ModelError as ModelError
-from _ark_core import _SchedulerError as SchedulerError
-from _ark_core import _ExecutorError as ExecutorError
-from _ark_core import _SystemError as SystemError
-from _ark_core import _GpuError as GpuError
-from _ark_core import _RuntimeError as RuntimeError
+from ._ark_core import _InternalError as InternalError
+from ._ark_core import _InvalidUsageError as InvalidUsageError
+from ._ark_core import _NotFoundError as NotFoundError
+from ._ark_core import _ModelError as ModelError
+from ._ark_core import _SchedulerError as SchedulerError
+from ._ark_core import _ExecutorError as ExecutorError
+from ._ark_core import _SystemError as SystemError
+from ._ark_core import _GpuError as GpuError
+from ._ark_core import _RuntimeError as RuntimeError
diff --git a/python/ark/init.py b/python/ark/init.py
index dbf7c1569..32f530791 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import _ark_core
+from . import _ark_core
 from .model import Model
 from .runtime import _RuntimeState
 
diff --git a/python/ark/model.py b/python/ark/model.py
index e6208fc16..87af88f49 100644
--- a/python/ark/model.py
+++ b/python/ark/model.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 from typing import NewType
-from _ark_core import _Model
+from ._ark_core import _Model
 
 _ModelState = NewType("_ModelState", None)
 
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 798eaf9d5..efae6ab3c 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -5,7 +5,7 @@
 from enum import Enum
 from typing import Callable, Dict, List
 
-from _ark_core import _Executor, _DefaultPlanner
+from ._ark_core import _Executor, _DefaultPlanner
 from .model import Model
 
 
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index eff1bf20e..ac2886960 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -4,7 +4,7 @@
 import numpy as np
 from typing import Callable, List, Union, Type
 
-from _ark_core import _Dims, _Tensor, _NullTensor
+from ._ark_core import _Dims, _Tensor, _NullTensor
 from .data_type import DataType
 from .runtime import Runtime
 from .model import Model

From 153837ba60497413d70c90fed945eaa037c84a29 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 2 Jul 2024 04:09:10 +0000
Subject: [PATCH 19/79] wip

---
 ark/api/executor.cpp                   |   51 +-
 ark/codegen.cpp                        |    3 +-
 ark/include/ark/executor.hpp           |    7 +-
 ark/include/kernels/common/broadcast.h |    4 +-
 ark/model/model_json.cpp               |   11 +-
 ark/model/model_json.hpp               |    2 +-
 ark/model/model_op.cpp                 |    5 +-
 ark/ops/ops_all_reduce_test.cpp        |    2 +-
 ark/ops/ops_arithmetic_test.cpp        |   48 +-
 ark/ops/ops_embedding_test.cpp         |    2 +-
 ark/ops/ops_matmul.cpp                 |   30 +-
 ark/ops/ops_test_common.cpp            |   10 +-
 ark/ops/ops_test_common.hpp            |    6 +-
 examples/llama/README.md               |    4 +-
 examples/llama/model_test.py           |   88 +-
 plan_gpu0.json                         | 2504 ++++++++++++++++++++++++
 python/ark/__init__.py                 |    1 +
 python/ark/profiler.py                 |   30 +
 python/executor_py.cpp                 |    1 +
 19 files changed, 2706 insertions(+), 103 deletions(-)
 create mode 100644 plan_gpu0.json
 create mode 100644 python/ark/profiler.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index b052040ef..4af9df7c0 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -143,11 +143,13 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 
 class Executor::Impl {
    public:
-    Impl(int rank, int world_size, int gpu_id, const std::string &name,
-         const std::string &plan);
+    Impl(int rank, int world_size, int gpu_id, const std::string &name);
     ~Impl() = default;
 
+    void init(const std::string &plan);
+
     int gpu_id() const { return gpu_id_; }
+    std::string plan() const { return plan_json_.dump_pretty(); }
 
     void compile();
     void launch(int64_t max_spin_count);
@@ -173,11 +175,13 @@ class Executor::Impl {
     const int rank_;
     const int world_size_;
     int gpu_id_;
+    std::string name_;
 
     bool is_launched_ = false;
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
 
+    PlanJson plan_json_;
     std::map<size_t, size_t> buffer_id_to_offset_;
     size_t total_bytes_;
     std::shared_ptr<CodeGenerator> codegen_;
@@ -199,8 +203,8 @@ class Executor::Impl {
 };
 
 Executor::Impl::Impl(int rank, int world_size, int gpu_id,
-                     const std::string &name, const std::string &plan)
-    : rank_(rank), world_size_(world_size), gpu_id_(gpu_id) {
+                     const std::string &name)
+    : rank_(rank), world_size_(world_size), gpu_id_(gpu_id), name_(name) {
     if (rank < 0 || rank >= world_size) {
         ERR(InvalidUsageError, "Invalid rank ", rank, " with world size ",
             world_size);
@@ -211,17 +215,18 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id,
     if (world_size_ > 1) {
         init_communicator();
     }
+}
 
-    Json plan_json;
+void Executor::Impl::init(const std::string &plan) {
     auto &plan_path = get_env().enforce_plan_path;
     if (!plan_path.empty()) {
         LOG(INFO, "Enforce executor plan path: ", plan_path);
-        plan_json = Json::parse(read_file(plan_path));
+        plan_json_ = Json::parse(read_file(plan_path));
     } else {
-        plan_json = Json::parse(plan);
+        plan_json_ = Json::parse(plan);
     }
 
-    buffer_id_to_offset_ = init_buffers(plan_json);
+    buffer_id_to_offset_ = init_buffers(plan_json_);
 
     std::string buffer_id_to_offset_str;
     for (const auto &kv : buffer_id_to_offset_) {
@@ -230,7 +235,7 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id,
     }
 
     codegen_ =
-        std::make_shared<CodeGenerator>(plan_json, buffer_id_to_offset_, name);
+        std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_, name_);
 
     auto gpu_manager = GpuManager::get_instance(gpu_id_);
     timer_begin_ = gpu_manager->create_event();
@@ -249,13 +254,13 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id,
         static_cast<size_t>(gpu_manager->info().smem_block_total);
 
     if (world_size_ > 1) {
-        auto remote_ranks = init_remote_ranks(plan_json);
+        auto remote_ranks = init_remote_ranks(plan_json_);
         init_channels(remote_ranks);
     }
 
     kernel_ = std::shared_ptr<GpuKernel>(new GpuKernel(
         gpu_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1},
-        std::max(smem_block_total, size_t(4)), name,
+        std::max(smem_block_total, size_t(4)), name_,
         {std::pair<void *, size_t>{buffer_->ref(), sizeof(buffer_->ref())},
          std::pair<void *, size_t>{flag, sizeof(flag)}}));
 }
@@ -812,13 +817,18 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
 
 Executor::Executor(int rank, int world_size, int gpu_id,
                    const std::string &name, const std::string &plan)
-    : impl_(std::make_unique<Executor::Impl>(rank, world_size, gpu_id, name,
-                                             plan)) {}
+    : impl_(std::make_unique<Executor::Impl>(rank, world_size, gpu_id, name)) {
+    if (!plan.empty()) {
+        impl_->init(plan);
+    }
+}
 
 Executor::~Executor() = default;
 
 int Executor::gpu_id() const { return impl_->gpu_id(); }
 
+std::string Executor::plan() const { return impl_->plan(); }
+
 void Executor::compile() { impl_->compile(); }
 
 void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); }
@@ -852,14 +862,17 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes,
 }
 
 DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id,
-                                 const std::string &name)
+                                 const std::vector<DefaultPlanner::ConfigRule>& config_rules,
+                                 const std::string& name)
     : Executor(
           model.rank(), model.world_size(),
           (gpu_id < 0) ? (model.rank() % get_env().num_ranks_per_host) : gpu_id,
-          name,
-          DefaultPlanner(model, (gpu_id < 0) ? (model.rank() %
-                                                get_env().num_ranks_per_host)
-                                             : gpu_id)
-              .plan()) {}
+          name, "") {
+    DefaultPlanner planner(model, impl_->gpu_id());
+    for (const auto &rule : config_rules) {
+        planner.install_config_rule(rule);
+    }
+    impl_->init(planner.plan());
+}
 
 }  // namespace ark
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index cd6206284..09ff28dd3 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -305,7 +305,8 @@ std::string CodeGenerator::Impl::resource_group(
             n_slots = total_warps / num_warps_per_task;
         }
         if (n_slots == 0) {
-            ERR(SchedulerError, "not enough resources for task group");
+            ERR(SchedulerError, "not enough resources for task group: ",
+                tg.dump());
         }
 
         size_t task_b = *task_range.begin();
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index b8cdaf273..2473e1b14 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -5,6 +5,7 @@
 #define ARK_EXECUTOR_HPP
 
 #include <ark/model_ref.hpp>
+#include <ark/planner.hpp>
 #include <ark/tensor.hpp>
 #include <memory>
 #include <string>
@@ -24,6 +25,9 @@ class Executor {
     /// Return the GPU ID.
     int gpu_id() const;
 
+    /// Return the plan string.
+    std::string plan() const;
+
     /// Compile the model. This must be called before `launch()`.
     void compile();
 
@@ -68,7 +72,7 @@ class Executor {
     void tensor_write(const Tensor tensor, const void *data, size_t bytes,
                       bool is_d2d = false) const;
 
-   private:
+   protected:
     class Impl;
     std::unique_ptr<Impl> impl_;
 };
@@ -78,6 +82,7 @@ class Model;
 class DefaultExecutor : public Executor {
    public:
     DefaultExecutor(const Model &model, int gpu_id = -1,
+                    const std::vector<DefaultPlanner::ConfigRule>& config_rules = {},
                     const std::string &name = "DefaultExecutor");
 };
 
diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h
index 97b12e004..858938613 100644
--- a/ark/include/kernels/common/broadcast.h
+++ b/ark/include/kernels/common/broadcast.h
@@ -186,9 +186,9 @@ struct Broadcast2Intrinsic {
         (BroadcastInput0 && BroadcastInput1)
             ? OutNelemPerThread
             : BroadcastInput0
-                  ? math::gcd<OutNelemPerThread, In0NelemPerThread>::value
+                  ? math::gcd<OutNelemPerThread, In1NelemPerThread>::value
                   : BroadcastInput1
-                        ? math::gcd<OutNelemPerThread, In1NelemPerThread>::value
+                        ? math::gcd<OutNelemPerThread, In0NelemPerThread>::value
                         : math::gcd<OutNelemPerThread,
                                     math::gcd<In0NelemPerThread,
                                               In1NelemPerThread>::value>::value;
diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp
index 0057ef0aa..97ce71967 100644
--- a/ark/model/model_json.cpp
+++ b/ark/model/model_json.cpp
@@ -272,7 +272,16 @@ static void verify_format_plan(const Json &json) {
     }
 }
 
-PlanJson::PlanJson(const Json &json) : Json(json) { verify_format_plan(*this); }
+PlanJson::PlanJson(const Json &json)
+    : Json((json != nullptr) ? json
+                             : Json{{"Rank", 0},
+                                    {"WorldSize", 1},
+                                    {"NumProcessors", 1},
+                                    {"NumWarpsPerProcessor", 1},
+                                    {"TaskInfos", Json::array()},
+                                    {"ProcessorGroups", Json::array()}}) {
+    verify_format_plan(*this);
+}
 
 static std::stringstream &dump_pretty_plan(const Json &json,
                                            std::stringstream &ss, int indent,
diff --git a/ark/model/model_json.hpp b/ark/model/model_json.hpp
index cf5fbbce2..e42640a9a 100644
--- a/ark/model/model_json.hpp
+++ b/ark/model/model_json.hpp
@@ -18,7 +18,7 @@ class ModelJson : public Json {
 
 class PlanJson : public Json {
    public:
-    PlanJson(const Json &json);
+    PlanJson(const Json &json = nullptr);
     std::string dump_pretty(int indent = 0, int indent_step = 2) const;
 };
 
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index 6cdba5d02..b5a0645c8 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -202,8 +202,11 @@ std::shared_ptr<ModelOp> ModelOp::deserialize(const Json &serialized) {
     } else if (!serialized.contains("Args")) {
         ERR(InvalidUsageError, "ModelOp deserialization failed: missing Args");
     }
+    // Run `ModelOpT::from_name` before `construct()` to ensure all operators
+    // are registered.
+    auto op_type = ModelOpT::from_name(serialized["Type"]);
     auto ret = model_op_factory()->construct(serialized["Type"]);
-    ret->type_ = ModelOpT::from_name(serialized["Type"]);
+    ret->type_ = op_type;
     ret->name_ = serialized["Name"];
     ret->is_virtual_ = serialized["IsVirtual"];
     for (const auto &t : serialized["ReadTensors"]) {
diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp
index 9e2c6f675..54c6426fa 100644
--- a/ark/ops/ops_all_reduce_test.cpp
+++ b/ark/ops/ops_all_reduce_test.cpp
@@ -94,7 +94,7 @@ void test_all_reduce_internal(ark::DimType nelem) {
             auto result =
                 ark::op_test("all_reduce", m, {ones}, {output},
                              baseline_all_reduce<ark::half_t, NumGpus>,
-                             {ones_vec.data()}, false, gpu_id, NumGpus);
+                             {ones_vec.data()});
             UNITTEST_LOG(result);
             UNITTEST_EQ(result.max_diff[0], 0.0f);
             return ark::unittest::SUCCESS;
diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp
index 3fdc5ac7e..c7c18b603 100644
--- a/ark/ops/ops_arithmetic_test.cpp
+++ b/ark/ops/ops_arithmetic_test.cpp
@@ -2,6 +2,7 @@
 // Licensed under the MIT license.
 
 #include "ops_test_common.hpp"
+#include "model/model_json.hpp"
 
 template <typename T>
 void baseline_add(std::vector<void *> &outputs,
@@ -142,12 +143,25 @@ ark::unittest::State test_add_fp32() {
 
 ark::unittest::State test_add_fp16() {
     ark::Model m;
-    ark::Tensor t0 = m.tensor({8192}, ark::FP16);
-    ark::Tensor t1 = m.tensor({8192}, ark::FP16);
+    ark::Tensor t0 = m.tensor({32, 2048, 2048}, ark::FP16);
+    ark::Tensor t1 = m.tensor({32, 2048, 2048}, ark::FP16);
     ark::Tensor out = m.add(t0, t1);
 
     auto result =
-        ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add<ark::half_t>);
+        ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add<ark::half_t>, {},
+        {
+            ark::DefaultPlanner::ConfigRule([](const std::string op_str, const std::string) {
+                auto op = ark::Json::parse(op_str);
+                ark::Json config;
+                if (op.at("Type") == "Add") {
+                    config["NumWarps"] = 4;
+                    config["SramBytes"] = 0;
+                    config["Tile"] = {128, 256};
+                    config["NumTasks"] = 4096;
+                }
+                return config.dump();
+            })
+        });
     UNITTEST_LOG(result);
     UNITTEST_EQ(result.max_diff[0], 0.0f);
     return ark::unittest::SUCCESS;
@@ -416,20 +430,20 @@ ark::unittest::State test_div_invalid() {
 
 int main() {
     ark::init();
-    UNITTEST(test_add_fp32);
+    // UNITTEST(test_add_fp32);
     UNITTEST(test_add_fp16);
-    UNITTEST(test_add_bf16);
-    UNITTEST(test_add_overwrite);
-    UNITTEST(test_add_broadcast);
-    UNITTEST(test_add_invalid);
-    UNITTEST(test_sub_fp32);
-    UNITTEST(test_sub_invalid);
-    UNITTEST(test_mul_fp32);
-    UNITTEST(test_mul_fp16);
-    UNITTEST(test_mul_overwrite);
-    UNITTEST(test_mul_broadcast);
-    UNITTEST(test_mul_invalid);
-    UNITTEST(test_div_fp32);
-    UNITTEST(test_div_invalid);
+    // UNITTEST(test_add_bf16);
+    // UNITTEST(test_add_overwrite);
+    // UNITTEST(test_add_broadcast);
+    // UNITTEST(test_add_invalid);
+    // UNITTEST(test_sub_fp32);
+    // UNITTEST(test_sub_invalid);
+    // UNITTEST(test_mul_fp32);
+    // UNITTEST(test_mul_fp16);
+    // UNITTEST(test_mul_overwrite);
+    // UNITTEST(test_mul_broadcast);
+    // UNITTEST(test_mul_invalid);
+    // UNITTEST(test_div_fp32);
+    // UNITTEST(test_div_invalid);
     return ark::unittest::SUCCESS;
 }
diff --git a/ark/ops/ops_embedding_test.cpp b/ark/ops/ops_embedding_test.cpp
index 822973106..4f9df046a 100644
--- a/ark/ops/ops_embedding_test.cpp
+++ b/ark/ops/ops_embedding_test.cpp
@@ -80,7 +80,7 @@ ark::unittest::State test_embedding() {
     }
     auto result = ark::op_test("embedding_" + type_str, m, {ti, tw}, {to},
                                baseline_embedding<T>,
-                               {ti_data.data(), tw_data.data()}, true);
+                               {ti_data.data(), tw_data.data()});
     UNITTEST_LOG(result);
     UNITTEST_EQ(result.max_diff[0], 0.0f);
     return ark::unittest::SUCCESS;
diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp
index b259f99c8..b4553a4ed 100644
--- a/ark/ops/ops_matmul.cpp
+++ b/ark/ops/ops_matmul.cpp
@@ -189,45 +189,55 @@ std::vector<ModelOpArg> ModelOpMatmul::impl_args([
 }
 
 static const Json get_default_config(const ArchRef arch,
-                                     const ModelDataType &data_type) {
+                                     const ModelDataType &data_type,
+                                     const Dims &mnk) {
+    if (data_type != FP32.ref() && data_type != FP16.ref() &&
+        data_type != BF16.ref()) {
+        ERR(InvalidUsageError,
+            "Unsupported data type: ", data_type->type_name());
+    }
+    if (!arch->belongs_to(ARCH_CUDA) && !arch->belongs_to(ARCH_ROCM)) {
+        ERR(InvalidUsageError, "Unsupported architecture: ", arch->name());
+    }
+    DimType tm = (mnk[0] > mnk[1]) ? 256 : 128;
+    DimType tn = (mnk[0] > mnk[1]) ? 128 : 256;
     if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP32.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP16.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 64}}};
+                {"TileShapeMNK", {tm, tn, 64}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == BF16.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 64}}};
+                {"TileShapeMNK", {tm, tn, 64}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP32.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 16}}};
+                {"TileShapeMNK", {tm, tn, 16}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP16.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     }
-    ERR(InvalidUsageError, "Unsupported arch and data type: ", arch->name(),
-        " and ", data_type->type_name());
+    ERR(InternalError, "Unexpected error");
     return {};
 }
 
 Json ModelOpMatmul::default_config(const ArchRef arch) const {
     auto result = result_tensors_[0];
-    Json config = get_default_config(arch, result->data_type());
     check_fields_args(args_, {"TransposeInput", "TransposeOther"});
     Dims mnk = calc_problem_size(read_tensors_[0]->padded_shape(),
                                  read_tensors_[1]->padded_shape(),
                                  args_.at("TransposeInput").value<bool>(),
                                  args_.at("TransposeOther").value<bool>());
+    Json config = get_default_config(arch, result->data_type(), mnk);
     size_t tile_x = config.at("TileShapeMNK")[0];
     size_t tile_y = config.at("TileShapeMNK")[1];
     if (mnk[0] % tile_x != 0 || mnk[1] % tile_y != 0) {
diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp
index 50317fba7..ad2c208b6 100644
--- a/ark/ops/ops_test_common.cpp
+++ b/ark/ops/ops_test_common.cpp
@@ -36,8 +36,9 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
                       const std::vector<Tensor> &outputs,
                       OpsTestBaseline baseline,
                       const std::vector<void *> &inputs_data,
-                      bool print_on_error, int rank, int world_size) {
-    DefaultExecutor exe(model);
+                      const std::vector<DefaultPlanner::ConfigRule>& config_rules,
+                      bool print_on_error) {
+    DefaultExecutor exe(model, -1, config_rules);
     exe.compile();
 
     std::vector<std::shared_ptr<std::vector<char>>> inputs_data_storages;
@@ -133,7 +134,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
     for (auto t : gt) {
         gt_ptrs.push_back(t->data());
     }
-    baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, rank);
+    baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, model.rank());
 
     std::stringstream test_name;
     test_name << test_name_prefix;
@@ -147,6 +148,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
 
     OpsTestResult result;
     result.test_name = test_name.str();
+    result.plan = exe.plan();
 
     // Compare results with the ground truth.
     for (size_t i = 0; i < outputs.size(); i++) {
@@ -187,7 +189,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
     GLOG(gpuDeviceSynchronize());
 
     // Throughput test.
-    if (world_size > 1) {
+    if (model.world_size() > 1) {
         // For multi-GPU, we need to make sure that all GPUs run the same
         // number of iterations. Rather than doing allgather, we just
         // use a magic number here.
diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp
index 01e97dbb1..a32d9b748 100644
--- a/ark/ops/ops_test_common.hpp
+++ b/ark/ops/ops_test_common.hpp
@@ -10,6 +10,7 @@
 
 #include "ark/model.hpp"
 #include "ark/model_ref.hpp"
+#include "ark/planner.hpp"
 #include "ark/random.hpp"
 #include "bfloat16.h"
 #include "half.h"
@@ -133,6 +134,7 @@ TensorCompareResult tensor_compare(T *ground_truth, T *res, Dims shape,
 
 struct OpsTestResult {
     std::string test_name;
+    std::string plan;
     int iter;
     float msec_per_iter;
     std::vector<float> mse;
@@ -170,8 +172,8 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
                       const std::vector<Tensor> &outputs,
                       OpsTestBaseline baseline,
                       const std::vector<void *> &inputs_data = {},
-                      bool print_on_error = false, int rank = 0,
-                      int world_size = 1);
+                      const std::vector<DefaultPlanner::ConfigRule>& config_rules = {},
+                      bool print_on_error = false);
 
 OpsTestGpuMem to_gpu(void *host_ptr, size_t size);
 
diff --git a/examples/llama/README.md b/examples/llama/README.md
index 090dd1de3..1fe040ae0 100644
--- a/examples/llama/README.md
+++ b/examples/llama/README.md
@@ -29,10 +29,10 @@ Llama2 examples over ARK.
 4. Download Llama2 model weights and tokenizer weights.
     * The model and tokenizer should be compatible with the [official PyTorch implementation](https://github.com/facebookresearch/llama/blob/main/llama).
 
-5. Run the model accuracy test. `--pth_path` is the path to the model weights file (`consolidated.00.pth`).
+5. Run the model accuracy test. `--ckpt_dir` is the directory where the model weight files are at (e.g., `consolidated.00.pth`).
 
     ```bash
-    python3 model_test.py --pth_path=/path/to/model/weights.pth
+    python3 model_test.py --ckpt_dir=/directory/of/model/weights
     ```
 
 6. Test text generation. `--pth_path` is the path to the model weights file (`consolidated.00.pth`), `--tok_path` is the path to the tokenizer weights file (`tokenizer.model`), and `--params_path` is the path to the model parameters (`params.json`).
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py
index 737d3ec8b..585341640 100644
--- a/examples/llama/model_test.py
+++ b/examples/llama/model_test.py
@@ -58,30 +58,34 @@ def run_ark(
     ]
     output = module(*module_inputs)
 
-    runtime = ark.Runtime()
-    # Prefer num_warps_per_sm = 16 for nvidia and 8 for amd
-    runtime.launch(num_warps_per_sm=8)
+    with ark.Runtime() as rt:
+        rt.launch(plan_path="/mnt/changhohwang/ark/plan_gpu0.json")
 
-    # Load model parameters
-    if state_dict:
-        module.load_state_dict(state_dict)
+        # Load model parameters
+        if state_dict:
+            print("Loading state_dict")
+            module.load_state_dict(state_dict)
+            print("Loading state_dict done")
 
-    # Load input data into tensors
-    tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)]
-    tensor_data = [i for i in inputs if isinstance(i, np.ndarray)]
-    for tensor, ndarray in zip(tensors, tensor_data):
-        tensor.from_numpy(ndarray)
+        # Load input data into tensors
+        tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)]
+        tensor_data = [i for i in inputs if isinstance(i, np.ndarray)]
+        for tensor, ndarray in zip(tensors, tensor_data):
+            tensor.from_numpy(ndarray)
 
-    start_time = time.time()
+        start_time = time.time()
 
-    # Run the model
-    runtime.run(iter=iterations)
+        # Run the model
+        print("Run:", iterations)
 
-    end_time = time.time()
+        rt.run(iter=iterations)
+        print("Run done")
 
-    if isinstance(output, list) or isinstance(output, tuple):
-        outputs = [o.to_numpy() for o in output]
-    outputs = [output.to_numpy()]
+        end_time = time.time()
+
+        if isinstance(output, list) or isinstance(output, tuple):
+            outputs = [o.to_numpy() for o in output]
+        outputs = [output.to_numpy()]
 
     return RunResults(outputs=outputs, runtime=end_time - start_time)
 
@@ -160,7 +164,9 @@ def test_module(
         else:
             prefix = module_name_prefix + "." if module_name_prefix else ""
             # Load the state_dict from the given path
+            print("Loading ckpt:", ckpt_path)
             state_dict_pt = torch.load(ckpt_path)
+            print("Loading ckpt done")
             state_dict_pt = {
                 k[len(prefix) :]: v
                 for k, v in state_dict_pt.items()
@@ -182,6 +188,7 @@ def test_module(
         rank=rank,
         world_size=world_size,
     )
+    print("Run ARK done")
 
     if not test_thru_ark_only:
         # PyTorch module
@@ -195,6 +202,7 @@ def test_module(
             inputs_pt,
             iterations=test_thru_iterations if test_thru else 1,
         )
+        print("Run PyTorch done")
 
         if test_thru:
             print(
@@ -447,26 +455,26 @@ def test_transformer_block(
     )
     output = module(feature_tensor, 0, freqs_cis_ark_tensor, None)
 
-    ark.Model.get_model().create_nodes()
-    print(ark.Model.get_model().serialize())
-
-    # test_module(
-    #     module_class_ark=model_ark.TransformerBlock,
-    #     module_args_ark=[
-    #         0,
-    #         args,
-    #         ark.DataType.from_numpy(dtype),
-    #         rank,
-    #         world_size,
-    #     ],
-    #     inputs_ark=[feature, 0, freqs_cis_ark, None],
-    #     module_class_pt=model_pt.TransformerBlock,
-    #     module_args_pt=[0, args],
-    #     inputs_pt=[feature.astype(dtype), 0, freqs_cis, None],
-    #     module_name_prefix="layers.0",
-    #     rank=rank,
-    #     world_size=world_size,
-    # )
+    # print(ark.Model.get_model().serialize())
+
+    test_module(
+        module_class_ark=model_ark.TransformerBlock,
+        module_args_ark=[
+            0,
+            args,
+            ark.DataType.from_numpy(dtype),
+            rank,
+            world_size,
+        ],
+        inputs_ark=[feature, 0, freqs_cis_ark, None],
+        module_class_pt=model_pt.TransformerBlock,
+        module_args_pt=[0, args],
+        inputs_pt=[feature.astype(dtype), 0, freqs_cis, None],
+        module_name_prefix="layers.0",
+        rank=rank,
+        world_size=world_size,
+        test_thru=True,
+    )
 
 
 def test_transformer(
@@ -570,7 +578,7 @@ def worker(
     # Configurations
     args = ModelArgs7B()
     batch_size = 1
-    seq_len = 512
+    seq_len = 2048
     dtype = np.float16
     world_size = ngpus
 
@@ -578,7 +586,7 @@ def worker(
     args.vocab_size = 32000
 
     # Reduce max_seq_len due to OOM from the PyTorch model
-    args.max_seq_len = 512
+    args.max_seq_len = 2048
 
     # Verify the configurations
     assert batch_size <= args.max_batch_size
diff --git a/plan_gpu0.json b/plan_gpu0.json
new file mode 100644
index 000000000..49b6bdd98
--- /dev/null
+++ b/plan_gpu0.json
@@ -0,0 +1,2504 @@
+{
+  "Rank": 0,
+  "WorldSize": 1,
+  "NumProcessors": 304,
+  "NumWarpsPerProcessor": 4,
+  "TaskInfos": [
+    {
+      "Id": 0,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 1,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rope",
+          "Name": "rope",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [32,128],
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 2,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,128],
+            "NumTasks": 8192
+          }
+        }
+      ]
+    },
+    {
+      "Id": 3,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 4,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rope",
+          "Name": "rope_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [32,128],
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 5,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,3,1]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,128],
+            "NumTasks": 8192
+          }
+        }
+      ]
+    },
+    {
+      "Id": 6,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 7,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,128],
+            "NumTasks": 8192
+          }
+        }
+      ]
+    },
+    {
+      "Id": 8,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":false}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 9,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "ScalarMul",
+          "Name": "mul",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Factor": {"FLOAT":0.0883883461356163}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 10,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceMax",
+          "Name": "reduce_max",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Axis": {"INT":3},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 65536
+          }
+        }
+      ]
+    },
+    {
+      "Id": 11,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Sub",
+          "Name": "sub",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 12,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Exp",
+          "Name": "exp",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 13,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceSum",
+          "Name": "reduce_sum",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Axis": {"INT":3},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 65536
+          }
+        }
+      ]
+    },
+    {
+      "Id": 14,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Div",
+          "Name": "div",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 15,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_4",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":false}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 16,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,128],
+            "NumTasks": 8192
+          }
+        }
+      ]
+    },
+    {
+      "Id": 17,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_5",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 18,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 19,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 20,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceMean",
+          "Name": "reduce_mean",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Axis": {"INT":2},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 21,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rsqrt",
+          "Name": "rsqrt",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [64,1],
+            "NumTasks": 32
+          }
+        }
+      ]
+    },
+    {
+      "Id": 22,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 23,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 24,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 25,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_6",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 26,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rope",
+          "Name": "rope_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 27,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_4",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,128],
+            "NumTasks": 8192
+          }
+        }
+      ]
+    },
+    {
+      "Id": 28,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 29,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rope",
+          "Name": "rope_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 30,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_6",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,3,1]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,8],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 31,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_8",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 32,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_5",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,128],
+            "NumTasks": 8192
+          }
+        }
+      ]
+    },
+    {
+      "Id": 33,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_9",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":false}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 34,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "ScalarMul",
+          "Name": "mul_4",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Factor": {"FLOAT":0.0883883461356163}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 35,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceMax",
+          "Name": "reduce_max_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Axis": {"INT":3},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 65536
+          }
+        }
+      ]
+    },
+    {
+      "Id": 36,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Sub",
+          "Name": "sub_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 37,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Exp",
+          "Name": "exp_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 38,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceSum",
+          "Name": "reduce_sum_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Axis": {"INT":3},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 65536
+          }
+        }
+      ]
+    },
+    {
+      "Id": 39,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Div",
+          "Name": "div_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 40,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_10",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":false}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 41,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,128],
+            "NumTasks": 8192
+          }
+        }
+      ]
+    },
+    {
+      "Id": 42,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_11",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 43,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Add",
+          "Name": "add",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 44,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 45,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_5",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 46,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceMean",
+          "Name": "reduce_mean_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "Axis": {"INT":2},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 47,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rsqrt",
+          "Name": "rsqrt_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [64,1],
+            "NumTasks": 32
+          }
+        }
+      ]
+    },
+    {
+      "Id": 48,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_6",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 49,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 50,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 51,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_12",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 688
+          }
+        }
+      ]
+    },
+    {
+      "Id": 52,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Sigmoid",
+          "Name": "sigmoid",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 688
+          }
+        }
+      ]
+    },
+    {
+      "Id": 53,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_8",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 688
+          }
+        }
+      ]
+    },
+    {
+      "Id": 54,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_13",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 688
+          }
+        }
+      ]
+    },
+    {
+      "Id": 55,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_9",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 688
+          }
+        }
+      ]
+    },
+    {
+      "Id": 56,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_14",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 57,
+      "NumWarps": 4,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Add",
+          "Name": "add_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}},
+            {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "WriteTensors": [
+            {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "ResultTensors": [
+            {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,256],
+            "NumTasks": 256
+          }
+        }
+      ]
+    }
+  ],
+  "ProcessorGroups": [
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":0,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":1,"TaskRange":[0,2048],"Granularity":4}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":2,"TaskRange":[0,8192],"Granularity":4}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":3,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":4,"TaskRange":[0,2048],"Granularity":4}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":5,"TaskRange":[0,8192],"Granularity":4}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":6,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":7,"TaskRange":[0,8192],"Granularity":4}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":8,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":9,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":10,"TaskRange":[0,65536],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":11,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":12,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":13,"TaskRange":[0,65536],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":14,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":15,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":16,"TaskRange":[0,8192],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":17,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":18,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":19,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":20,"TaskRange":[0,2048],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,32],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,32],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":21,"TaskRange":[0,32],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":22,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":23,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":24,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":25,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":26,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":27,"TaskRange":[0,8192],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":28,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":29,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":30,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":31,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":32,"TaskRange":[0,8192],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":33,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":34,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":35,"TaskRange":[0,65536],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":36,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":37,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":38,"TaskRange":[0,65536],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":39,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":40,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":41,"TaskRange":[0,8192],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":42,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":43,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":44,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":45,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":46,"TaskRange":[0,2048],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,32],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,32],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":47,"TaskRange":[0,32],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":48,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":49,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":50,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":51,"TaskRange":[0,688],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":52,"TaskRange":[0,688],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":53,"TaskRange":[0,688],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":54,"TaskRange":[0,688],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":55,"TaskRange":[0,688],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":56,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":57,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    }
+  ]
+}
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index 031afc7ba..f2f604be9 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -100,3 +100,4 @@ def set_world_size(world_size):
     GpuError,
     RuntimeError,
 )
+from .profiler import Profiler
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
new file mode 100644
index 000000000..b959ceb18
--- /dev/null
+++ b/python/ark/profiler.py
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import json
+import sys
+import time
+from .runtime import Runtime
+
+
+class Profiler:
+    def __init__(self, plan: str):
+        self.plan = json.loads(plan)
+
+    def run(self):
+        num_processor_groups = len(self.plan["ProcessorGroups"])
+        new_plan = {
+            "Rank": self.plan["Rank"], "WorldSize": self.plan["WorldSize"],
+            "NumProcessors": self.plan["NumProcessors"],
+            "NumWarpsPerProcessor": self.plan["NumWarpsPerProcessor"],
+            "TaskInfos": self.plan["TaskInfos"],
+            "ProcessorGroups": [{}]}
+        for i in range(num_processor_groups):
+            new_plan["ProcessorGroups"][0] = self.plan["ProcessorGroups"][i]
+            with Runtime() as rt:
+                rt.launch(plan=json.dumps(new_plan))
+                start_time = time.time()
+                iter = 1000
+                rt.run(iter=iter)
+                end_time = time.time()
+                sys.stderr.write(f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n")
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index e5ab4f964..a6e5308ee 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -149,6 +149,7 @@ void register_executor(py::module &m) {
             py::arg("rank"), py::arg("world_size"), py::arg("gpu_id"),
             py::arg("name"), py::arg("plan"))
         .def("gpu_id", &ark::Executor::gpu_id)
+        .def("plan", &ark::Executor::plan)
         .def("compile", &ark::Executor::compile)
         .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1)
         .def("run", &ark::Executor::run, py::arg("iter"))

From ff8c4b8fc4ff178befa375ffc8ac546806fa6c4b Mon Sep 17 00:00:00 2001
From: Noli Gerawork <86308445+naturalcandy@users.noreply.github.com>
Date: Tue, 2 Jul 2024 21:25:07 -0400
Subject: [PATCH 20/79] torch to ark (#217)

- Adds Torch to ARK tensor conversion support
- New ModelBufferManager class handles external buffer registration and
simplifies buffer access during kernel initialization
- Adds test cases for ARK to Torch conversion support

---------

Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
---
 ark/api/executor.cpp               | 53 ++++++++++++++++---
 ark/api/tensor.cpp                 | 18 ++++++-
 ark/codegen.cpp                    | 36 +++++++++----
 ark/codegen.hpp                    |  4 +-
 ark/include/ark/tensor.hpp         |  2 +
 ark/model/model_buffer.cpp         | 55 ++++++++++++++++++--
 ark/model/model_buffer.hpp         | 15 ++++++
 ark/model_buffer_manager.hpp       | 58 +++++++++++++++++++++
 python/ark/tensor.py               | 26 +++++-----
 python/tensor_py.cpp               | 46 ++++++++++++++++-
 python/unittest/test_conversion.py | 81 +++++++++++++++++++++++++++++-
 11 files changed, 355 insertions(+), 39 deletions(-)
 create mode 100644 ark/model_buffer_manager.hpp

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 4af9df7c0..0a780bcc0 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -10,6 +10,7 @@
 #include <mscclpp/core.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
+#include <tuple>
 
 #include "ark/data_type.hpp"
 #include "ark/model.hpp"
@@ -24,6 +25,7 @@
 #include "gpu/gpu_manager.h"
 #include "logging.h"
 #include "model/model_buffer.hpp"
+#include "model_buffer_manager.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
 #include "utils/utils_net.hpp"
@@ -234,8 +236,15 @@ void Executor::Impl::init(const std::string &plan) {
             std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", ";
     }
 
-    codegen_ =
-        std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_, name_);
+    ModelBufferManager &buffer_manager = ModelBufferManager::get_instance();
+
+    if (!buffer_manager.is_empty()) {
+        codegen_ = std::make_shared<CodeGenerator>(
+            plan_json_, buffer_id_to_offset_, name, &buffer_manager);
+    } else {
+        codegen_ = std::make_shared<CodeGenerator>(plan_json_,
+                                                   buffer_id_to_offset_, name);
+    }
 
     auto gpu_manager = GpuManager::get_instance(gpu_id_);
     timer_begin_ = gpu_manager->create_event();
@@ -367,7 +376,16 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             }
             continue;
         }
-        buffer_id_to_offset[buf_info->buffer->id()] = offset;
+        if (buf_info->buffer->is_external()) {
+            if (buf_info->buffer->device_id() != gpu_id_) {
+                ERR(InvalidUsageError,
+                    "PyTorch tensor and model execution are on different GPUs");
+            }
+            continue;
+        } else {
+            buffer_id_to_offset[buf_info->buffer->id()] = offset;
+            offset += buf_info->bytes;
+        }
         for (const auto &tag_info : buf_info->buffer->send_tags()) {
             remote_rank_to_send_tags_and_offsets[tag_info.first]
                 .first.push_back(tag_info.second);
@@ -380,7 +398,6 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             remote_rank_to_recv_tags_and_offsets[tag_info.first]
                 .second.push_back(offset);
         }
-        offset += buf_info->bytes;
     }
     total_bytes_ = offset;
 
@@ -456,7 +473,11 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 1);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 2);
         for (int i = 0; i < len; ++i) {
-            buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] = offsets[i];
+            if (!buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]
+                     ->buffer->is_external()) {
+                buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] =
+                    offsets[i];
+            }
         }
     }
     for (auto &kv : remote_rank_to_recv_tag_to_buffer_id) {
@@ -472,10 +493,13 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 4);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 5);
         for (int i = 0; i < len; ++i) {
-            buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] = offsets[i];
+            if (!buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]
+                     ->buffer->is_external()) {
+                buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] =
+                    offsets[i];
+            }
         }
     }
-
     return buffer_id_to_offset;
 }
 
@@ -742,6 +766,11 @@ uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const {
 void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes,
                                  bool is_d2d) const {
     GLOG(gpuSetDevice(gpu_id_));
+    if (tensor.ref()->buffer()->is_external()) {
+        ERR(InvalidUsageError,
+            "Reading data from a tensor preallocated by PyTorch is not "
+            "supported. Use PyTorch's native methods.");
+    }
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
     if (bytes != tensor_data_bytes) {
@@ -779,6 +808,11 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes,
 void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
                                   size_t bytes, bool is_d2d) const {
     GLOG(gpuSetDevice(gpu_id_));
+    if (tensor.ref()->buffer()->is_external()) {
+        ERR(InvalidUsageError,
+            "Writing data to a tensor preallocated by PyTorch is not "
+            "supported. Use PyTorch's native methods.");
+    }
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
     if (bytes != tensor_data_bytes) {
@@ -843,7 +877,10 @@ float Executor::stop(int64_t max_spin_count) {
 
 void Executor::barrier() { impl_->barrier(); }
 
-void Executor::destroy() { impl_.reset(nullptr); }
+void Executor::destroy() {
+    ModelBufferManager::get_instance().clear_buffers();
+    impl_.reset(nullptr);
+}
 
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp
index 4b03c3ac8..4d33bd9f1 100644
--- a/ark/api/tensor.cpp
+++ b/ark/api/tensor.cpp
@@ -3,11 +3,25 @@
 
 #include "ark/tensor.hpp"
 
+#include "model/model_buffer.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
 
 namespace ark {
 
+Tensor::Tensor(void* data_ptr, int32_t device_id,
+               const std::vector<int64_t>& shape,
+               const DataType& dtype) {
+    size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1,
+                                                std::multiplies<int64_t>()) *
+                                dtype.bytes();
+    auto buffer =
+        std::make_shared<ModelBuffer>(data_ptr, external_data_size, device_id);
+    auto tensor = std::make_shared<ModelTensor>(dtype.ref(), buffer, Dims(shape),
+                                                Dims(shape), Dims(), Dims());
+    ref_ = tensor;
+}
+
 size_t Tensor::id() const {
     if (ref_) {
         return ref_->id();
@@ -43,14 +57,14 @@ Dims Tensor::padded_shape() const {
     return Dims();
 }
 
-const DataType &Tensor::data_type() const {
+const DataType& Tensor::data_type() const {
     if (ref_) {
         return DataType::from_name(ref_->data_type()->type_name());
     }
     return NONE;
 }
 
-std::ostream &operator<<(std::ostream &os, const Tensor &tensor) {
+std::ostream& operator<<(std::ostream& os, const Tensor& tensor) {
     if (tensor.is_null()) {
         os << "null";
     } else {
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index 09ff28dd3..a97e5e45b 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -10,6 +10,7 @@
 #include "file_io.h"
 #include "logging.h"
 #include "model/model_buffer.hpp"
+#include "model_buffer_manager.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_op.hpp"
 #include "model/model_tensor.hpp"
@@ -43,7 +44,7 @@ class CodeGenerator::Impl {
    public:
     Impl(const PlanJson &plan,
          const std::map<size_t, size_t> &buffer_id_to_offset,
-         const std::string &name);
+         const std::string &name, ModelBufferManager *buffer_manager);
     ~Impl() = default;
 
    private:
@@ -64,6 +65,8 @@ class CodeGenerator::Impl {
 
     std::string sync_process_range(const Range<size_t> &ranges, int state_id);
 
+    ModelBufferManager *buffer_manager_;
+
    protected:
     friend class CodeGenerator;
 
@@ -78,14 +81,18 @@ class CodeGenerator::Impl {
 
 CodeGenerator::Impl::Impl(const PlanJson &plan,
                           const std::map<size_t, size_t> &buffer_id_to_offset,
-                          const std::string &name)
-    : buffer_id_to_offset_(buffer_id_to_offset), name_(name) {
+                          const std::string &name,
+                          ModelBufferManager *buffer_manager)
+    : buffer_id_to_offset_(buffer_id_to_offset),
+      name_(name),
+      buffer_manager_(buffer_manager) {
     rank_ = plan.at("Rank");
     world_size_ = plan.at("WorldSize");
     num_procs_ = plan.at("NumProcessors");
     num_warps_per_proc_ = plan.at("NumWarpsPerProcessor");
 
     std::stringstream definitions_ss;
+
     for (auto &task_json : plan.at("TaskInfos")) {
         definitions_ss << this->def_task(task_json);
     }
@@ -224,11 +231,19 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
             auto &arg = impl_args[i];
             if (arg.type_name() == "TENSOR") {
                 auto tns = arg.value<ModelTensorRef>();
-                size_t buffer_offset =
-                    buffer_id_to_offset_.at(tns->buffer()->id());
-                size_t offset = buffer_offset + ModelOffset(tns).value();
-                ss << "(" << tns->data_type()->type_str() << "*)&_buf["
-                   << offset << "]";
+                if (tns->buffer()->is_external()) {
+                    void *buf_addr =
+                        ModelBufferManager::get_instance().get_buffer(
+                            tns->buffer()->id());
+                    ss << "(" << tns->data_type()->type_str() << "*)"
+                       << buf_addr;
+                } else {
+                    size_t buffer_offset =
+                        buffer_id_to_offset_.at(tns->buffer()->id());
+                    size_t offset = buffer_offset + ModelOffset(tns).value();
+                    ss << "(" << tns->data_type()->type_str() << "*)&_buf["
+                       << offset << "]";
+                }
             } else if (arg.type_name() == "OFFSET") {
                 auto moff = arg.value<ModelOffset>();
                 size_t buffer_offset =
@@ -431,8 +446,9 @@ std::string CodeGenerator::Impl::sync_process_range(const Range<size_t> &range,
 
 CodeGenerator::CodeGenerator(
     const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
-    const std::string &name)
-    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, name)) {}
+    const std::string &name, ModelBufferManager *buffer_manager)
+    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, name,
+                                   buffer_manager)) {}
 
 std::string CodeGenerator::code() const { return impl_->code_; }
 
diff --git a/ark/codegen.hpp b/ark/codegen.hpp
index 4f8307e7e..a2976e644 100644
--- a/ark/codegen.hpp
+++ b/ark/codegen.hpp
@@ -8,6 +8,7 @@
 #include <memory>
 #include <string>
 
+#include "model_buffer_manager.hpp"
 #include "model/model_json.hpp"
 
 namespace ark {
@@ -16,7 +17,8 @@ class CodeGenerator {
    public:
     CodeGenerator(const PlanJson &plan,
                   const std::map<size_t, size_t> &buffer_id_to_offset,
-                  const std::string &name = "ark_kernel");
+                  const std::string &name = "ark_kernel",
+                  ModelBufferManager *buffer_manager = nullptr);
 
     ~CodeGenerator() = default;
 
diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp
index 747ce5fea..d13748175 100644
--- a/ark/include/ark/tensor.hpp
+++ b/ark/include/ark/tensor.hpp
@@ -31,6 +31,8 @@ class Tensor {
     Tensor(ModelTensorRef ref) : ref_(ref) {}
     Tensor(const Tensor &other) = default;
     Tensor &operator=(const Tensor &other) = default;
+    Tensor(void *data_ptr, int32_t device_id, const std::vector<int64_t> &shape,
+           const DataType &dtype);
 
     bool operator==(const Tensor &other) const { return ref_ == other.ref_; }
     bool operator!=(const Tensor &other) const { return ref_ != other.ref_; }
diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp
index 4ce91b5e4..ce8f37727 100644
--- a/ark/model/model_buffer.cpp
+++ b/ark/model/model_buffer.cpp
@@ -4,13 +4,13 @@
 #include "model_buffer.hpp"
 
 #include "logging.h"
+#include "model_buffer_manager.hpp"
 
 namespace ark {
 
-ModelBuffer::ModelBuffer(int rank) : rank_(rank) {
-    static size_t id = 0;
-    id_ = id++;
-}
+size_t ModelBuffer::curr_id = 0;
+
+ModelBuffer::ModelBuffer(int rank) : rank_(rank) { id_ = curr_id++; }
 
 ModelBuffer::ModelBuffer(size_t id, int rank,
                          const std::vector<TagInfo> &send_tags,
@@ -24,6 +24,23 @@ ModelBuffer::ModelBuffer(size_t id, int rank,
     }
 }
 
+ModelBuffer::ModelBuffer(void *data, size_t size, int32_t device_id)
+    : rank_(-1),
+      external_data_(data),
+      external_data_size_(size),
+      device_id_(device_id),
+      is_external_(true) {
+    id_ = curr_id++;
+}
+
+ModelBuffer::ModelBuffer(size_t id, void *data, size_t size, int32_t device_id)
+    : id_(id),
+      rank_(-1),
+      external_data_(data),
+      external_data_size_(size),
+      device_id_(device_id),
+      is_external_(true) {}
+
 void ModelBuffer::tag_send(int remote_rank, int tag) {
     send_tags_.insert(TagInfo{remote_rank, tag});
 }
@@ -46,6 +63,14 @@ Json ModelBuffer::serialize() const {
     }
     j["SendTags"] = send_tags;
     j["RecvTags"] = recv_tags;
+    j["IsExternal"] = is_external_;
+    if (is_external_) {
+        ModelBufferManager::get_instance().register_buffer(id_, external_data_,
+                                                           external_data_size_);
+        j["ExternalDataSize"] = external_data_size_;
+        j["DeviceId"] = device_id_;
+    }
+    // external_data_ptr_ is not included in JSON
     return j;
 }
 
@@ -62,6 +87,28 @@ std::shared_ptr<ModelBuffer> ModelBuffer::deserialize(const Json &serialized) {
     } else if (!serialized.contains("RecvTags")) {
         ERR(InvalidUsageError,
             "ModelBuffer deserialization failed: missing RecvTags");
+    } else if (!serialized.contains("IsExternal")) {
+        ERR(InvalidUsageError,
+            "ModelBuffer deserialization failed: missing IsExternal");
+    }
+    if (serialized["IsExternal"]) {
+        if (!serialized.contains("ExternalDataSize")) {
+            ERR(InvalidUsageError,
+                "ModelBuffer deserialization failed: missing ExternalDataSize");
+        } else if (!serialized.contains("DeviceId")) {
+            ERR(InvalidUsageError,
+                "ModelBuffer deserialization failed: missing DeviceId");
+        }
+        void *data_ptr =
+            ModelBufferManager::get_instance().get_buffer(serialized["Id"]);
+        if (!data_ptr) {
+            ERR(InvalidUsageError,
+                "ModelBuffer deserialization failed: external buffer not found "
+                "in BufferManager");
+        }
+        return std::make_shared<ModelBuffer>(serialized["Id"], data_ptr,
+                                             serialized["ExternalDataSize"],
+                                             serialized["DeviceId"]);
     }
     return std::make_shared<ModelBuffer>(serialized["Id"], serialized["Rank"],
                                          serialized["SendTags"],
diff --git a/ark/model/model_buffer.hpp b/ark/model/model_buffer.hpp
index 7ad3db206..e7f1045b2 100644
--- a/ark/model/model_buffer.hpp
+++ b/ark/model/model_buffer.hpp
@@ -22,6 +22,10 @@ class ModelBuffer {
     ModelBuffer(size_t id, int rank, const std::vector<TagInfo> &send_tags,
                 const std::vector<TagInfo> &recv_tags);
 
+    // externally managed buffer
+    ModelBuffer(void *data, size_t size, int32_t device_id);
+    ModelBuffer(size_t id, void *data, size_t size, int32_t device_id);
+
     size_t id() const { return id_; }
 
     int rank() const { return rank_; }
@@ -44,11 +48,22 @@ class ModelBuffer {
 
     static std::shared_ptr<ModelBuffer> deserialize(const Json &serialized);
 
+    // external buffer management
+    size_t external_data_size() const { return external_data_size_; }
+    void *external_data() const { return external_data_; }
+    int32_t device_id() const { return device_id_; }
+    bool is_external() const { return is_external_; }
+
    private:
+    static size_t curr_id;
     size_t id_;
     int rank_;
     std::set<TagInfo> send_tags_;
     std::set<TagInfo> recv_tags_;
+    void *external_data_ = nullptr;
+    size_t external_data_size_ = 0;
+    int32_t device_id_;
+    bool is_external_ = false;
 };
 
 }  // namespace ark
diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp
new file mode 100644
index 000000000..7b705f4c8
--- /dev/null
+++ b/ark/model_buffer_manager.hpp
@@ -0,0 +1,58 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_MODEL_BUFFER_MANAGER_HPP_
+#define ARK_MODEL_BUFFER_MANAGER_HPP_
+
+#include <tuple>
+#include <unordered_map>
+
+namespace ark {
+// Manages externally allocated buffers not in the ARK memory space.
+class ModelBufferManager {
+   public:
+    static ModelBufferManager& get_instance() {
+        static ModelBufferManager instance;
+        return instance;
+    }
+
+    void register_buffer(size_t id, void* data, size_t size) {
+        buffers_[id] = std::make_tuple(data, size);
+    }
+
+    void* get_buffer(size_t id) {
+        auto it = buffers_.find(id);
+        if (it != buffers_.end()) {
+            return std::get<0>(it->second);
+        }
+        return nullptr;
+    }
+
+    size_t get_buffer_size(size_t id) {
+        auto it = buffers_.find(id);
+        if (it != buffers_.end()) {
+            return std::get<1>(it->second);
+        }
+        return 0;
+    }
+
+    const std::unordered_map<size_t, std::tuple<void*, size_t>>& get_buffers()
+        const {
+        return buffers_;
+    }
+
+    void clear_buffers() { buffers_.clear(); }
+
+    bool is_empty() const { return buffers_.empty(); }
+
+   private:
+    std::unordered_map<size_t, std::tuple<void*, size_t>>
+        buffers_;  // Maps buffer IDs to pointers and sizes.
+    size_t next_compact_id_ = 0;
+    ModelBufferManager() {}
+    ModelBufferManager(const ModelBufferManager&) = delete;
+    ModelBufferManager& operator=(const ModelBufferManager&) = delete;
+};
+}  // namespace ark
+
+#endif  // ARK_MODEL_BUFFER_MANAGER_HPP_
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index ac2886960..8f26dc96e 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -167,18 +167,20 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
         return self
 
     @staticmethod
-    def from_torch(tensor: torch.Tensor):
-        return Tensor(
-            Model.get_model().tensor(
-                Dims(list(tensor.shape)),
-                DataType.from_torch(tensor.dtype).ctype(),
-                Dims(),
-                Dims(),
-                Dims(),
-                "",
-            ),
-            lambda: tensor,
-        )
+    def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor":
+        """
+        Returns an ARK tensor that shares the same memory with the torch tensor.
+        """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        elif not tensor.is_contiguous():
+            raise ValueError("Torch tensor must be contiguous.")
+        elif tensor.device.type == "cpu":
+            raise ValueError("Torch tensor must be on a device.")
+        ark_dtype = DataType.from_torch(tensor.dtype)
+        dl_capsule = torch.utils.dlpack.to_dlpack(tensor)
+        ark_tensor = _Tensor(dl_capsule, ark_dtype.ctype())
+        return Tensor(ark_tensor, runtime_id=runtime_id)
 
     def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
         """
diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp
index fbd909d3d..16eb03421 100644
--- a/python/tensor_py.cpp
+++ b/python/tensor_py.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
+#include <dlpack/dlpack.h>
 #include <pybind11/operators.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
@@ -9,8 +10,51 @@
 
 namespace py = pybind11;
 
-void register_tensor(py::module &m) {
+struct DLTensorMetadata {
+    void* data_ptr;
+    int32_t device_id;
+    DLDeviceType device_type;
+    int32_t ndim;
+    DLDataType dtype;
+    std::vector<int64_t> shape;
+    std::vector<int64_t> strides;
+    uint64_t byte_offset;
+};
+
+static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor* dl_tensor) {
+    DLTensorMetadata metadata;
+    metadata.data_ptr = dl_tensor->dl_tensor.data;
+    metadata.device_id = dl_tensor->dl_tensor.device.device_id;
+    metadata.device_type = dl_tensor->dl_tensor.device.device_type;
+    metadata.ndim = dl_tensor->dl_tensor.ndim;
+    metadata.dtype = dl_tensor->dl_tensor.dtype;
+    metadata.shape.assign(
+        dl_tensor->dl_tensor.shape,
+        dl_tensor->dl_tensor.shape + dl_tensor->dl_tensor.ndim);
+    if (dl_tensor->dl_tensor.strides != nullptr) {
+        metadata.strides.assign(
+            dl_tensor->dl_tensor.strides,
+            dl_tensor->dl_tensor.strides + dl_tensor->dl_tensor.ndim);
+    }
+    metadata.byte_offset = dl_tensor->dl_tensor.byte_offset;
+    return metadata;
+}
+
+void register_tensor(py::module& m) {
     py::class_<ark::Tensor>(m, "_Tensor")
+        .def(py::init([](py::capsule capsule, const ark::DataType& dtype) {
+            DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule;
+            if (!dl_tensor) {
+                throw std::runtime_error(
+                    "Capsule does not contain a DLManagedTensor");
+            }
+            DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor);
+            int32_t device_id = metadata.device_id;
+            void* data_ptr = metadata.data_ptr;
+            auto shape = metadata.shape;
+
+            return new ark::Tensor(data_ptr, device_id, shape, dtype);
+        }))
         .def("id", &ark::Tensor::id)
         .def("shape", &ark::Tensor::shape, py::return_value_policy::reference)
         .def("strides", &ark::Tensor::strides,
diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py
index 5befa1c34..833b88662 100644
--- a/python/unittest/test_conversion.py
+++ b/python/unittest/test_conversion.py
@@ -1,6 +1,7 @@
 import pytest
 import numpy as np
 import ark
+from typing import Callable
 
 try:
     import torch
@@ -9,6 +10,8 @@
 except ImportError:
     _no_torch = True
 
+# ARK to Torch tests
+
 
 def initialize_tensor(dimensions, dtype):
     tensor = ark.tensor(dimensions, dtype)
@@ -69,7 +72,7 @@ def check_diff(input_tensor_host, input_view_numpy, value, index):
 
 # Test function to check if changes to the torch views are reflected in the original tensors
 @pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32])
-def test_aliasing(dtype: ark.DataType):
+def test_ark_to_torch_aliasing(dtype: ark.DataType):
     ark.init()
     dimensions = [4, 4]
     input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype)
@@ -126,3 +129,79 @@ def test_conversion_torch():
 
         torch_tensor = t.to_torch()
         assert torch.all(torch_tensor == 7)
+
+
+# Torch to ARK tests
+
+ArkBinOp = Callable[[ark.Tensor, ark.Tensor], ark.Tensor]
+TorchBinOp = Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
+ArkUnOp = Callable[[ark.Tensor], ark.Tensor]
+TorchUnOp = Callable[[torch.Tensor], torch.Tensor]
+
+
+# Verify the accuracy of binary operations involving ARK view tensors
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.add, torch.add, (2, 3))],
+)
+def test_bin_op(dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims):
+    ark.init()
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    expected_output = torch_op(input_tensor, other_tensor).cpu().numpy()
+    input_ark_view = ark.Tensor.from_torch(input_tensor)
+    other_ark_view = ark.Tensor.from_torch(other_tensor)
+    output = ark_op(input_ark_view, other_ark_view)
+    runtime = ark.Runtime()
+    runtime.launch()
+    runtime.run()
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)
+
+
+# Verify the accuracy of unary operations involving ARK view tensors
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.exp, torch.exp, (3, 3))],
+)
+def test_unary_op(dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims):
+    ark.init()
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    expected_output = torch_op(input_tensor).cpu().numpy()
+    input_ark_view = ark.Tensor.from_torch(input_tensor)
+    output = ark_op(input_ark_view)
+    runtime = ark.Runtime()
+    runtime.launch()
+    runtime.run()
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)
+
+
+# Test function to check if changes in torch tensors are reflected in ARK views
+@pytest.mark.parametrize("dtype, tensor_dims", [(torch.float16, (64, 64))])
+def test_torch_to_ark_aliasing(dtype, tensor_dims):
+    ark.init()
+    # Initialize a PyTorch tensor
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+
+    input_ark_view = ark.Tensor.from_torch(input_tensor)
+    other_ark_view = ark.Tensor.from_torch(other_tensor)
+
+    output = ark.add(input_ark_view, other_ark_view)
+    # Perform in place operations
+    input_tensor += other_tensor
+    other_tensor += input_tensor
+    expected_output = (input_tensor + other_tensor).cpu().numpy()
+
+    runtime = ark.Runtime()
+    runtime.launch()
+    runtime.run()
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)

From fe35541e02029b0d9a8da4cbdccf2565cbf516b0 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 3 Jul 2024 06:51:43 +0000
Subject: [PATCH 21/79] wip

---
 ark/api/executor.cpp                      |  22 +-
 ark/api/planner.cpp                       |   1 +
 ark/codegen.cpp                           |  16 +-
 ark/codegen.hpp                           |   3 +-
 ark/model/model_json.cpp                  |  14 +-
 ark/model_buffer_manager.hpp              |   5 +-
 cmake/Utils.cmake                         |   2 +-
 docs/plan_file.md                         |  18 +
 examples/llama/model_test.py              |   2 +-
 examples/tutorial/default_plan.json       | 115 +++---
 examples/tutorial/model.json              |  46 +--
 examples/tutorial/plan.json               |  63 ++--
 examples/tutorial/plan_1_larger_tile.json |  47 +--
 examples/tutorial/plan_2_split_k.json     |  63 ++--
 examples/tutorial/plan_3_overwrite.json   |  63 ++--
 examples/tutorial/plan_tutorial.py        |   4 +-
 plan_gpu0.json                            | 415 +++++++++++-----------
 python/ark/__init__.py                    |   3 +-
 python/ark/planner.py                     | 184 ++++++++++
 python/ark/profiler.py                    |  30 +-
 python/ark/runtime.py                     |  52 +--
 python/unittest/test_runtime.py           |  27 +-
 22 files changed, 686 insertions(+), 509 deletions(-)
 create mode 100644 python/ark/planner.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 0a780bcc0..20b162b16 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -228,6 +228,16 @@ void Executor::Impl::init(const std::string &plan) {
         plan_json_ = Json::parse(plan);
     }
 
+    auto gpu_manager = GpuManager::get_instance(gpu_id_);
+
+    if (!gpu_manager->info().arch->belongs_to(
+            Arch::from_name(plan_json_.at("Architecture")))) {
+        LOG(WARN, "Architecture name of the plan `",
+            plan_json_.at("Architecture").get<std::string>(),
+            "` is not compatible with the GPU architecture `",
+            gpu_manager->info().arch->name(), "`.");
+    }
+
     buffer_id_to_offset_ = init_buffers(plan_json_);
 
     std::string buffer_id_to_offset_str;
@@ -236,17 +246,9 @@ void Executor::Impl::init(const std::string &plan) {
             std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", ";
     }
 
-    ModelBufferManager &buffer_manager = ModelBufferManager::get_instance();
+    codegen_ = std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_,
+                                               name_);
 
-    if (!buffer_manager.is_empty()) {
-        codegen_ = std::make_shared<CodeGenerator>(
-            plan_json_, buffer_id_to_offset_, name, &buffer_manager);
-    } else {
-        codegen_ = std::make_shared<CodeGenerator>(plan_json_,
-                                                   buffer_id_to_offset_, name);
-    }
-
-    auto gpu_manager = GpuManager::get_instance(gpu_id_);
     timer_begin_ = gpu_manager->create_event();
     timer_end_ = gpu_manager->create_event();
     buffer_ = gpu_manager->malloc(total_bytes_, 65536);
diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index 5c9d09f2e..14e1b7b41 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -119,6 +119,7 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
     Json plan;
     plan["Rank"] = model_.rank();
     plan["WorldSize"] = model_.world_size();
+    plan["Architecture"] = gpu_info.arch->name();
     plan["NumProcessors"] = max_num_processors;
     plan["NumWarpsPerProcessor"] = max_num_warps;
     plan["TaskInfos"] = task_infos;
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index a97e5e45b..55327329a 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -44,7 +44,7 @@ class CodeGenerator::Impl {
    public:
     Impl(const PlanJson &plan,
          const std::map<size_t, size_t> &buffer_id_to_offset,
-         const std::string &name, ModelBufferManager *buffer_manager);
+         const std::string &name);
     ~Impl() = default;
 
    private:
@@ -65,8 +65,6 @@ class CodeGenerator::Impl {
 
     std::string sync_process_range(const Range<size_t> &ranges, int state_id);
 
-    ModelBufferManager *buffer_manager_;
-
    protected:
     friend class CodeGenerator;
 
@@ -81,11 +79,8 @@ class CodeGenerator::Impl {
 
 CodeGenerator::Impl::Impl(const PlanJson &plan,
                           const std::map<size_t, size_t> &buffer_id_to_offset,
-                          const std::string &name,
-                          ModelBufferManager *buffer_manager)
-    : buffer_id_to_offset_(buffer_id_to_offset),
-      name_(name),
-      buffer_manager_(buffer_manager) {
+                          const std::string &name)
+    : buffer_id_to_offset_(buffer_id_to_offset), name_(name) {
     rank_ = plan.at("Rank");
     world_size_ = plan.at("WorldSize");
     num_procs_ = plan.at("NumProcessors");
@@ -446,9 +441,8 @@ std::string CodeGenerator::Impl::sync_process_range(const Range<size_t> &range,
 
 CodeGenerator::CodeGenerator(
     const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
-    const std::string &name, ModelBufferManager *buffer_manager)
-    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, name,
-                                   buffer_manager)) {}
+    const std::string &name)
+    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, name)) {}
 
 std::string CodeGenerator::code() const { return impl_->code_; }
 
diff --git a/ark/codegen.hpp b/ark/codegen.hpp
index a2976e644..1ed8ec9f2 100644
--- a/ark/codegen.hpp
+++ b/ark/codegen.hpp
@@ -17,8 +17,7 @@ class CodeGenerator {
    public:
     CodeGenerator(const PlanJson &plan,
                   const std::map<size_t, size_t> &buffer_id_to_offset,
-                  const std::string &name = "ark_kernel",
-                  ModelBufferManager *buffer_manager = nullptr);
+                  const std::string &name = "ark_kernel");
 
     ~CodeGenerator() = default;
 
diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp
index 97ce71967..86eb843e2 100644
--- a/ark/model/model_json.cpp
+++ b/ark/model/model_json.cpp
@@ -250,9 +250,13 @@ static void verify_format_processor_group(const Json &json) {
 }
 
 static void verify_format_plan(const Json &json) {
-    const std::vector<std::string> required_fields = {
-        "Rank",      "WorldSize",      "NumProcessors", "NumWarpsPerProcessor",
-        "TaskInfos", "ProcessorGroups"};
+    const std::vector<std::string> required_fields = {"Rank",
+                                                      "WorldSize",
+                                                      "Architecture",
+                                                      "NumProcessors",
+                                                      "NumWarpsPerProcessor",
+                                                      "TaskInfos",
+                                                      "ProcessorGroups"};
     for (const auto &field : required_fields) {
         if (!json.contains(field)) {
             ERR(NotFoundError, "PlanJson: " + field + " not found");
@@ -276,6 +280,7 @@ PlanJson::PlanJson(const Json &json)
     : Json((json != nullptr) ? json
                              : Json{{"Rank", 0},
                                     {"WorldSize", 1},
+                                    {"Architecture", "ANY"},
                                     {"NumProcessors", 1},
                                     {"NumWarpsPerProcessor", 1},
                                     {"TaskInfos", Json::array()},
@@ -292,6 +297,9 @@ static std::stringstream &dump_pretty_plan(const Json &json,
     dump_pretty_item(json.at("WorldSize"), "WorldSize", ss,
                      indent + indent_step)
         << ",\n";
+    dump_pretty_item(json.at("Architecture"), "Architecture", ss,
+                     indent + indent_step)
+        << ",\n";
     dump_pretty_item(json.at("NumProcessors"), "NumProcessors", ss,
                      indent + indent_step)
         << ",\n";
diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp
index 7b705f4c8..4baaec7fe 100644
--- a/ark/model_buffer_manager.hpp
+++ b/ark/model_buffer_manager.hpp
@@ -46,9 +46,8 @@ class ModelBufferManager {
     bool is_empty() const { return buffers_.empty(); }
 
    private:
-    std::unordered_map<size_t, std::tuple<void*, size_t>>
-        buffers_;  // Maps buffer IDs to pointers and sizes.
-    size_t next_compact_id_ = 0;
+    // Maps buffer IDs to pointers and sizes.
+    std::unordered_map<size_t, std::tuple<void*, size_t>> buffers_;
     ModelBufferManager() {}
     ModelBufferManager(const ModelBufferManager&) = delete;
     ModelBufferManager& operator=(const ModelBufferManager&) = delete;
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 9bb83fb42..b1fd1b132 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -14,7 +14,7 @@ if(GIT_CLANG_FORMAT)
         COMMAND ${GIT_CLANG_FORMAT} --style=file --diff || true
     )
     add_custom_target(cpplint-autofix
-        COMMAND ${GIT_CLANG_FORMAT} --style=file || true
+        COMMAND ${GIT_CLANG_FORMAT} --style=file --extensions cc,cpp,h,hpp,cu,in,hip || true
     )
 else()
     message(STATUS "git-clang-format not found.")
diff --git a/docs/plan_file.md b/docs/plan_file.md
index 90a4537a2..c06ccc35d 100644
--- a/docs/plan_file.md
+++ b/docs/plan_file.md
@@ -6,6 +6,7 @@ See an example plan file: [Example 1](../examples/tutorial/default_plan.json)
 
     - Rank (Int)
     - WorldSize (Int)
+    - Architecture (String)
     - NumProcessors (Int)
     - NumWarpsPerProcessor (Int)
     - TaskInfos (Array of TaskInfo)
@@ -42,6 +43,23 @@ See an example plan file: [Example 1](../examples/tutorial/default_plan.json)
 
 `ProcessorRange`, `WarpRange`, `SramRange`, and `TaskRange` are in the "range" format, i.e., `[Begin, End, Step]` that indicates an arithmetic integer sequence with a common difference of `Step`, starting from `Begin` and ends before `End` (does not include `End`). They alternatively can be in the format `[Begin, End]` that assumes `Step` is 1.
 
+## Architecture
+
+A name that refers to the hardware architecture where the plan is supposed to run over. The following names are currently supported.
+
+- `ANY`: compatible with all architectures.
+
+- NVIDIA Family
+    - `CUDA`: compatible with all supported NVIDIA architectures.
+    - `CUDA_70`: compatible with NVIDIA Volta architecture.
+    - `CUDA_80`: compatible with NVIDIA Ampere architecture.
+    - `CUDA_90`: compatible with NVIDIA Hopper architecture.
+
+- AMD Family
+    - `ROCM`: compatible with all supported AMD architectures.
+    - `ROCM_90A`: compatible with AMD CDNA 2 (GFX90A) architecture.
+    - `ROCM_942`: compatible with AMD CDNA 3 (GFX942) architecture.
+
 ## TaskInfo
 
 A `TaskInfo` object describes a sequential set of operators. The followings describe each field of `TaskInfo`.
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py
index 585341640..71485be45 100644
--- a/examples/llama/model_test.py
+++ b/examples/llama/model_test.py
@@ -59,7 +59,7 @@ def run_ark(
     output = module(*module_inputs)
 
     with ark.Runtime() as rt:
-        rt.launch(plan_path="/mnt/changhohwang/ark/plan_gpu0.json")
+        rt.launch(ark.Plan.from_file("/mnt/changhohwang/ark/plan_gpu0.json"))
 
         # Load model parameters
         if state_dict:
diff --git a/examples/tutorial/default_plan.json b/examples/tutorial/default_plan.json
index c6b4be243..bb774a5b8 100644
--- a/examples/tutorial/default_plan.json
+++ b/examples/tutorial/default_plan.json
@@ -1,36 +1,37 @@
 {
   "Rank": 0,
   "WorldSize": 1,
-  "NumProcessors": 108,
-  "NumWarpsPerProcessor": 8,
+  "Architecture": "ROCM_942",
+  "NumProcessors": 304,
+  "NumWarpsPerProcessor": 4,
   "TaskInfos": [
     {
       "Id": 0,
-      "NumWarps": 8,
-      "SramBytes": 147456,
+      "NumWarps": 4,
+      "SramBytes": 24672,
       "Ops": [
         {
           "Type": "Matmul",
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
             "TransposeOther": {"BOOL":true}
           },
           "Config": {
-            "NumWarps": 8,
-            "SramBytes": 147456,
-            "TileShapeMNK": [128,256,64],
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
             "NumTasks": 172
           }
         }
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -95,31 +96,31 @@
     },
     {
       "Id": 3,
-      "NumWarps": 8,
-      "SramBytes": 147456,
+      "NumWarps": 4,
+      "SramBytes": 24672,
       "Ops": [
         {
           "Type": "Matmul",
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
             "TransposeOther": {"BOOL":true}
           },
           "Config": {
-            "NumWarps": 8,
-            "SramBytes": 147456,
-            "TileShapeMNK": [128,256,64],
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
             "NumTasks": 172
           }
         }
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -156,31 +157,31 @@
     },
     {
       "Id": 5,
-      "NumWarps": 8,
-      "SramBytes": 147456,
+      "NumWarps": 4,
+      "SramBytes": 24672,
       "Ops": [
         {
           "Type": "Matmul",
           "Name": "matmul_2",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
             "TransposeOther": {"BOOL":true}
           },
           "Config": {
-            "NumWarps": 8,
-            "SramBytes": 147456,
-            "TileShapeMNK": [128,256,64],
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
             "NumTasks": 64
           }
         }
@@ -189,12 +190,12 @@
   ],
   "ProcessorGroups": [
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,172],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
-          "WarpRange": [0,8],
-          "SramRange": [0,147456],
+          "ProcessorRange": [0,172],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
           "TaskGroups": [
             {"TaskId":0,"TaskRange":[0,172],"Granularity":1}
           ]
@@ -202,10 +203,10 @@
       ]
     },
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,304],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
+          "ProcessorRange": [0,304],
           "WarpRange": [0,1],
           "SramRange": [0,0],
           "TaskGroups": [
@@ -215,10 +216,10 @@
       ]
     },
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,304],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
+          "ProcessorRange": [0,304],
           "WarpRange": [0,1],
           "SramRange": [0,0],
           "TaskGroups": [
@@ -228,12 +229,12 @@
       ]
     },
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,172],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
-          "WarpRange": [0,8],
-          "SramRange": [0,147456],
+          "ProcessorRange": [0,172],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
           "TaskGroups": [
             {"TaskId":3,"TaskRange":[0,172],"Granularity":1}
           ]
@@ -241,10 +242,10 @@
       ]
     },
     {
-      "ProcessorRange": [0,108],
+      "ProcessorRange": [0,304],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,108],
+          "ProcessorRange": [0,304],
           "WarpRange": [0,1],
           "SramRange": [0,0],
           "TaskGroups": [
@@ -258,8 +259,8 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,64],
-          "WarpRange": [0,8],
-          "SramRange": [0,147456],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
           "TaskGroups": [
             {"TaskId":5,"TaskRange":[0,64],"Granularity":1}
           ]
@@ -267,4 +268,4 @@
       ]
     }
   ]
-}
+}
\ No newline at end of file
diff --git a/examples/tutorial/model.json b/examples/tutorial/model.json
index 1bc9233a5..a6ba8e8be 100644
--- a/examples/tutorial/model.json
+++ b/examples/tutorial/model.json
@@ -12,14 +12,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -31,13 +31,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {}
         },
@@ -46,14 +46,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {}
         }
@@ -69,14 +69,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -95,14 +95,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {}
         },
@@ -111,14 +111,14 @@
           "Name": "matmul_2",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
diff --git a/examples/tutorial/plan.json b/examples/tutorial/plan.json
index c0854e505..335c27549 100644
--- a/examples/tutorial/plan.json
+++ b/examples/tutorial/plan.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "CUDA_80",
   "NumProcessors": 108,
   "NumWarpsPerProcessor": 8,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -103,14 +104,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -164,14 +165,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
-            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
+            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
+            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -196,14 +197,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
-            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
+            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
+            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
           ],
           "WriteTensors": [
-            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -228,14 +229,14 @@
           "Name": "add_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {},
           "Config": {
diff --git a/examples/tutorial/plan_1_larger_tile.json b/examples/tutorial/plan_1_larger_tile.json
index 3a3f66530..04d2e9d60 100644
--- a/examples/tutorial/plan_1_larger_tile.json
+++ b/examples/tutorial/plan_1_larger_tile.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "CUDA_80",
   "NumProcessors": 108,
   "NumWarpsPerProcessor": 8,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -103,14 +104,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -164,14 +165,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
diff --git a/examples/tutorial/plan_2_split_k.json b/examples/tutorial/plan_2_split_k.json
index 493515d8c..837944171 100644
--- a/examples/tutorial/plan_2_split_k.json
+++ b/examples/tutorial/plan_2_split_k.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "CUDA_80",
   "NumProcessors": 108,
   "NumWarpsPerProcessor": 8,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -103,14 +104,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -164,14 +165,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
-            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
+            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
+            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -196,14 +197,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
-            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
+            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
+            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
           ],
           "WriteTensors": [
-            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -228,14 +229,14 @@
           "Name": "add_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {},
           "Config": {
diff --git a/examples/tutorial/plan_3_overwrite.json b/examples/tutorial/plan_3_overwrite.json
index c0854e505..335c27549 100644
--- a/examples/tutorial/plan_3_overwrite.json
+++ b/examples/tutorial/plan_3_overwrite.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "CUDA_80",
   "NumProcessors": 108,
   "NumWarpsPerProcessor": 8,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,13 +47,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -74,14 +75,14 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -103,14 +104,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
+            {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -135,14 +136,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
-            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]},
+            {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "WriteTensors": [
-            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "ResultTensors": [
-            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
+            {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}
           ],
           "Args": {},
           "Config": {
@@ -164,14 +165,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
-            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
+            {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]},
+            {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]}
           ],
           "WriteTensors": [
-            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -196,14 +197,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
-            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
+            {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]},
+            {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]}
           ],
           "WriteTensors": [
-            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -228,14 +229,14 @@
           "Name": "add_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
-            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]},
+            {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "ResultTensors": [
-            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
+            {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}
           ],
           "Args": {},
           "Config": {
diff --git a/examples/tutorial/plan_tutorial.py b/examples/tutorial/plan_tutorial.py
index 056523e15..989f29c5e 100644
--- a/examples/tutorial/plan_tutorial.py
+++ b/examples/tutorial/plan_tutorial.py
@@ -339,7 +339,7 @@ def main(plan_path: str):
 
         plan = planner.plan()
         with open("default_plan.json", "w") as f:
-            f.write(plan)
+            f.write(str(plan))
         rt.launch(plan=plan)
 
         # Initialize
@@ -364,7 +364,7 @@ def main(plan_path: str):
         print(f"File {plan_path} does not exist. Exiting...")
         return
     with ark.Runtime.get_runtime() as rt:
-        rt.launch(plan_path=plan_path)
+        rt.launch(plan=ark.Plan.from_file(plan_path))
 
         # Initialize
         InputModule.initialize()
diff --git a/plan_gpu0.json b/plan_gpu0.json
index 49b6bdd98..63c1943e3 100644
--- a/plan_gpu0.json
+++ b/plan_gpu0.json
@@ -1,6 +1,7 @@
 {
   "Rank": 0,
   "WorldSize": 1,
+  "Architecture": "ROCM_942",
   "NumProcessors": 304,
   "NumWarpsPerProcessor": 4,
   "TaskInfos": [
@@ -14,14 +15,14 @@
           "Name": "matmul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -46,14 +47,14 @@
           "Name": "rope",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -75,13 +76,13 @@
           "Name": "transpose",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Permutation": {"DIMS":[0,2,1,3]}
@@ -105,14 +106,14 @@
           "Name": "matmul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -137,14 +138,14 @@
           "Name": "rope_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -166,13 +167,13 @@
           "Name": "transpose_2",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Permutation": {"DIMS":[0,2,3,1]}
@@ -196,14 +197,14 @@
           "Name": "matmul_2",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -228,13 +229,13 @@
           "Name": "transpose_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Permutation": {"DIMS":[0,2,1,3]}
@@ -258,14 +259,14 @@
           "Name": "matmul_3",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -290,13 +291,13 @@
           "Name": "mul",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Factor": {"FLOAT":0.0883883461356163}
@@ -320,13 +321,13 @@
           "Name": "reduce_max",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Axis": {"INT":3},
@@ -351,14 +352,14 @@
           "Name": "sub",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -380,13 +381,13 @@
           "Name": "exp",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -408,13 +409,13 @@
           "Name": "reduce_sum",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Axis": {"INT":3},
@@ -439,14 +440,14 @@
           "Name": "div",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -468,14 +469,14 @@
           "Name": "matmul_4",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -500,13 +501,13 @@
           "Name": "transpose_3",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Permutation": {"DIMS":[0,2,1,3]}
@@ -530,14 +531,14 @@
           "Name": "matmul_5",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -562,13 +563,13 @@
           "Name": "cast",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -590,14 +591,14 @@
           "Name": "mul_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -619,13 +620,13 @@
           "Name": "reduce_mean",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Axis": {"INT":2},
@@ -650,13 +651,13 @@
           "Name": "rsqrt",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -678,14 +679,14 @@
           "Name": "mul_2",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -707,14 +708,14 @@
           "Name": "mul_3",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -736,13 +737,13 @@
           "Name": "cast_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -764,14 +765,14 @@
           "Name": "matmul_6",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -796,14 +797,14 @@
           "Name": "rope_2",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -825,13 +826,13 @@
           "Name": "transpose_4",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Permutation": {"DIMS":[0,2,1,3]}
@@ -855,14 +856,14 @@
           "Name": "matmul_7",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -887,14 +888,14 @@
           "Name": "rope_3",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -916,13 +917,13 @@
           "Name": "transpose_6",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Permutation": {"DIMS":[0,2,3,1]}
@@ -946,14 +947,14 @@
           "Name": "matmul_8",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -978,13 +979,13 @@
           "Name": "transpose_5",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Permutation": {"DIMS":[0,2,1,3]}
@@ -1008,14 +1009,14 @@
           "Name": "matmul_9",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -1040,13 +1041,13 @@
           "Name": "mul_4",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Factor": {"FLOAT":0.0883883461356163}
@@ -1070,13 +1071,13 @@
           "Name": "reduce_max_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Axis": {"INT":3},
@@ -1101,14 +1102,14 @@
           "Name": "sub_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1130,13 +1131,13 @@
           "Name": "exp_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1158,13 +1159,13 @@
           "Name": "reduce_sum_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Axis": {"INT":3},
@@ -1189,14 +1190,14 @@
           "Name": "div_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1218,14 +1219,14 @@
           "Name": "matmul_10",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -1250,13 +1251,13 @@
           "Name": "transpose_7",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Permutation": {"DIMS":[0,2,1,3]}
@@ -1280,14 +1281,14 @@
           "Name": "matmul_11",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -1312,14 +1313,14 @@
           "Name": "add",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1341,13 +1342,13 @@
           "Name": "cast_2",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1369,14 +1370,14 @@
           "Name": "mul_5",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1398,13 +1399,13 @@
           "Name": "reduce_mean_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "Axis": {"INT":2},
@@ -1429,13 +1430,13 @@
           "Name": "rsqrt_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1457,14 +1458,14 @@
           "Name": "mul_6",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1486,14 +1487,14 @@
           "Name": "mul_7",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1515,13 +1516,13 @@
           "Name": "cast_3",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1543,14 +1544,14 @@
           "Name": "matmul_12",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -1575,13 +1576,13 @@
           "Name": "sigmoid",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1603,14 +1604,14 @@
           "Name": "mul_8",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1632,14 +1633,14 @@
           "Name": "matmul_13",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -1664,14 +1665,14 @@
           "Name": "mul_9",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
@@ -1693,14 +1694,14 @@
           "Name": "matmul_14",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
@@ -1725,14 +1726,14 @@
           "Name": "add_1",
           "IsVirtual": false,
           "ReadTensors": [
-            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}},
-            {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}}
+            {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {},
           "Config": {
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index f2f604be9..e96972906 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -37,7 +37,7 @@ def set_world_size(world_size):
 from .init import init
 from .tensor import Dims, Tensor, Parameter
 from .module import Module, RuntimeModule
-from .runtime import Runtime, DefaultPlanner
+from .runtime import Runtime
 from .serialize import save, load
 from .data_type import (
     DataType,
@@ -100,4 +100,5 @@ def set_world_size(world_size):
     GpuError,
     RuntimeError,
 )
+from .planner import DefaultPlanner, Plan
 from .profiler import Profiler
diff --git a/python/ark/planner.py b/python/ark/planner.py
new file mode 100644
index 000000000..8814896d2
--- /dev/null
+++ b/python/ark/planner.py
@@ -0,0 +1,184 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import copy
+import json
+from typing import Callable, Dict, List, Any
+
+from ._ark_core import _DefaultPlanner
+from .model import Model
+
+
+def idnt(indent):
+    return " " * indent
+
+
+def dquote(s):
+    return '"' + s + '"'
+
+
+def denser_json_obj(obj, key, level, indent, indent_step, ret=""):
+    if len(obj) == 0:
+        if key:
+            return ret + idnt(indent) + dquote(key) + ": {}"
+        else:
+            return ret + idnt(indent) + "{}"
+    ret += idnt(indent)
+    if key:
+        ret += dquote(key) + ": {\n"
+    else:
+        ret += "{\n"
+    num_item = len(obj)
+    for k, v in obj.items():
+        is_obj_or_arr = isinstance(v, dict) or isinstance(v, list)
+        is_num_arr = isinstance(v, list) and v and isinstance(v[0], int)
+        if level <= 0 or not is_obj_or_arr or is_num_arr:
+            ret += (
+                idnt(indent + indent_step)
+                + dquote(k)
+                + ": "
+                + json.dumps(v, separators=(",", ":"))
+            )
+        elif isinstance(v, dict):
+            ret += denser_json_obj(
+                v, k, level - 1, indent + indent_step, indent_step
+            )
+        elif isinstance(v, list):
+            ret += denser_json_arr(
+                v, k, level - 1, indent + indent_step, indent_step
+            )
+        num_item -= 1
+        if num_item > 0:
+            ret += ",\n"
+        else:
+            ret += "\n"
+    ret += idnt(indent) + "}"
+    return ret
+
+
+def denser_json_arr(obj, key, level, indent, indent_step, ret=""):
+    if len(obj) == 0:
+        if key:
+            return ret + idnt(indent) + dquote(key) + ": []"
+        else:
+            return ret + idnt(indent) + "[]"
+    ret += idnt(indent)
+    if key:
+        ret += dquote(key) + ": [\n"
+    else:
+        ret += "[\n"
+    num_item = len(obj)
+    for v in obj:
+        is_obj_or_arr = isinstance(v, dict) or isinstance(v, list)
+        is_num_arr = (
+            isinstance(v, list)
+            and v
+            and (isinstance(v[0], int) or isinstance(v[0], float))
+        )
+        if level <= 0 or not is_obj_or_arr or is_num_arr:
+            ret += idnt(indent + indent_step) + json.dumps(
+                v, separators=(",", ":")
+            )
+        elif isinstance(v, dict):
+            ret += denser_json_obj(
+                v, "", level - 1, indent + indent_step, indent_step
+            )
+        elif isinstance(v, list):
+            ret += denser_json_arr(
+                v, "", level - 1, indent + indent_step, indent_step
+            )
+        num_item -= 1
+        if num_item > 0:
+            ret += ",\n"
+        else:
+            ret += "\n"
+    ret += idnt(indent) + "]"
+    return ret
+
+
+def denser_json(obj, level, indent_step=2):
+    if isinstance(obj, dict):
+        return denser_json_obj(obj, "", level, 0, indent_step, "")
+    elif isinstance(obj, list):
+        return denser_json_arr(obj, "", level, 0, indent_step, "")
+    return json.dumps(obj, indent=indent_step)
+
+
+class Plan:
+    def __init__(self, plan: Dict[str, Any]):
+        if plan is None:
+            plan = {}
+            plan["Rank"] = 0
+            plan["WorldSize"] = 1
+            plan["Architecture"] = "ANY"
+            plan["NumProcessors"] = 1
+            plan["NumWarpsPerProcessor"] = 1
+            plan["TaskInfos"] = []
+            plan["ProcessorGroups"] = []
+        else:
+            plan = copy.deepcopy(plan)
+        self.plan = plan
+
+    def __str__(self) -> str:
+        return denser_json(self.plan, 5)
+
+    @property
+    def rank(self) -> int:
+        return self.plan["Rank"]
+
+    @property
+    def world_size(self) -> int:
+        return self.plan["WorldSize"]
+
+    @property
+    def architecture(self) -> str:
+        return self.plan["Architecture"]
+
+    @property
+    def num_processors(self) -> int:
+        return self.plan["NumProcessors"]
+
+    @property
+    def num_warps_per_processor(self) -> int:
+        return self.plan["NumWarpsPerProcessor"]
+
+    @property
+    def task_infos(self) -> List[Dict[str, Any]]:
+        return self.plan["TaskInfos"]
+
+    @property
+    def processor_groups(self) -> List[Dict[str, Any]]:
+        return self.plan["ProcessorGroups"]
+
+    @staticmethod
+    def from_str(plan_str: str) -> "Plan":
+        plan = json.loads(plan_str)
+        return Plan(plan)
+
+    @staticmethod
+    def from_file(file_path: str) -> "Plan":
+        with open(file_path, "r") as f:
+            plan = json.load(f)
+        return Plan(plan)
+
+
+class DefaultPlanner(_DefaultPlanner):
+    def __init__(self, device_id: int = 0):
+        compressed = Model.get_model().compress()
+        super().__init__(compressed, device_id)
+
+    def install_config_rule(self, rule: Callable[[str, str], str]):
+        """
+        Install a configuration rule.
+
+        Args:
+            rule: A function that takes an operator description and a target
+            architecture name and returns a configuration description.
+        """
+        super().install_config_rule(rule)
+
+    def plan(self) -> Plan:
+        """
+        Generate an execution plan.
+        """
+        return Plan.from_str(super().plan(pretty=False))
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
index b959ceb18..feb78e0de 100644
--- a/python/ark/profiler.py
+++ b/python/ark/profiler.py
@@ -1,30 +1,36 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import json
 import sys
 import time
+
 from .runtime import Runtime
+from .planner import Plan
 
 
 class Profiler:
-    def __init__(self, plan: str):
-        self.plan = json.loads(plan)
+    def __init__(self, plan: Plan):
+        self.plan = plan
 
     def run(self):
-        num_processor_groups = len(self.plan["ProcessorGroups"])
+        num_processor_groups = len(self.plan.processor_groups)
         new_plan = {
-            "Rank": self.plan["Rank"], "WorldSize": self.plan["WorldSize"],
-            "NumProcessors": self.plan["NumProcessors"],
-            "NumWarpsPerProcessor": self.plan["NumWarpsPerProcessor"],
-            "TaskInfos": self.plan["TaskInfos"],
-            "ProcessorGroups": [{}]}
+            "Rank": self.plan.rank,
+            "WorldSize": self.plan.world_size,
+            "Architecture": self.plan.architecture,
+            "NumProcessors": self.plan.num_processors,
+            "NumWarpsPerProcessor": self.plan.num_warps_per_processor,
+            "TaskInfos": self.plan.task_infos,
+            "ProcessorGroups": [None],
+        }
         for i in range(num_processor_groups):
-            new_plan["ProcessorGroups"][0] = self.plan["ProcessorGroups"][i]
+            new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i]
             with Runtime() as rt:
-                rt.launch(plan=json.dumps(new_plan))
+                rt.launch(plan=str(new_plan))
                 start_time = time.time()
                 iter = 1000
                 rt.run(iter=iter)
                 end_time = time.time()
-                sys.stderr.write(f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n")
+                sys.stderr.write(
+                    f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n"
+                )
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index efae6ab3c..40bfaaa63 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -3,10 +3,11 @@
 
 import logging
 from enum import Enum
-from typing import Callable, Dict, List
+from typing import Dict, List
 
-from ._ark_core import _Executor, _DefaultPlanner
+from ._ark_core import _Executor
 from .model import Model
+from .planner import DefaultPlanner, Plan
 
 
 class _RuntimeState:
@@ -46,33 +47,9 @@ def print_runtime_states():
             print(f"{runtime_id:<12} | {runtime.state:<20}")
 
 
-class DefaultPlanner(_DefaultPlanner):
-    def __init__(self, gpu_id: int = 0):
-        compressed = Model.get_model().compress()
-        super().__init__(compressed, gpu_id)
-
-    def install_config_rule(self, rule: Callable[[str, str], str]):
-        """
-        Install a configuration rule.
-
-        Args:
-            rule: A function that takes an operator description and a target
-            architecture name and returns a configuration description.
-        """
-        super().install_config_rule(rule)
-
-    def plan(self, pretty: bool = True) -> str:
-        """
-        Generate an execution plan.
-
-        Args:
-            pretty: Whether to generate a pretty plan.
-        """
-        return super().plan(pretty)
-
-
 class Executor(_Executor):
-    pass
+    def __init__(self, plan: Plan, device_id: int, name: str):
+        super().__init__(plan.rank, plan.world_size, device_id, name, str(plan))
 
 
 class Runtime:
@@ -155,11 +132,8 @@ def running(self) -> bool:
 
     def launch(
         self,
-        rank: int = 0,
-        world_size: int = 1,
-        gpu_id: int = 0,
-        plan: str = "",
-        plan_path: str = "",
+        plan: Plan = None,
+        device_id: int = 0,
     ):
         """
         Create an executor and schedule the ARK model. The scheduler will generate
@@ -172,11 +146,7 @@ def launch(
             )
             return
         if not plan:
-            if not plan_path:
-                plan = DefaultPlanner(gpu_id).plan()
-            else:
-                with open(plan_path, "r") as f:
-                    plan = f.read()
+            plan = DefaultPlanner(device_id).plan()
         # If the RuntimeState is init, we need to create a new executor and
         # compile the kernels
         if self.state == Runtime.State.Init:
@@ -187,11 +157,9 @@ def launch(
                     )
                     self.executor.destroy()
             self.executor = Executor(
-                rank,
-                world_size,
-                gpu_id,
-                "ArkRuntime",
                 plan,
+                device_id,
+                "ArkRuntime",
             )
             self.executor.compile()
         self.executor.launch()
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index fd34bb96b..b075c64ea 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -2,18 +2,9 @@
 # Licensed under the MIT license.
 
 import ark
-import json
 
-empty_plan = json.dumps(
-    {
-        "Rank": 0,
-        "WorldSize": 1,
-        "NumProcessors": 1,
-        "NumWarpsPerProcessor": 1,
-        "TaskInfos": [],
-        "ProcessorGroups": [],
-    }
-)
+
+empty_plan = ark.Plan(None)
 
 
 def test_runtime_relaunch():
@@ -35,7 +26,7 @@ def test_multiple_runtime_launch():
     for i in range(num_runtimes):
         rt = ark.Runtime.get_runtime(i)
         assert rt.launched() == False
-        rt.launch(gpu_id=i, plan=empty_plan)
+        rt.launch(plan=empty_plan, device_id=i)
         assert rt.launched() == True
     for i in range(num_runtimes):
         rt = ark.Runtime.get_runtime(i)
@@ -46,9 +37,9 @@ def test_multiple_runtime_launch():
 def test_stop_runtime():
     ark.init()
     rt1 = ark.Runtime.get_runtime(1)
-    rt1.launch(plan=empty_plan, gpu_id=1)
+    rt1.launch(plan=empty_plan, device_id=1)
     rt2 = ark.Runtime.get_runtime(2)
-    rt2.launch(plan=empty_plan, gpu_id=2)
+    rt2.launch(plan=empty_plan, device_id=2)
     rt1.stop()
     rt1.reset()
     assert rt1.state == ark.Runtime.State.Init
@@ -59,9 +50,9 @@ def test_stop_runtime():
 def test_reset_runtime():
     ark.init()
     rt1 = ark.Runtime.get_runtime(0)
-    rt1.launch(plan=empty_plan, gpu_id=1)
+    rt1.launch(plan=empty_plan, device_id=1)
     rt2 = ark.Runtime.get_runtime(1)
-    rt2.launch(plan=empty_plan, gpu_id=2)
+    rt2.launch(plan=empty_plan, device_id=2)
     rt1.reset()
     assert rt1.launched() == False
     assert rt2.launched() == True
@@ -77,7 +68,7 @@ def test_multiple_runtimes_complex():
     default_runtime = ark.Runtime.get_runtime()
     runtime_list.append(default_runtime)
     for i, rt in enumerate(runtime_list):
-        rt.launch(plan=empty_plan, gpu_id=i)
+        rt.launch(plan=empty_plan, device_id=i)
         assert rt.launched() == True
     runtime_list[0].stop()
     assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
@@ -87,7 +78,7 @@ def test_multiple_runtimes_complex():
     assert runtime_list[1].state == ark.Runtime.State.Init
     assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
     assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning
-    runtime_list[1].launch(plan=empty_plan, gpu_id=1)
+    runtime_list[1].launch(plan=empty_plan, device_id=1)
     for rt in runtime_list:
         assert rt.launched() == True
     ark.Runtime.delete_all_runtimes()

From 0cb10b92c601306d537eb3de6259cf73e59b33df Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 3 Jul 2024 07:58:34 +0000
Subject: [PATCH 22/79] fix a reduction perf bug

---
 ark/include/kernels/reduce.h | 18 +++++++++---------
 plan_gpu0.json               | 36 ++++++++++++++++++------------------
 python/ark/profiler.py       | 24 +++++++++++++++---------
 3 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h
index 30c8b7831..3d0b4e008 100644
--- a/ark/include/kernels/reduce.h
+++ b/ark/include/kernels/reduce.h
@@ -53,7 +53,7 @@ DEVICE bf16 warpReduce(bf16 val) {
 template <typename ReduceType, typename UnitOp, int LanesNum, typename DataType>
 DEVICE DataType warpsReduce(DataType val, int tid, int smem_per_warp) {
     val = warpReduce<ReduceType, LanesNum>(val);
-    if (LanesNum > Arch::ThreadsPerWarp) {
+    if constexpr (LanesNum > Arch::ThreadsPerWarp) {
         ReduceSharedStorage<DataType> *shared =
             UnitOp::template shared_memory<ReduceSharedStorage<DataType>>(
                 smem_per_warp);
@@ -351,8 +351,8 @@ struct WwiseReduce {
     /// @param in Input tensor.
     /// @param uop_idx Index of the unit operator.
     template <typename DataType>
-    static DEVICE void runW(DataType *out, DataType *in, int uop_idx,
-                            int smem_per_warp) {
+    static DEVICE void run(DataType *out, DataType *in, int uop_idx,
+                           int smem_per_warp) {
         using ShapeChecker =
             ReduceShapeChecker<InShape, OutShape, UnitOutDims, Axis>;
         constexpr int NelemPerThread =
@@ -450,8 +450,8 @@ template <typename InDims, typename InShape, typename OutDims,
 DEVICE void reduce_w_sum(DataType *out, DataType *in, int uop_idx,
                          int smem_per_warp) {
     WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeSum, Axis>::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeSum, Axis>::run(out, in, uop_idx,
+                                                     smem_per_warp);
 }
 
 template <typename InDims, typename InShape, typename OutDims,
@@ -460,8 +460,8 @@ template <typename InDims, typename InShape, typename OutDims,
 DEVICE void reduce_w_mean(DataType *out, DataType *in, int uop_idx,
                           int smem_per_warp) {
     WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeMean, Axis>::runW(out, in, uop_idx,
-                                                       smem_per_warp);
+                SmemBytes, ReduceTypeMean, Axis>::run(out, in, uop_idx,
+                                                      smem_per_warp);
 }
 
 template <typename InDims, typename InShape, typename OutDims,
@@ -470,8 +470,8 @@ template <typename InDims, typename InShape, typename OutDims,
 DEVICE void reduce_w_max(DataType *out, DataType *in, int uop_idx,
                          int smem_per_warp) {
     WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeMax, Axis>::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeMax, Axis>::run(out, in, uop_idx,
+                                                     smem_per_warp);
 }
 
 }  // namespace ark
diff --git a/plan_gpu0.json b/plan_gpu0.json
index 63c1943e3..99e2da8fa 100644
--- a/plan_gpu0.json
+++ b/plan_gpu0.json
@@ -314,7 +314,7 @@
     {
       "Id": 10,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMax",
@@ -336,7 +336,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
         }
@@ -402,7 +402,7 @@
     {
       "Id": 13,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceSum",
@@ -424,7 +424,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
         }
@@ -613,7 +613,7 @@
     {
       "Id": 20,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMean",
@@ -635,7 +635,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 2048
           }
         }
@@ -1064,7 +1064,7 @@
     {
       "Id": 35,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMax",
@@ -1086,7 +1086,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
         }
@@ -1152,7 +1152,7 @@
     {
       "Id": 38,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceSum",
@@ -1174,7 +1174,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
         }
@@ -1392,7 +1392,7 @@
     {
       "Id": 46,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMean",
@@ -1414,7 +1414,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 2048
           }
         }
@@ -1883,7 +1883,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":10,"TaskRange":[0,65536],"Granularity":1}
           ]
@@ -1922,7 +1922,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":13,"TaskRange":[0,65536],"Granularity":1}
           ]
@@ -2013,7 +2013,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":20,"TaskRange":[0,2048],"Granularity":1}
           ]
@@ -2208,7 +2208,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":35,"TaskRange":[0,65536],"Granularity":1}
           ]
@@ -2247,7 +2247,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":38,"TaskRange":[0,65536],"Granularity":1}
           ]
@@ -2351,7 +2351,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":46,"TaskRange":[0,2048],"Granularity":1}
           ]
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
index feb78e0de..529a0d506 100644
--- a/python/ark/profiler.py
+++ b/python/ark/profiler.py
@@ -8,11 +8,22 @@
 from .planner import Plan
 
 
+def timeit(plan: Plan):
+    with Runtime() as rt:
+        rt.launch(plan=plan)
+        start_time = time.time()
+        iter = 1000
+        rt.run(iter=iter)
+        end_time = time.time()
+        return (end_time - start_time) / iter
+
+
 class Profiler:
     def __init__(self, plan: Plan):
         self.plan = plan
 
     def run(self):
+        sys.stderr.write(f"End-to-end: {timeit(self.plan):.6f} seconds/iter\n")
         num_processor_groups = len(self.plan.processor_groups)
         new_plan = {
             "Rank": self.plan.rank,
@@ -25,12 +36,7 @@ def run(self):
         }
         for i in range(num_processor_groups):
             new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i]
-            with Runtime() as rt:
-                rt.launch(plan=str(new_plan))
-                start_time = time.time()
-                iter = 1000
-                rt.run(iter=iter)
-                end_time = time.time()
-                sys.stderr.write(
-                    f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n"
-                )
+            lat_per_iter = timeit(Plan(new_plan))
+            sys.stderr.write(
+                f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n"
+            )

From 0fde9c5dc486ba1edb20235115575d360558ece9 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 4 Jul 2024 07:17:32 +0000
Subject: [PATCH 23/79] optimize

---
 ark/include/kernels/common/sync.h |  12 +--
 ark/ops/ops_broadcast.cpp         |   4 +-
 examples/llama/model_test.py      |   2 +-
 plan_gpu0.json                    | 172 ++++++++----------------------
 4 files changed, 51 insertions(+), 139 deletions(-)

diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h
index 85f7639c9..f47625600 100644
--- a/ark/include/kernels/common/sync.h
+++ b/ark/include/kernels/common/sync.h
@@ -106,25 +106,19 @@ DEVICE void sync_warps() {
     static_assert(Arch::ThreadsPerWarp == 64, "");
     if constexpr (NumWarps == 1) {
         __builtin_amdgcn_wave_barrier();
-    } else if constexpr (NumWarps == 16) {
-        __syncthreads();
     } else {
         static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState),
                       "");
-        int lane_id = threadIdx.x & 63;
-        if (lane_id == 0) {
+        if ((threadIdx.x & 63) == 0) {
             constexpr int MaxOldCnt = NumWarps - 1;
-            int warp_id = threadIdx.x >> 6;
-            int group_id = warp_id / NumWarps;
+            int group_id = (threadIdx.x >> 6) / NumWarps;
             sync::WarpGroupState *state =
                 reinterpret_cast<sync::WarpGroupState *>(_ARK_SMEM);
             unsigned int tmp = state->is_inc_flag[group_id] ^ 1;
             if (atomicInc(&state->cnt[group_id], MaxOldCnt) == MaxOldCnt) {
                 state->flag[group_id] = tmp;
             } else {
-                while (atomicAdd(&state->flag[group_id], 0) != tmp)
-                    __builtin_amdgcn_s_sleep(1);
-                __asm__ __volatile__("s_wakeup");
+                while (atomicAdd(&state->flag[group_id], 0) != tmp);
             }
             state->is_inc_flag[group_id] = tmp;
         }
diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp
index 3985a0500..f20e8c4dc 100644
--- a/ark/ops/ops_broadcast.cpp
+++ b/ark/ops/ops_broadcast.cpp
@@ -27,8 +27,8 @@ ModelOpBroadcast1::ModelOpBroadcast1(const std::string &type_name,
 std::string ModelOpBroadcast1::impl_name(const Json &config) const {
     check_fields_config(config, {"NumWarps", "Tile"});
     int num_warps = config.at("NumWarps");
-    auto &tile_shape = config.at("Tile");
-    Dims unit_out_dims{tile_shape[0], tile_shape[1]};
+    const auto& tile_shape = config.at("Tile").get<std::vector<DimType>>();
+    Dims unit_out_dims(tile_shape);
 
     return function_name_string(
         pascal_to_snake(type()->type_name()),
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py
index 71485be45..053015c04 100644
--- a/examples/llama/model_test.py
+++ b/examples/llama/model_test.py
@@ -473,7 +473,7 @@ def test_transformer_block(
         module_name_prefix="layers.0",
         rank=rank,
         world_size=world_size,
-        test_thru=True,
+        test_thru=False,
     )
 
 
diff --git a/plan_gpu0.json b/plan_gpu0.json
index 99e2da8fa..cad05f774 100644
--- a/plan_gpu0.json
+++ b/plan_gpu0.json
@@ -31,7 +31,7 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
         }
@@ -39,7 +39,7 @@
     },
     {
       "Id": 1,
-      "NumWarps": 1,
+      "NumWarps": 4,
       "SramBytes": 0,
       "Ops": [
         {
@@ -58,17 +58,17 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [32,128],
-            "NumTasks": 2048
+            "Tile": [256,1,128],
+            "NumTasks": 256
           }
         }
       ]
     },
     {
       "Id": 2,
-      "NumWarps": 1,
+      "NumWarps": 4,
       "SramBytes": 0,
       "Ops": [
         {
@@ -88,10 +88,10 @@
             "Permutation": {"DIMS":[0,2,1,3]}
           },
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [8,128],
-            "NumTasks": 8192
+            "Tile": [256,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -122,7 +122,7 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
         }
@@ -130,7 +130,7 @@
     },
     {
       "Id": 4,
-      "NumWarps": 1,
+      "NumWarps": 4,
       "SramBytes": 0,
       "Ops": [
         {
@@ -149,17 +149,17 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [32,128],
-            "NumTasks": 2048
+            "Tile": [256,1,128],
+            "NumTasks": 256
           }
         }
       ]
     },
     {
       "Id": 5,
-      "NumWarps": 1,
+      "NumWarps": 4,
       "SramBytes": 0,
       "Ops": [
         {
@@ -170,19 +170,19 @@
             {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+            {"Id":23,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+            {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
-            "Permutation": {"DIMS":[0,2,3,1]}
+            "Permutation": {"DIMS":[0,2,1,3]}
           },
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [8,128],
-            "NumTasks": 8192
+            "Tile": [256,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -213,7 +213,7 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
         }
@@ -221,7 +221,7 @@
     },
     {
       "Id": 7,
-      "NumWarps": 1,
+      "NumWarps": 4,
       "SramBytes": 0,
       "Ops": [
         {
@@ -241,10 +241,10 @@
             "Permutation": {"DIMS":[0,2,1,3]}
           },
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [8,128],
-            "NumTasks": 8192
+            "Tile": [256,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -260,7 +260,7 @@
           "IsVirtual": false,
           "ReadTensors": [
             {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+            {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
             {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
@@ -270,12 +270,12 @@
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":false}
+            "TransposeOther": {"BOOL":true}
           },
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 4096
           }
         }
@@ -305,7 +305,7 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [128,256],
+            "Tile": [256,128],
             "NumTasks": 4096
           }
         }
@@ -1747,119 +1747,36 @@
     }
   ],
   "ProcessorGroups": [
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":0,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
     {
       "ProcessorRange": [0,304],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":1,"TaskRange":[0,2048],"Granularity":4}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":2,"TaskRange":[0,8192],"Granularity":4}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
+          "ProcessorRange": [0,86],
           "WarpRange": [0,4],
           "SramRange": [0,24672],
           "TaskGroups": [
-            {"TaskId":3,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":4,"TaskRange":[0,2048],"Granularity":4}
+            {"TaskId":0,"TaskRange":[0,256],"Granularity":1},
+            {"TaskId":1,"TaskRange":[0,256],"Granularity":1},
+            {"TaskId":2,"TaskRange":[0,256],"Granularity":1}
           ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
+        },
         {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":5,"TaskRange":[0,8192],"Granularity":4}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
+          "ProcessorRange": [86,172],
           "WarpRange": [0,4],
           "SramRange": [0,24672],
           "TaskGroups": [
-            {"TaskId":6,"TaskRange":[0,256],"Granularity":1}
+            {"TaskId":3,"TaskRange":[0,256],"Granularity":1},
+            {"TaskId":4,"TaskRange":[0,256],"Granularity":1},
+            {"TaskId":5,"TaskRange":[0,256],"Granularity":1}
           ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":7,"TaskRange":[0,8192],"Granularity":4}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
+        },
         {
-          "ProcessorRange": [0,304],
+          "ProcessorRange": [172,258],
           "WarpRange": [0,4],
           "SramRange": [0,24672],
           "TaskGroups": [
-            {"TaskId":8,"TaskRange":[0,4096],"Granularity":1}
+            {"TaskId":6,"TaskRange":[0,256],"Granularity":1},
+            {"TaskId":7,"TaskRange":[0,256],"Granularity":1}
           ]
         }
       ]
@@ -1870,8 +1787,9 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,0],
+          "SramRange": [0,24672],
           "TaskGroups": [
+            {"TaskId":8,"TaskRange":[0,4096],"Granularity":1},
             {"TaskId":9,"TaskRange":[0,4096],"Granularity":1}
           ]
         }

From c4be6d1bf7b7fcacdd11dd3efad7b4170461ce41 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 5 Jul 2024 00:14:05 +0000
Subject: [PATCH 24/79] wip

---
 ark/codegen.cpp                             |    6 +-
 arkprof.py                                  |    4 +
 examples/llama/model_test.py                |   23 +-
 examples/llama/plan_llama2_7b_b1_s2048.json | 1723 +++++++++++++++++++
 python/ark/profiler.py                      |   12 +-
 5 files changed, 1751 insertions(+), 17 deletions(-)
 create mode 100644 arkprof.py
 create mode 100644 examples/llama/plan_llama2_7b_b1_s2048.json

diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index 55327329a..587bcae59 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -298,10 +298,14 @@ std::string CodeGenerator::Impl::resource_group(
     size_t proc_b = *rg_proc_range.begin();
     size_t proc_e = *rg_proc_range.end();
     size_t proc_s = rg_proc_range.step();
+    std::map<size_t, Json> task_infos_map;
+    for (auto &task_info : task_infos) {
+        task_infos_map[task_info.at("Id").get<size_t>()] = task_info;
+    }
     std::stringstream ss;
     for (auto &tg : rg_json["TaskGroups"]) {
         size_t task_id = tg["TaskId"];
-        auto &task_info = task_infos[task_id];
+        auto &task_info = task_infos_map.at(task_id);
         Range<size_t> task_range(tg["TaskRange"][0], tg["TaskRange"][1]);
         size_t task_gran = tg["Granularity"];
         size_t num_warps_per_task = task_info["NumWarps"];
diff --git a/arkprof.py b/arkprof.py
new file mode 100644
index 000000000..782bba560
--- /dev/null
+++ b/arkprof.py
@@ -0,0 +1,4 @@
+import ark
+import sys
+
+ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(iter=1000, profile_processor_groups=False)
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py
index 053015c04..19c680854 100644
--- a/examples/llama/model_test.py
+++ b/examples/llama/model_test.py
@@ -59,7 +59,8 @@ def run_ark(
     output = module(*module_inputs)
 
     with ark.Runtime() as rt:
-        rt.launch(ark.Plan.from_file("/mnt/changhohwang/ark/plan_gpu0.json"))
+        plan = ark.Plan.from_file("plan_llama2_7b_b1_s2048.json")
+        rt.launch(plan)
 
         # Load model parameters
         if state_dict:
@@ -438,22 +439,22 @@ def test_transformer_block(
         low=-1, high=1, size=(batch_size, seq_len, args.dim)
     ).astype(dtype)
 
-    module = model_ark.Attention(
-        args, ark.DataType.from_numpy(dtype), rank, world_size
-    )
+    # module = model_ark.Attention(
+    #     args, ark.DataType.from_numpy(dtype), rank, world_size
+    # )
     # module_inputs = [
     #     ark.tensor(list(i.shape), ark.DataType.from_numpy(i.dtype))
     #     if isinstance(i, np.ndarray)
     #     else i
     #     for i in inputs
     # ]
-    feature_tensor = ark.tensor(
-        list(feature.shape), ark.DataType.from_numpy(feature.dtype)
-    )
-    freqs_cis_ark_tensor = ark.tensor(
-        list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype)
-    )
-    output = module(feature_tensor, 0, freqs_cis_ark_tensor, None)
+    # feature_tensor = ark.tensor(
+    #     list(feature.shape), ark.DataType.from_numpy(feature.dtype)
+    # )
+    # freqs_cis_ark_tensor = ark.tensor(
+    #     list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype)
+    # )
+    # output = module(feature_tensor, 0, freqs_cis_ark_tensor, None)
 
     # print(ark.Model.get_model().serialize())
 
diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json
new file mode 100644
index 000000000..d0e46d228
--- /dev/null
+++ b/examples/llama/plan_llama2_7b_b1_s2048.json
@@ -0,0 +1,1723 @@
+{
+  "Rank": 0,
+  "WorldSize": 1,
+  "Architecture": "ROCM_942",
+  "NumProcessors": 304,
+  "NumWarpsPerProcessor": 4,
+  "TaskInfos": [
+    {
+      "Id": 0,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":11,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 1,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":13,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 2,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceMean",
+          "Name": "reduce_mean",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":15,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Axis": {"INT":2},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 3,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rsqrt",
+          "Name": "rsqrt",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":17,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [64,1],
+            "NumTasks": 32
+          }
+        }
+      ]
+    },
+    {
+      "Id": 4,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":19,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 5,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":7,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 6,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":22,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 7,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":24,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":25,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 8,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rope",
+          "Name": "rope",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":30,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":33,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 9,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,8],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 10,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":26,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":27,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 11,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rope",
+          "Name": "rope_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":31,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":35,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 12,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":41,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,3,1]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,8],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 13,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":28,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":29,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 14,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":32,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":39,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,8],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 15,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":43,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":false}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 4096
+          }
+        }
+      ]
+    },
+    {
+      "Id": 16,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "ScalarMul",
+          "Name": "mul_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":45,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Factor": {"FLOAT":0.0883883461356163}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 2097152
+          }
+        }
+      ]
+    },
+    {
+      "Id": 17,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceMax",
+          "Name": "reduce_max",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":47,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Axis": {"INT":3},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 65536
+          }
+        }
+      ]
+    },
+    {
+      "Id": 18,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Sub",
+          "Name": "sub",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 2097152
+          }
+        }
+      ]
+    },
+    {
+      "Id": 19,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Exp",
+          "Name": "exp",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 2097152
+          }
+        }
+      ]
+    },
+    {
+      "Id": 20,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceSum",
+          "Name": "reduce_sum",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":51,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Axis": {"INT":3},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 65536
+          }
+        }
+      ]
+    },
+    {
+      "Id": 21,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Div",
+          "Name": "div",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 2097152
+          }
+        }
+      ]
+    },
+    {
+      "Id": 22,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_4",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":54,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":false}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [256,128,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 23,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Transpose",
+          "Name": "transpose_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":56,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":57,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Permutation": {"DIMS":[0,2,1,3]}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [8,8],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 24,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_5",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":58,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":59,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 25,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Add",
+          "Name": "add",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":61,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 26,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast_2",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 27,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_4",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":65,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 28,
+      "NumWarps": 1,
+      "SramBytes": 256,
+      "Ops": [
+        {
+          "Type": "ReduceMean",
+          "Name": "reduce_mean_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":67,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "Axis": {"INT":2},
+            "KeepDim": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 1,
+            "ImplType": "WarpWise",
+            "SramBytes": 256,
+            "NumTasks": 2048
+          }
+        }
+      ]
+    },
+    {
+      "Id": 29,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Rsqrt",
+          "Name": "rsqrt_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":69,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [64,1],
+            "NumTasks": 32
+          }
+        }
+      ]
+    },
+    {
+      "Id": 30,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_5",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":71,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 31,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_6",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":8,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 32,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Cast",
+          "Name": "cast_3",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":74,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    },
+    {
+      "Id": 33,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_6",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":4,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":76,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 688
+          }
+        }
+      ]
+    },
+    {
+      "Id": 34,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Sigmoid",
+          "Name": "sigmoid",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":78,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 352256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 35,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":80,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 352256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 36,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":82,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 688
+          }
+        }
+      ]
+    },
+    {
+      "Id": 37,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Mul",
+          "Name": "mul_8",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":84,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 352256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 38,
+      "NumWarps": 4,
+      "SramBytes": 24672,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_8",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":5,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":86,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 24672,
+            "TileShapeMNK": [128,256,32],
+            "NumTasks": 256
+          }
+        }
+      ]
+    },
+    {
+      "Id": 39,
+      "NumWarps": 1,
+      "SramBytes": 0,
+      "Ops": [
+        {
+          "Type": "Add",
+          "Name": "add_1",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":88,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":89,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 1,
+            "SramBytes": 0,
+            "Tile": [1,64],
+            "NumTasks": 131072
+          }
+        }
+      ]
+    }
+  ],
+  "ProcessorGroups": [
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":0,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":1,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":2,"TaskRange":[0,2048],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,32],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,32],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":3,"TaskRange":[0,32],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":4,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":5,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":6,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":7,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":8,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":9,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":10,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":11,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":12,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":13,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":14,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":15,"TaskRange":[0,4096],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":16,"TaskRange":[0,2097152],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":17,"TaskRange":[0,65536],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":18,"TaskRange":[0,2097152],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":19,"TaskRange":[0,2097152],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":20,"TaskRange":[0,65536],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":21,"TaskRange":[0,2097152],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":22,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":23,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":24,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":25,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":26,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":27,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,256],
+          "TaskGroups": [
+            {"TaskId":28,"TaskRange":[0,2048],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,32],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,32],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":29,"TaskRange":[0,32],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":30,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":31,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":32,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":33,"TaskRange":[0,688],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":34,"TaskRange":[0,352256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":35,"TaskRange":[0,352256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":36,"TaskRange":[0,688],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":37,"TaskRange":[0,352256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,256],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
+          "TaskGroups": [
+            {"TaskId":38,"TaskRange":[0,256],"Granularity":1}
+          ]
+        }
+      ]
+    },
+    {
+      "ProcessorRange": [0,304],
+      "ResourceGroups": [
+        {
+          "ProcessorRange": [0,304],
+          "WarpRange": [0,1],
+          "SramRange": [0,0],
+          "TaskGroups": [
+            {"TaskId":39,"TaskRange":[0,131072],"Granularity":1}
+          ]
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
index 529a0d506..56233247c 100644
--- a/python/ark/profiler.py
+++ b/python/ark/profiler.py
@@ -8,11 +8,10 @@
 from .planner import Plan
 
 
-def timeit(plan: Plan):
+def timeit(plan: Plan, iter: int):
     with Runtime() as rt:
         rt.launch(plan=plan)
         start_time = time.time()
-        iter = 1000
         rt.run(iter=iter)
         end_time = time.time()
         return (end_time - start_time) / iter
@@ -22,8 +21,11 @@ class Profiler:
     def __init__(self, plan: Plan):
         self.plan = plan
 
-    def run(self):
-        sys.stderr.write(f"End-to-end: {timeit(self.plan):.6f} seconds/iter\n")
+    def run(self, iter: int = 1000, profile_processor_groups: bool = False):
+        sys.stderr.write(f"End-to-end: {timeit(self.plan, iter):.6f} seconds/iter\n")
+
+        if not profile_processor_groups:
+            return
         num_processor_groups = len(self.plan.processor_groups)
         new_plan = {
             "Rank": self.plan.rank,
@@ -36,7 +38,7 @@ def run(self):
         }
         for i in range(num_processor_groups):
             new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i]
-            lat_per_iter = timeit(Plan(new_plan))
+            lat_per_iter = timeit(Plan(new_plan), iter)
             sys.stderr.write(
                 f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n"
             )

From cc30912486c24f71617ee2200c7429ea2e610d51 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 5 Jul 2024 07:12:49 +0000
Subject: [PATCH 25/79] optimization

---
 examples/llama/plan_llama2_7b_b1_s2048.json | 732 ++++----------------
 1 file changed, 126 insertions(+), 606 deletions(-)

diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json
index d0e46d228..15b0de2d0 100644
--- a/examples/llama/plan_llama2_7b_b1_s2048.json
+++ b/examples/llama/plan_llama2_7b_b1_s2048.json
@@ -27,17 +27,10 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [1,4096],
+            "NumTasks": 2048
           }
-        }
-      ]
-    },
-    {
-      "Id": 1,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Mul",
           "Name": "mul",
@@ -56,17 +49,10 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [1,4096],
+            "NumTasks": 2048
           }
-        }
-      ]
-    },
-    {
-      "Id": 2,
-      "NumWarps": 1,
-      "SramBytes": 256,
-      "Ops": [
+        },
         {
           "Type": "ReduceMean",
           "Name": "reduce_mean",
@@ -87,7 +73,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 2048
           }
         }
@@ -144,17 +130,10 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [1,4096],
+            "NumTasks": 2048
           }
-        }
-      ]
-    },
-    {
-      "Id": 5,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Mul",
           "Name": "mul_2",
@@ -173,17 +152,10 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [1,4096],
+            "NumTasks": 2048
           }
-        }
-      ]
-    },
-    {
-      "Id": 6,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Cast",
           "Name": "cast_1",
@@ -201,8 +173,8 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [1,4096],
+            "NumTasks": 2048
           }
         }
       ]
@@ -233,17 +205,10 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 8,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Rope",
           "Name": "rope",
@@ -260,19 +225,12 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [256,1,128],
+            "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 9,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Transpose",
           "Name": "transpose",
@@ -290,10 +248,10 @@
             "Permutation": {"DIMS":[0,2,1,3]}
           },
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [8,8],
-            "NumTasks": 131072
+            "Tile": [256,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -324,17 +282,10 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 11,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Rope",
           "Name": "rope_1",
@@ -351,19 +302,12 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [256,128],
+            "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 12,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Transpose",
           "Name": "transpose_2",
@@ -372,19 +316,19 @@
             {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
-            {"Id":41,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+            {"Id":41,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "ResultTensors": [
-            {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+            {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "Args": {
-            "Permutation": {"DIMS":[0,2,3,1]}
+            "Permutation": {"DIMS":[0,2,1,3]}
           },
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [8,8],
-            "NumTasks": 131072
+            "Tile": [256,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -415,17 +359,10 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 14,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Transpose",
           "Name": "transpose_1",
@@ -443,10 +380,10 @@
             "Permutation": {"DIMS":[0,2,1,3]}
           },
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [8,8],
-            "NumTasks": 131072
+            "Tile": [256,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -462,7 +399,7 @@
           "IsVirtual": false,
           "ReadTensors": [
             {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+            {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
           ],
           "WriteTensors": [
             {"Id":43,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
@@ -472,22 +409,15 @@
           ],
           "Args": {
             "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":false}
+            "TransposeOther": {"BOOL":true}
           },
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 4096
           }
-        }
-      ]
-    },
-    {
-      "Id": 16,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "ScalarMul",
           "Name": "mul_3",
@@ -505,10 +435,10 @@
             "Factor": {"FLOAT":0.0883883461356163}
           },
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 2097152
+            "Tile": [256,128],
+            "NumTasks": 4096
           }
         }
       ]
@@ -516,7 +446,7 @@
     {
       "Id": 17,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMax",
@@ -538,17 +468,10 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
-        }
-      ]
-    },
-    {
-      "Id": 18,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Sub",
           "Name": "sub",
@@ -567,17 +490,10 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 2097152
+            "Tile": [1,2048],
+            "NumTasks": 65536
           }
-        }
-      ]
-    },
-    {
-      "Id": 19,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Exp",
           "Name": "exp",
@@ -595,17 +511,10 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 2097152
+            "Tile": [1,2048],
+            "NumTasks": 65536
           }
-        }
-      ]
-    },
-    {
-      "Id": 20,
-      "NumWarps": 1,
-      "SramBytes": 256,
-      "Ops": [
+        },
         {
           "Type": "ReduceSum",
           "Name": "reduce_sum",
@@ -626,17 +535,10 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
-        }
-      ]
-    },
-    {
-      "Id": 21,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Div",
           "Name": "div",
@@ -655,8 +557,8 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 2097152
+            "Tile": [1,2048],
+            "NumTasks": 65536
           }
         }
       ]
@@ -690,14 +592,7 @@
             "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 23,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Transpose",
           "Name": "transpose_3",
@@ -715,10 +610,10 @@
             "Permutation": {"DIMS":[0,2,1,3]}
           },
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [8,8],
-            "NumTasks": 131072
+            "Tile": [256,1,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -749,17 +644,10 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 25,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Add",
           "Name": "add",
@@ -776,19 +664,12 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [256,128],
+            "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 26,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Cast",
           "Name": "cast_2",
@@ -804,19 +685,12 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [256,128],
+            "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 27,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Mul",
           "Name": "mul_4",
@@ -833,10 +707,10 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [256,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -844,7 +718,7 @@
     {
       "Id": 28,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMean",
@@ -866,7 +740,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 2048
           }
         }
@@ -923,17 +797,10 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [1,4096],
+            "NumTasks": 2048
           }
-        }
-      ]
-    },
-    {
-      "Id": 31,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Mul",
           "Name": "mul_6",
@@ -952,17 +819,10 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [1,4096],
+            "NumTasks": 2048
           }
-        }
-      ]
-    },
-    {
-      "Id": 32,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Cast",
           "Name": "cast_3",
@@ -980,8 +840,8 @@
           "Config": {
             "NumWarps": 1,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [1,4096],
+            "NumTasks": 2048
           }
         }
       ]
@@ -1012,17 +872,10 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 688
           }
-        }
-      ]
-    },
-    {
-      "Id": 34,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Sigmoid",
           "Name": "sigmoid",
@@ -1038,19 +891,12 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 352256
+            "Tile": [256,128],
+            "NumTasks": 688
           }
-        }
-      ]
-    },
-    {
-      "Id": 35,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Mul",
           "Name": "mul_7",
@@ -1067,10 +913,10 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 352256
+            "Tile": [256,128],
+            "NumTasks": 688
           }
         }
       ]
@@ -1101,17 +947,10 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 688
           }
-        }
-      ]
-    },
-    {
-      "Id": 37,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Mul",
           "Name": "mul_8",
@@ -1128,10 +967,10 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 352256
+            "Tile": [256,128],
+            "NumTasks": 688
           }
         }
       ]
@@ -1162,17 +1001,10 @@
           "Config": {
             "NumWarps": 4,
             "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
+            "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
-        }
-      ]
-    },
-    {
-      "Id": 39,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
+        },
         {
           "Type": "Add",
           "Name": "add_1",
@@ -1189,10 +1021,10 @@
           ],
           "Args": {},
           "Config": {
-            "NumWarps": 1,
+            "NumWarps": 4,
             "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
+            "Tile": [256,128],
+            "NumTasks": 256
           }
         }
       ]
@@ -1204,23 +1036,23 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
+          "WarpRange": [0,4],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":0,"TaskRange":[0,131072],"Granularity":1}
+            {"TaskId":0,"TaskRange":[0,2048],"Granularity":4}
           ]
         }
       ]
     },
     {
-      "ProcessorRange": [0,304],
+      "ProcessorRange": [0,32],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,304],
+          "ProcessorRange": [0,32],
           "WarpRange": [0,1],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":1,"TaskRange":[0,131072],"Granularity":1}
+            {"TaskId":3,"TaskRange":[0,32],"Granularity":1}
           ]
         }
       ]
@@ -1230,101 +1062,23 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":2,"TaskRange":[0,2048],"Granularity":1}
+            {"TaskId":4,"TaskRange":[0,2048],"Granularity":4}
           ]
         }
       ]
     },
     {
-      "ProcessorRange": [0,32],
+      "ProcessorRange": [0,256],
       "ResourceGroups": [
         {
-          "ProcessorRange": [0,32],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
+          "ProcessorRange": [0,256],
+          "WarpRange": [0,4],
+          "SramRange": [0,24672],
           "TaskGroups": [
-            {"TaskId":3,"TaskRange":[0,32],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":4,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":5,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":6,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":7,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":8,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":9,"TaskRange":[0,131072],"Granularity":1}
+            {"TaskId":7,"TaskRange":[0,256],"Granularity":1}
           ]
         }
       ]
@@ -1342,32 +1096,6 @@
         }
       ]
     },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":11,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":12,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
     {
       "ProcessorRange": [0,256],
       "ResourceGroups": [
@@ -1381,19 +1109,6 @@
         }
       ]
     },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":14,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
     {
       "ProcessorRange": [0,304],
       "ResourceGroups": [
@@ -1412,75 +1127,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":16,"TaskRange":[0,2097152],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,256],
-          "TaskGroups": [
-            {"TaskId":17,"TaskRange":[0,65536],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":18,"TaskRange":[0,2097152],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":19,"TaskRange":[0,2097152],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,256],
-          "TaskGroups": [
-            {"TaskId":20,"TaskRange":[0,65536],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
+          "WarpRange": [0,4],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":21,"TaskRange":[0,2097152],"Granularity":1}
+            {"TaskId":17,"TaskRange":[0,65536],"Granularity":4}
           ]
         }
       ]
@@ -1498,19 +1148,6 @@
         }
       ]
     },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":23,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
     {
       "ProcessorRange": [0,256],
       "ResourceGroups": [
@@ -1529,49 +1166,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":25,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":26,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
+          "WarpRange": [0,4],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":27,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,256],
-          "TaskGroups": [
-            {"TaskId":28,"TaskRange":[0,2048],"Granularity":1}
+            {"TaskId":28,"TaskRange":[0,2048],"Granularity":4}
           ]
         }
       ]
@@ -1594,36 +1192,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":30,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":31,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
+          "WarpRange": [0,4],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":32,"TaskRange":[0,131072],"Granularity":1}
+            {"TaskId":30,"TaskRange":[0,2048],"Granularity":4}
           ]
         }
       ]
@@ -1641,32 +1213,6 @@
         }
       ]
     },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":34,"TaskRange":[0,352256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":35,"TaskRange":[0,352256],"Granularity":1}
-          ]
-        }
-      ]
-    },
     {
       "ProcessorRange": [0,304],
       "ResourceGroups": [
@@ -1680,19 +1226,6 @@
         }
       ]
     },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":37,"TaskRange":[0,352256],"Granularity":1}
-          ]
-        }
-      ]
-    },
     {
       "ProcessorRange": [0,256],
       "ResourceGroups": [
@@ -1705,19 +1238,6 @@
           ]
         }
       ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,1],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":39,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
     }
   ]
 }
\ No newline at end of file

From 34a87d867669aae49b2a29056aadfed694d97b33 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 8 Jul 2024 02:10:40 +0000
Subject: [PATCH 26/79] optimize

---
 examples/llama/plan_llama2_7b_b1_s2048.json | 97 ++++++++++++++++-----
 1 file changed, 76 insertions(+), 21 deletions(-)

diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json
index 15b0de2d0..d5c9fe552 100644
--- a/examples/llama/plan_llama2_7b_b1_s2048.json
+++ b/examples/llama/plan_llama2_7b_b1_s2048.json
@@ -3,7 +3,7 @@
   "WorldSize": 1,
   "Architecture": "ROCM_942",
   "NumProcessors": 304,
-  "NumWarpsPerProcessor": 4,
+  "NumWarpsPerProcessor": 8,
   "TaskInfos": [
     {
       "Id": 0,
@@ -948,7 +948,7 @@
             "NumWarps": 4,
             "SramBytes": 24672,
             "TileShapeMNK": [256,128,32],
-            "NumTasks": 688
+            "NumTasks": 602
           }
         },
         {
@@ -970,7 +970,61 @@
             "NumWarps": 4,
             "SramBytes": 0,
             "Tile": [256,128],
-            "NumTasks": 688
+            "NumTasks": 602
+          }
+        }
+      ]
+    },
+    {
+      "Id": 37,
+      "NumWarps": 4,
+      "SramBytes": 16480,
+      "Ops": [
+        {
+          "Type": "Matmul",
+          "Name": "matmul_7",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":102,"DataType":"FP16","Shape":[1,1792,4096],"Strides":[1,2048,4096],"Offsets":[0,256,0],"PaddedShape":[1,1792,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":101,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":100,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {
+            "TransposeInput": {"BOOL":false},
+            "TransposeOther": {"BOOL":true}
+          },
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 16480,
+            "TileShapeMNK": [128,128,32],
+            "NumTasks": 172
+          }
+        },
+        {
+          "Type": "Mul",
+          "Name": "mul_8",
+          "IsVirtual": false,
+          "ReadTensors": [
+            {"Id":81,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
+            {"Id":83,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "WriteTensors": [
+            {"Id":84,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "ResultTensors": [
+            {"Id":85,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
+          ],
+          "Args": {},
+          "Config": {
+            "NumWarps": 4,
+            "SramBytes": 0,
+            "Tile": [128,128],
+            "NumTasks": 172
           }
         }
       ]
@@ -1036,10 +1090,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
+          "WarpRange": [0,8],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":0,"TaskRange":[0,2048],"Granularity":4}
+            {"TaskId":0,"TaskRange":[0,2048],"Granularity":7}
           ]
         }
       ]
@@ -1062,10 +1116,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
+          "WarpRange": [0,8],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":4,"TaskRange":[0,2048],"Granularity":4}
+            {"TaskId":4,"TaskRange":[0,2048],"Granularity":7}
           ]
         }
       ]
@@ -1114,10 +1168,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
+          "WarpRange": [0,8],
+          "SramRange": [0,49344],
           "TaskGroups": [
-            {"TaskId":15,"TaskRange":[0,4096],"Granularity":1}
+            {"TaskId":15,"TaskRange":[0,4096],"Granularity":2}
           ]
         }
       ]
@@ -1127,10 +1181,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
+          "WarpRange": [0,8],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":17,"TaskRange":[0,65536],"Granularity":4}
+            {"TaskId":17,"TaskRange":[0,65536],"Granularity":8}
           ]
         }
       ]
@@ -1166,10 +1220,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
+          "WarpRange": [0,8],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":28,"TaskRange":[0,2048],"Granularity":4}
+            {"TaskId":28,"TaskRange":[0,2048],"Granularity":7}
           ]
         }
       ]
@@ -1192,10 +1246,10 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
+          "WarpRange": [0,8],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":30,"TaskRange":[0,2048],"Granularity":4}
+            {"TaskId":30,"TaskRange":[0,2048],"Granularity":7}
           ]
         }
       ]
@@ -1205,8 +1259,8 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
+          "WarpRange": [0,8],
+          "SramRange": [0,49344],
           "TaskGroups": [
             {"TaskId":33,"TaskRange":[0,688],"Granularity":1}
           ]
@@ -1218,10 +1272,11 @@
       "ResourceGroups": [
         {
           "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
+          "WarpRange": [0,8],
+          "SramRange": [0,49344],
           "TaskGroups": [
-            {"TaskId":36,"TaskRange":[0,688],"Granularity":1}
+            {"TaskId":36,"TaskRange":[0,602],"Granularity":2},
+            {"TaskId":37,"TaskRange":[0,172],"Granularity":1}
           ]
         }
       ]

From 866112de65a6fd5d3c3d89d80cdc53ff27c8c36a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 9 Jul 2024 01:07:21 +0000
Subject: [PATCH 27/79] optimize

---
 ark/include/kernels/common/sync.h           |  3 +
 ark/include/kernels/reduce.h                | 41 +++++++--
 examples/llama/plan_llama2_7b_b1_s2048.json | 94 +--------------------
 3 files changed, 36 insertions(+), 102 deletions(-)

diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h
index f47625600..456a32eb7 100644
--- a/ark/include/kernels/common/sync.h
+++ b/ark/include/kernels/common/sync.h
@@ -106,6 +106,9 @@ DEVICE void sync_warps() {
     static_assert(Arch::ThreadsPerWarp == 64, "");
     if constexpr (NumWarps == 1) {
         __builtin_amdgcn_wave_barrier();
+    } else if constexpr (NumWarps == ARK_WARPS_PER_BLOCK) {
+        // asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier " ::);
+        __syncthreads();
     } else {
         static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState),
                       "");
diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h
index 3d0b4e008..2dd79d2c3 100644
--- a/ark/include/kernels/reduce.h
+++ b/ark/include/kernels/reduce.h
@@ -355,8 +355,15 @@ struct WwiseReduce {
                            int smem_per_warp) {
         using ShapeChecker =
             ReduceShapeChecker<InShape, OutShape, UnitOutDims, Axis>;
+        constexpr int InConsecBytes = sizeof(DataType) * InShape::W;
         constexpr int NelemPerThread =
-            DefaultNelemPerThread<OutDims, DataType, UnitOutDims>::value;
+            (InConsecBytes % 16 == 0)
+                ? 16 / sizeof(DataType)
+                : (InConsecBytes % 8 == 0)
+                    ? 8 / sizeof(DataType)
+                    : (InConsecBytes % 4 == 0)
+                        ? 4 / sizeof(DataType)
+                        : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType) : 1;
 
         constexpr int NonReduceDimLength =
             UnitOutDims::N * UnitOutDims::C * UnitOutDims::H;
@@ -397,22 +404,38 @@ struct WwiseReduce {
                                                         &in[idx_in]);
         }
 
-        DataType finalSum;
-        ReduceType::template identity<1>(&finalSum);
+        static_assert(math::is_pow2<NelemPerThread>::value,
+                      "NelemPerThread must be power of 2");
+        if constexpr (NelemPerThread > 8) {
 #pragma unroll
-        for (int i = 0; i < NelemPerThread; ++i) {
-            ReduceType::template reduce<1>(&finalSum, &finalSum, &reduced[i]);
+            for (int i = 8; i < NelemPerThread; i += 8) {
+                ReduceType::template reduce<8>(&reduced[0], &reduced[0], &reduced[i]);
+            }
+            ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]);
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 8) {
+            ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]);
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 4) {
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 2) {
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
         }
 
-        UnitOp::sync_threads();
+        if constexpr (InShape::W % ThreadsPerRow != 0) {
+            UnitOp::sync_threads();
+        }
 
         // final reduction on shared memory using warp shuffle.
-        finalSum = warpsReduce<ReduceType, UnitOp, ThreadsPerRow>(
-            finalSum, tid, smem_per_warp);
+        reduced[0] = warpsReduce<ReduceType, UnitOp, UnitOp::NumThreads>(
+            reduced[0], tid, smem_per_warp);
 
         // write the result to output.
         if (tid % ThreadsPerRow == 0) {
-            ReduceType::template postReduce<1>(&out[idx_out], &finalSum,
+            ReduceType::template postReduce<1>(&out[idx_out], &reduced[0],
                                                InShape::W);
         }
 
diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json
index d5c9fe552..b0bc757dc 100644
--- a/examples/llama/plan_llama2_7b_b1_s2048.json
+++ b/examples/llama/plan_llama2_7b_b1_s2048.json
@@ -230,29 +230,6 @@
             "Tile": [256,1,128],
             "NumTasks": 256
           }
-        },
-        {
-          "Type": "Transpose",
-          "Name": "transpose",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,128],
-            "NumTasks": 256
-          }
         }
       ]
     },
@@ -307,29 +284,6 @@
             "Tile": [256,128],
             "NumTasks": 256
           }
-        },
-        {
-          "Type": "Transpose",
-          "Name": "transpose_2",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":41,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,128],
-            "NumTasks": 256
-          }
         }
       ]
     },
@@ -362,29 +316,6 @@
             "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
-        },
-        {
-          "Type": "Transpose",
-          "Name": "transpose_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":39,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,128],
-            "NumTasks": 256
-          }
         }
       ]
     },
@@ -592,29 +523,6 @@
             "TileShapeMNK": [256,128,32],
             "NumTasks": 256
           }
-        },
-        {
-          "Type": "Transpose",
-          "Name": "transpose_3",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":56,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":57,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,1,128],
-            "NumTasks": 256
-          }
         }
       ]
     },
@@ -1184,7 +1092,7 @@
           "WarpRange": [0,8],
           "SramRange": [0,0],
           "TaskGroups": [
-            {"TaskId":17,"TaskRange":[0,65536],"Granularity":8}
+            {"TaskId":17,"TaskRange":[0,65536],"Granularity":1}
           ]
         }
       ]

From 68e787ae377c282c9d117e6650eb112a34c54a9c Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 9 Jul 2024 20:51:44 +0000
Subject: [PATCH 28/79] fix bf16 matmul

---
 ark/ops/ops_matmul.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp
index b4553a4ed..a24b95d72 100644
--- a/ark/ops/ops_matmul.cpp
+++ b/ark/ops/ops_matmul.cpp
@@ -223,7 +223,7 @@ static const Json get_default_config(const ArchRef arch,
                 {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) {
         return {{"NumWarps", 4},
-                {"SramBytes", 24672},
+                {"SramBytes", 24624},
                 {"TileShapeMNK", {tm, tn, 32}}};
     }
     ERR(InternalError, "Unexpected error");

From b18bdb2e66d30c34b21657e15bb6cf491f108544 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 10 Jul 2024 23:44:07 +0000
Subject: [PATCH 29/79] Enhance executor interfaces

---
 ark/api/executor.cpp               | 295 +++++++++++++++++++----------
 ark/gpu/gpu_event.cpp              |  11 +-
 ark/gpu/gpu_event.h                |   4 +-
 ark/gpu/gpu_kernel.cpp             |   2 +-
 ark/gpu/gpu_kernel.h               |   2 +-
 ark/gpu/gpu_manager.cpp            |  18 +-
 ark/gpu/gpu_manager.h              |   4 +-
 ark/include/ark/executor.hpp       |  46 +++--
 ark/model/model_json.cpp           |  11 +-
 ark/model/model_json.hpp           |   2 +-
 ark/model/model_op.cpp             |   5 +-
 ark/ops/ops_all_reduce_test.cpp    |   7 +-
 ark/ops/ops_communication_test.cpp |   8 +-
 ark/ops/ops_embedding_test.cpp     |   6 +-
 ark/ops/ops_test_common.cpp        |  20 +-
 ark/ops/ops_test_common.hpp        |  15 +-
 cmake/Utils.cmake                  |   2 +-
 python/ark/runtime.py              |   4 +-
 python/ark/tensor.py               |  10 +-
 python/executor_py.cpp             |  59 +++++-
 20 files changed, 344 insertions(+), 187 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 14625161f..2f50a4280 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -140,10 +140,17 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 
 class Executor::Impl {
    public:
-    Impl(int rank, int world_size, int gpu_id, const std::string &name,
-         const std::string &plan);
+    Impl(int device_id, Stream stream, const std::string &name);
     ~Impl() = default;
 
+    void init(const PlanJson& plan);
+
+    int device_id() const { return device_id_; }
+
+    Stream stream() const { return reinterpret_cast<Stream>(stream_raw_); }
+
+    std::string plan() const { return plan_json_.dump_pretty(); }
+
     void compile();
     void launch(int64_t max_spin_count);
     void run(int iter);
@@ -151,9 +158,12 @@ class Executor::Impl {
     float stop(int64_t max_spin_count);
     void barrier();
 
-    void tensor_read(const Tensor tensor, void *data, size_t bytes) const;
-    void tensor_write(const Tensor tensor, const void *data,
-                      size_t bytes) const;
+    uintptr_t tensor_address(const Tensor tensor) const;
+
+    void tensor_read(const Tensor tensor, void *data, size_t bytes,
+                     Stream stream, bool is_d2d) const;
+    void tensor_write(const Tensor tensor, const void *data, size_t bytes,
+                      Stream stream, bool is_d2d) const;
 
    private:
     void init_communicator();
@@ -162,14 +172,18 @@ class Executor::Impl {
     void init_channels(const std::set<int> &remote_ranks);
 
    protected:
-    const int rank_;
-    const int world_size_;
-    int gpu_id_;
+    int device_id_;
+    std::string name_;
+    gpuStream stream_raw_;
+
+    int rank_;
+    int world_size_;
 
     bool is_launched_ = false;
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
 
+    PlanJson plan_json_;
     std::map<size_t, size_t> buffer_id_to_offset_;
     size_t total_bytes_;
     std::shared_ptr<CodeGenerator> codegen_;
@@ -177,8 +191,7 @@ class Executor::Impl {
     std::shared_ptr<GpuEvent> timer_end_;
     std::shared_ptr<GpuMemory> buffer_;
     std::shared_ptr<GpuHostMemory> flag_;
-    std::shared_ptr<GpuStream> main_stream_;
-    std::shared_ptr<GpuStream> copy_stream_;
+    std::shared_ptr<GpuStream> stream_;
     std::shared_ptr<GpuKernel> kernel_;
 
     // For communication
@@ -190,30 +203,35 @@ class Executor::Impl {
         rank_to_sm_channels_;
 };
 
-Executor::Impl::Impl(int rank, int world_size, int gpu_id,
-                     const std::string &name, const std::string &plan)
-    : rank_(rank), world_size_(world_size), gpu_id_(gpu_id) {
-    if (rank < 0 || rank >= world_size) {
-        ERR(InvalidUsageError, "Invalid rank ", rank, " with world size ",
-            world_size);
+Executor::Impl::Impl(int device_id, Stream stream, const std::string &name)
+    : device_id_(device_id), name_(name) {
+    if (device_id < 0) {
+        ERR(InvalidUsageError, "Invalid device ID ", device_id);
     }
-    if (gpu_id < 0) {
-        ERR(InvalidUsageError, "Invalid GPU ID ", gpu_id);
+    if (stream) {
+        stream_raw_ = reinterpret_cast<gpuStream>(stream);
+    } else {
+        stream_ = GpuManager::get_instance(device_id_)->create_stream();
+        stream_raw_ = stream_->get();
+    }
+}
+
+void Executor::Impl::init(const PlanJson &plan_json) {
+    plan_json_ = plan_json;
+    rank_ = plan_json_["Rank"].get<int>();
+    world_size_ = plan_json_["WorldSize"].get<int>();
+
+    if (rank_ < 0 || rank_ >= world_size_) {
+        ERR(InvalidUsageError, "Invalid rank ", rank_, " with world size ",
+            world_size_);
     }
     if (world_size_ > 1) {
         init_communicator();
     }
 
-    Json plan_json;
-    auto &plan_path = get_env().enforce_plan_path;
-    if (!plan_path.empty()) {
-        LOG(INFO, "Enforce executor plan path: ", plan_path);
-        plan_json = Json::parse(read_file(plan_path));
-    } else {
-        plan_json = Json::parse(plan);
-    }
+    auto gpu_manager = GpuManager::get_instance(device_id_);
 
-    buffer_id_to_offset_ = init_buffers(plan_json);
+    buffer_id_to_offset_ = init_buffers(plan_json_);
 
     std::string buffer_id_to_offset_str;
     for (const auto &kv : buffer_id_to_offset_) {
@@ -221,17 +239,14 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id,
             std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", ";
     }
 
-    codegen_ =
-        std::make_shared<CodeGenerator>(plan_json, buffer_id_to_offset_, name);
+    codegen_ = std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_,
+                                               name_);
 
-    auto gpu_manager = GpuManager::get_instance(gpu_id_);
     timer_begin_ = gpu_manager->create_event();
     timer_end_ = gpu_manager->create_event();
     buffer_ = gpu_manager->malloc(total_bytes_, 65536);
     flag_ = gpu_manager->malloc_host(
         sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined);
-    main_stream_ = gpu_manager->create_stream();
-    copy_stream_ = gpu_manager->create_stream();
 
     int threads_per_block = static_cast<int>(
         codegen_->num_warps_per_proc() * gpu_manager->info().threads_per_warp);
@@ -241,13 +256,13 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id,
         static_cast<size_t>(gpu_manager->info().smem_block_total);
 
     if (world_size_ > 1) {
-        auto remote_ranks = init_remote_ranks(plan_json);
+        auto remote_ranks = init_remote_ranks(plan_json_);
         init_channels(remote_ranks);
     }
 
     kernel_ = std::shared_ptr<GpuKernel>(new GpuKernel(
-        gpu_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1},
-        std::max(smem_block_total, size_t(4)), name,
+        device_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1},
+        std::max(smem_block_total, size_t(4)), name_,
         {std::pair<void *, size_t>{buffer_->ref(), sizeof(buffer_->ref())},
          std::pair<void *, size_t>{flag, sizeof(flag)}}));
 }
@@ -509,7 +524,7 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
     mscclpp::TransportFlags all_transports =
         mscclpp::Transport::CudaIpc | mscclpp::Transport::Ethernet;
     if (!get_env().disable_ib) {
-        all_transports |= IBs[gpu_id_];
+        all_transports |= IBs[device_id_];
     }
     mscclpp::RegisteredMemory regmem =
         comm_->registerMemory(buffer_->ref(), buffer_->bytes(), all_transports);
@@ -530,12 +545,12 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
         if (remote_node == this_node) {
             add_connection(remote_rank, mscclpp::Transport::CudaIpc);
             if (!get_env().disable_ib) {
-                add_connection(remote_rank, IBs[gpu_id_]);
+                add_connection(remote_rank, IBs[device_id_]);
             }
         } else {
             add_connection(remote_rank, get_env().disable_ib
                                             ? mscclpp::Transport::Ethernet
-                                            : IBs[gpu_id_]);
+                                            : IBs[device_id_]);
         }
         comm_->sendMemoryOnSetup(regmem, remote_rank, 0);
         rank_to_remote_regmem_future[remote_rank] =
@@ -623,22 +638,22 @@ void Executor::Impl::launch(int64_t max_spin_count) {
                 sm_handles[i] = it2->second[0]->deviceHandle();
             }
         }
-        GLOG(gpuSetDevice(gpu_id_));
+        GLOG(gpuSetDevice(device_id_));
         GLOG(gpuMemcpyAsync(
             proxy_chan_addr, proxy_handles.data(),
             proxy_handles.size() *
                 sizeof(mscclpp::SimpleProxyChannel::DeviceHandle),
-            gpuMemcpyHostToDevice, copy_stream_->get()));
+            gpuMemcpyHostToDevice, stream_raw_));
         GLOG(gpuMemcpyAsync(
             proxy_secondary_chan_addr, proxy_secondary_handles.data(),
             proxy_secondary_handles.size() *
                 sizeof(mscclpp::SimpleProxyChannel::DeviceHandle),
-            gpuMemcpyHostToDevice, copy_stream_->get()));
+            gpuMemcpyHostToDevice, stream_raw_));
         GLOG(gpuMemcpyAsync(
             sm_chan_addr, sm_handles.data(),
             sm_handles.size() * sizeof(mscclpp::SmChannel::DeviceHandle),
-            gpuMemcpyHostToDevice, copy_stream_->get()));
-        copy_stream_->sync();
+            gpuMemcpyHostToDevice, stream_raw_));
+        GLOG(gpuStreamSynchronize(stream_raw_));
     }
 
     elapsed_msec_ = -1;
@@ -648,7 +663,7 @@ void Executor::Impl::launch(int64_t max_spin_count) {
         LOG(WARN, "Ignore launching twice.");
         return;
     }
-    timer_begin_->record(main_stream_);
+    timer_begin_->record(stream_raw_);
 
     if (world_size_ > 1) {
         proxy_service_->startProxy();
@@ -656,8 +671,8 @@ void Executor::Impl::launch(int64_t max_spin_count) {
 
     // Initialize loop flags.
     atomicStoreRelaxed(flag_->ref<int>(), 0);
-    kernel_->launch(main_stream_);
-    timer_end_->record(main_stream_);
+    kernel_->launch(stream_raw_);
+    timer_end_->record(stream_raw_);
     is_recording_ = true;
     is_launched_ = true;
 }
@@ -677,7 +692,7 @@ void Executor::Impl::wait(int64_t max_spin_count) {
             continue;
         }
         // Check if the kernel encountered an error.
-        gpuError res = main_stream_->query();
+        gpuError res = gpuStreamQuery(stream_raw_);
         if (res == gpuSuccess) {
             if (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
                 LOG(WARN, "Stream is finished but the loop flag is still set.");
@@ -699,7 +714,7 @@ void Executor::Impl::wait(int64_t max_spin_count) {
 float Executor::Impl::stop(int64_t max_spin_count) {
     this->wait(max_spin_count);
     atomicStoreRelaxed(flag_->ref<int>(), -1);
-    main_stream_->sync();
+    GLOG(gpuStreamSynchronize(stream_raw_));
     if (is_recording_) {
         elapsed_msec_ = timer_end_->elapsed_msec(*timer_begin_);
         is_recording_ = false;
@@ -717,71 +732,140 @@ void Executor::Impl::barrier() {
     }
 }
 
-void Executor::Impl::tensor_read(const Tensor tensor, void *data,
-                                 size_t bytes) const {
-    GLOG(gpuSetDevice(gpu_id_));
+uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const {
+    size_t buffer_id = tensor.ref()->buffer()->id();
+    if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
+        ERR(NotFoundError, "Invalid buffer ID: ", buffer_id);
+    }
+    size_t offset = buffer_id_to_offset_.at(buffer_id);
+    return reinterpret_cast<uintptr_t>(buffer_->ref(offset));
+}
+
+void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes,
+                                 Stream stream, bool is_d2d) const {
+    GLOG(gpuSetDevice(device_id_));
+    std::shared_ptr<GpuStream> copy_stream;
+    gpuStream copy_stream_raw;
+    if (stream) {
+        copy_stream_raw = reinterpret_cast<gpuStream>(stream);
+        if ((stream == stream_raw_) && is_launched_) {
+            LOG(WARN,
+                "Reading from a tensor in the same stream of the kernel "
+                "may cause a deadlock.");
+        }
+    } else {
+        copy_stream = GpuManager::get_instance(device_id_)->create_stream();
+        copy_stream_raw = copy_stream->get();
+    }
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
-    if (bytes < tensor_data_bytes) {
-        ERR(InvalidUsageError, "Data buffer (", bytes,
-            ") is smaller than the tensor data (", tensor_data_bytes, ").");
+    if (bytes != tensor_data_bytes) {
+        ERR(InvalidUsageError, "Destination bytes (", bytes,
+            ") mismatches the tensor data bytes (", tensor_data_bytes, ").");
     }
-    size_t tensor_bytes =
-        tensor.strides().nelems() * tensor.data_type().bytes();
-    void *src =
-        buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id()));
+    auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost;
+    void *src = reinterpret_cast<void *>(tensor_address(tensor));
     if (tensor.strides() == tensor.shape()) {
-        GLOG(gpuMemcpyAsync(data, src, bytes, gpuMemcpyDeviceToHost,
-                            copy_stream_->get()));
-        copy_stream_->sync();
+        GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_raw));
     } else {
+        size_t tensor_bytes =
+            tensor.strides().nelems() * tensor.data_type().bytes();
         std::vector<int8_t> tensor_host(tensor_bytes);
         GLOG(gpuMemcpyAsync(tensor_host.data(), src, tensor_bytes,
-                            gpuMemcpyDeviceToHost, copy_stream_->get()));
-        copy_stream_->sync();
-        tensor_to_data(tensor_host.data(), static_cast<int8_t *>(data),
-                       tensor.shape(), tensor.strides(), tensor.offsets(),
+                            gpuMemcpyDeviceToHost, copy_stream_raw));
+        GLOG(gpuStreamSynchronize(copy_stream_raw));
+        if (!is_d2d) {
+            tensor_to_data(tensor_host.data(), static_cast<int8_t *>(data),
+                           tensor.shape(), tensor.strides(), tensor.offsets(),
+                           tensor.data_type().bytes());
+            return;
+        }
+        // TODO: convert data layout on the device directly
+        std::vector<int8_t> data_host(bytes);
+        tensor_to_data(tensor_host.data(), data_host.data(), tensor.shape(),
+                       tensor.strides(), tensor.offsets(),
                        tensor.data_type().bytes());
+        GLOG(gpuMemcpyAsync(data, data_host.data(), bytes,
+                            gpuMemcpyHostToDevice, copy_stream_raw));
     }
+    GLOG(gpuStreamSynchronize(copy_stream_raw));
 }
 
 void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
-                                  size_t bytes) const {
-    GLOG(gpuSetDevice(gpu_id_));
+                                  size_t bytes, Stream stream,
+                                  bool is_d2d) const {
+    GLOG(gpuSetDevice(device_id_));
+    std::shared_ptr<GpuStream> copy_stream;
+    gpuStream copy_stream_raw;
+    if (stream) {
+        copy_stream_raw = reinterpret_cast<gpuStream>(stream);
+        if ((stream == stream_raw_) && is_launched_) {
+            LOG(WARN,
+                "Writing to a tensor in the same stream of the kernel "
+                "may cause a deadlock.");
+        }
+    } else {
+        copy_stream = GpuManager::get_instance(device_id_)->create_stream();
+        copy_stream_raw = copy_stream->get();
+    }
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
-    if (bytes < tensor_data_bytes) {
-        ERR(InvalidUsageError, "Data buffer (", bytes,
-            ") is smaller than the tensor data (", tensor_data_bytes, ").");
+    if (bytes != tensor_data_bytes) {
+        ERR(InvalidUsageError, "Source bytes (", bytes,
+            ") mismatches the tensor data bytes (", tensor_data_bytes, ").");
     }
     size_t tensor_bytes =
         tensor.strides().nelems() * tensor.data_type().bytes();
-    void *dst =
-        buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id()));
+    auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice;
+    void *dst = reinterpret_cast<void *>(tensor_address(tensor));
     if (tensor.strides() == tensor.shape()) {
-        GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, gpuMemcpyHostToDevice,
-                            copy_stream_->get()));
+        GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_raw));
     } else {
         std::vector<int8_t> tensor_host(tensor_bytes);
-        GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes,
-                            gpuMemcpyDeviceToHost, copy_stream_->get()));
-        copy_stream_->sync();
-        data_to_tensor(tensor_host.data(), static_cast<const int8_t *>(data),
-                       tensor.shape(), tensor.strides(), tensor.offsets(),
-                       tensor.data_type().bytes());
+        if (!is_d2d) {
+            GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes,
+                                gpuMemcpyDeviceToHost, copy_stream_raw));
+            GLOG(gpuStreamSynchronize(copy_stream_raw));
+            data_to_tensor(tensor_host.data(),
+                           static_cast<const int8_t *>(data), tensor.shape(),
+                           tensor.strides(), tensor.offsets(),
+                           tensor.data_type().bytes());
+        } else {
+            // TODO: convert data layout on the device directly
+            std::vector<int8_t> tmp(bytes);
+            GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, gpuMemcpyDeviceToHost,
+                                copy_stream_raw));
+            GLOG(gpuStreamSynchronize(copy_stream_raw));
+            data_to_tensor(tensor_host.data(), tmp.data(), tensor.shape(),
+                           tensor.strides(), tensor.offsets(),
+                           tensor.data_type().bytes());
+        }
         GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes,
-                            gpuMemcpyHostToDevice, copy_stream_->get()));
+                            gpuMemcpyHostToDevice, copy_stream_raw));
     }
-    copy_stream_->sync();
+    GLOG(gpuStreamSynchronize(copy_stream_raw));
 }
 
-Executor::Executor(int rank, int world_size, int gpu_id,
-                   const std::string &name, const std::string &plan)
-    : impl_(std::make_unique<Executor::Impl>(rank, world_size, gpu_id, name,
-                                             plan)) {}
+Executor::Executor(int device_id, Stream stream, const std::string &name,
+                   const std::string &plan)
+    : impl_(std::make_unique<Executor::Impl>(device_id, stream, name)) {
+    auto &plan_path = get_env().enforce_plan_path;
+    if (!plan_path.empty()) {
+        LOG(INFO, "Enforce executor plan path: ", plan_path);
+        impl_->init(Json::parse(read_file(plan_path)));
+    } else if (!plan.empty()) {
+        impl_->init(Json::parse(plan));
+    }
+}
 
 Executor::~Executor() = default;
 
+int Executor::device_id() const { return impl_->device_id(); }
+
+Stream Executor::stream() const { return impl_->stream(); }
+
+std::string Executor::plan() const { return impl_->plan(); }
+
 void Executor::compile() { impl_->compile(); }
 
 void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); }
@@ -800,25 +884,32 @@ void Executor::destroy() { impl_.reset(nullptr); }
 
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
-void Executor::tensor_read(const Tensor tensor, void *data,
-                           size_t bytes) const {
-    impl_->tensor_read(tensor, data, bytes);
+uintptr_t Executor::tensor_address(const Tensor tensor) const {
+    return impl_->tensor_address(tensor);
 }
 
-void Executor::tensor_write(const Tensor tensor, const void *data,
-                            size_t bytes) const {
-    impl_->tensor_write(tensor, data, bytes);
+void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes,
+                           Stream stream, bool is_d2d) const {
+    impl_->tensor_read(tensor, data, bytes, stream, is_d2d);
 }
 
-DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id,
-                                 const std::string &name)
-    : Executor(
-          model.rank(), model.world_size(),
-          (gpu_id < 0) ? (model.rank() % get_env().num_ranks_per_host) : gpu_id,
-          name,
-          DefaultPlanner(model, (gpu_id < 0) ? (model.rank() %
-                                                get_env().num_ranks_per_host)
-                                             : gpu_id)
-              .plan()) {}
+void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes,
+                            Stream stream, bool is_d2d) const {
+    impl_->tensor_write(tensor, data, bytes, stream, is_d2d);
+}
+
+DefaultExecutor::DefaultExecutor(
+    const Model &model, int device_id, Stream stream,
+    const std::vector<DefaultPlanner::ConfigRule> &config_rules,
+    const std::string &name)
+    : Executor((device_id < 0) ? (model.rank() % get_env().num_ranks_per_host)
+                               : device_id,
+               stream, name, "") {
+    DefaultPlanner planner(model, impl_->device_id());
+    for (const auto &rule : config_rules) {
+        planner.install_config_rule(rule);
+    }
+    impl_->init(Json::parse(planner.plan()));
+}
 
 }  // namespace ark
diff --git a/ark/gpu/gpu_event.cpp b/ark/gpu/gpu_event.cpp
index 93ec3fd52..cbc45d9a6 100644
--- a/ark/gpu/gpu_event.cpp
+++ b/ark/gpu/gpu_event.cpp
@@ -3,7 +3,6 @@
 
 #include "gpu/gpu_event.h"
 
-#include "gpu/gpu.h"
 #include "gpu/gpu_logging.h"
 #include "gpu/gpu_manager.h"
 
@@ -15,7 +14,7 @@ class GpuEvent::Impl {
     Impl(const Impl&) = delete;
     Impl& operator=(const Impl&) = delete;
 
-    void record(std::shared_ptr<GpuStream> stream);
+    void record(gpuStream stream);
     float elapsed_msec(const GpuEvent& other) const;
 
    private:
@@ -32,8 +31,8 @@ GpuEvent::Impl::Impl(bool disable_timing) {
 
 GpuEvent::Impl::~Impl() { GLOG(gpuEventDestroy(event_)); }
 
-void GpuEvent::Impl::record(std::shared_ptr<GpuStream> stream) {
-    GLOG(gpuEventRecord(event_, stream->get()));
+void GpuEvent::Impl::record(gpuStream stream) {
+    GLOG(gpuEventRecord(event_, stream));
 }
 
 float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
@@ -45,9 +44,7 @@ float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
 GpuEvent::GpuEvent(bool disable_timing)
     : pimpl_(std::make_shared<Impl>(disable_timing)) {}
 
-void GpuEvent::record(std::shared_ptr<GpuStream> stream) {
-    pimpl_->record(stream);
-}
+void GpuEvent::record(gpuStream stream) { pimpl_->record(stream); }
 
 float GpuEvent::elapsed_msec(const GpuEvent& other) const {
     return pimpl_->elapsed_msec(other);
diff --git a/ark/gpu/gpu_event.h b/ark/gpu/gpu_event.h
index 4599ecaa4..081f0203b 100644
--- a/ark/gpu/gpu_event.h
+++ b/ark/gpu/gpu_event.h
@@ -6,6 +6,8 @@
 
 #include <memory>
 
+#include "gpu/gpu.h"
+
 namespace ark {
 
 class GpuStream;
@@ -17,7 +19,7 @@ class GpuEvent {
     GpuEvent(const GpuEvent &) = delete;
     GpuEvent &operator=(const GpuEvent &) = delete;
 
-    void record(std::shared_ptr<GpuStream> stream);
+    void record(gpuStream stream);
     float elapsed_msec(const GpuEvent &other) const;
 
    protected:
diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp
index 44ff43a1d..46f467f51 100644
--- a/ark/gpu/gpu_kernel.cpp
+++ b/ark/gpu/gpu_kernel.cpp
@@ -68,7 +68,7 @@ void GpuKernel::compile() {
                                  dynamic_smem_size_bytes));
 }
 
-void GpuKernel::launch(std::shared_ptr<GpuStream> stream) {
+void GpuKernel::launch(gpuStream stream) {
     if (!this->is_compiled()) {
         ERR(InvalidUsageError, "Kernel is not compiled yet.");
     }
diff --git a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.h
index c3b60aec4..b3be79071 100644
--- a/ark/gpu/gpu_kernel.h
+++ b/ark/gpu/gpu_kernel.h
@@ -27,7 +27,7 @@ class GpuKernel {
               const std::string& kernel_name,
               std::initializer_list<std::pair<void*, size_t>> args = {});
     void compile();
-    void launch(std::shared_ptr<GpuStream> stream);
+    void launch(gpuStream stream);
 
     gpuDeviceptr get_global(const std::string& name,
                             bool ignore_not_found = false) const;
diff --git a/ark/gpu/gpu_manager.cpp b/ark/gpu/gpu_manager.cpp
index 3a6d0a066..fc841fa32 100644
--- a/ark/gpu/gpu_manager.cpp
+++ b/ark/gpu/gpu_manager.cpp
@@ -20,11 +20,10 @@ class GpuManager::Impl {
 
     int gpu_id_;
     GpuManager::Info info_;
-    std::shared_ptr<GpuStream> main_stream_;
 
     void launch(gpuFunction kernel, const std::array<int, 3> &grid_dim,
                 const std::array<int, 3> &block_dim, int smem_bytes,
-                std::shared_ptr<GpuStream> stream, void **params, void **extra);
+                gpuStream stream, void **params, void **extra);
 };
 
 GpuManager::Impl::Impl(int gpu_id) : gpu_id_(gpu_id) {
@@ -76,11 +75,11 @@ GpuManager::Impl::Impl(int gpu_id) : gpu_id_(gpu_id) {
 void GpuManager::Impl::launch(gpuFunction kernel,
                               const std::array<int, 3> &grid_dim,
                               const std::array<int, 3> &block_dim,
-                              int smem_bytes, std::shared_ptr<GpuStream> stream,
-                              void **params, void **extra) {
+                              int smem_bytes, gpuStream stream, void **params,
+                              void **extra) {
     GLOG_DRV(gpuModuleLaunchKernel(
         kernel, grid_dim[0], grid_dim[1], grid_dim[2], block_dim[0],
-        block_dim[1], block_dim[2], smem_bytes, stream->get(), params, extra));
+        block_dim[1], block_dim[2], smem_bytes, stream, params, extra));
 }
 
 std::shared_ptr<GpuManager> GpuManager::get_instance(int gpu_id) {
@@ -102,9 +101,7 @@ std::shared_ptr<GpuManager> GpuManager::get_instance(int gpu_id) {
     }
 }
 
-GpuManager::GpuManager(int gpu_id) : pimpl_(std::make_shared<Impl>(gpu_id)) {
-    this->pimpl_->main_stream_ = std::shared_ptr<GpuStream>(new GpuStream());
-}
+GpuManager::GpuManager(int gpu_id) : pimpl_(std::make_shared<Impl>(gpu_id)) {}
 
 std::shared_ptr<GpuMemory> GpuManager::malloc(size_t bytes, size_t align,
                                               bool expose) {
@@ -126,8 +123,6 @@ std::shared_ptr<GpuStream> GpuManager::create_stream() const {
     return std::shared_ptr<GpuStream>(new GpuStream());
 }
 
-int GpuManager::get_gpu_id() const { return pimpl_->gpu_id_; }
-
 const GpuManager::Info &GpuManager::info() const { return pimpl_->info_; }
 
 void GpuManager::set_current() const { GLOG(gpuSetDevice(pimpl_->gpu_id_)); }
@@ -135,8 +130,7 @@ void GpuManager::set_current() const { GLOG(gpuSetDevice(pimpl_->gpu_id_)); }
 void GpuManager::launch(gpuFunction function,
                         const std::array<int, 3> &grid_dim,
                         const std::array<int, 3> &block_dim, int smem_bytes,
-                        std::shared_ptr<GpuStream> stream, void **params,
-                        void **extra) const {
+                        gpuStream stream, void **params, void **extra) const {
     this->set_current();
     pimpl_->launch(function, grid_dim, block_dim, smem_bytes, stream, params,
                    extra);
diff --git a/ark/gpu/gpu_manager.h b/ark/gpu/gpu_manager.h
index 05014ac47..93a48cf7b 100644
--- a/ark/gpu/gpu_manager.h
+++ b/ark/gpu/gpu_manager.h
@@ -30,11 +30,9 @@ class GpuManager {
     std::shared_ptr<GpuEvent> create_event(bool disable_timing = false) const;
     std::shared_ptr<GpuStream> create_stream() const;
 
-    int get_gpu_id() const;
     void launch(gpuFunction function, const std::array<int, 3> &grid_dim,
                 const std::array<int, 3> &block_dim, int smem_bytes,
-                std::shared_ptr<GpuStream> stream, void **params,
-                void **extra) const;
+                gpuStream stream, void **params, void **extra) const;
 
     struct Info;
     const Info &info() const;
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 4682af7d0..75dc81c17 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -5,6 +5,7 @@
 #define ARK_EXECUTOR_HPP
 
 #include <ark/model_ref.hpp>
+#include <ark/planner.hpp>
 #include <ark/tensor.hpp>
 #include <memory>
 #include <string>
@@ -12,15 +13,27 @@
 
 namespace ark {
 
+using Stream = void *;
+
 /// Convenience class for executing a model.
 class Executor {
    public:
     /// Constructor.
-    Executor(int rank, int world_size, int gpu_id, const std::string &name,
+    Executor(int device_id, Stream stream, const std::string &name,
              const std::string &plan);
 
+    /// Destructor.
     ~Executor();
 
+    /// Return the device ID.
+    int device_id() const;
+
+    /// Return the stream of the executor.
+    Stream stream() const;
+
+    /// Return the plan string.
+    std::string plan() const;
+
     /// Compile the model. This must be called before `launch()`.
     void compile();
 
@@ -39,30 +52,39 @@ class Executor {
     /// again.
     float stop(int64_t max_spin_count = -1);
 
+    /// Barrier for all rank executors.
     void barrier();
 
+    /// Destroy the executor.
     void destroy();
 
+    /// Return whether the executor is destroyed.
     bool destroyed() const;
 
+    /// Return the raw virtual address of the tensor.
+    uintptr_t tensor_address(const Tensor tensor) const;
+
     template <typename T>
-    void tensor_read(const Tensor tensor, std::vector<T> &data) const {
+    void tensor_read(const Tensor tensor, std::vector<T> &data,
+                     Stream stream = nullptr) const {
         tensor_read(tensor, reinterpret_cast<void *>(data.data()),
-                    data.size() * sizeof(T));
+                    data.size() * sizeof(T), stream);
     }
 
     template <typename T>
-    void tensor_write(const Tensor tensor, const std::vector<T> &data) const {
+    void tensor_write(const Tensor tensor, const std::vector<T> &data,
+                      Stream stream = nullptr) const {
         tensor_write(tensor, reinterpret_cast<const void *>(data.data()),
-                     data.size() * sizeof(T));
+                     data.size() * sizeof(T), stream);
     }
 
-    void tensor_read(const Tensor tensor, void *data, size_t bytes) const;
+    void tensor_read(const Tensor tensor, void *data, size_t bytes,
+                     Stream stream = nullptr, bool is_d2d = false) const;
 
-    void tensor_write(const Tensor tensor, const void *data,
-                      size_t bytes) const;
+    void tensor_write(const Tensor tensor, const void *data, size_t bytes,
+                      Stream stream = nullptr, bool is_d2d = false) const;
 
-   private:
+   protected:
     class Impl;
     std::unique_ptr<Impl> impl_;
 };
@@ -71,8 +93,10 @@ class Model;
 
 class DefaultExecutor : public Executor {
    public:
-    DefaultExecutor(const Model &model, int gpu_id = -1,
-                    const std::string &name = "DefaultExecutor");
+    DefaultExecutor(
+        const Model &model, int device_id = -1, Stream stream = nullptr,
+        const std::vector<DefaultPlanner::ConfigRule> &config_rules = {},
+        const std::string &name = "DefaultExecutor");
 };
 
 }  // namespace ark
diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp
index 0057ef0aa..97ce71967 100644
--- a/ark/model/model_json.cpp
+++ b/ark/model/model_json.cpp
@@ -272,7 +272,16 @@ static void verify_format_plan(const Json &json) {
     }
 }
 
-PlanJson::PlanJson(const Json &json) : Json(json) { verify_format_plan(*this); }
+PlanJson::PlanJson(const Json &json)
+    : Json((json != nullptr) ? json
+                             : Json{{"Rank", 0},
+                                    {"WorldSize", 1},
+                                    {"NumProcessors", 1},
+                                    {"NumWarpsPerProcessor", 1},
+                                    {"TaskInfos", Json::array()},
+                                    {"ProcessorGroups", Json::array()}}) {
+    verify_format_plan(*this);
+}
 
 static std::stringstream &dump_pretty_plan(const Json &json,
                                            std::stringstream &ss, int indent,
diff --git a/ark/model/model_json.hpp b/ark/model/model_json.hpp
index cf5fbbce2..e42640a9a 100644
--- a/ark/model/model_json.hpp
+++ b/ark/model/model_json.hpp
@@ -18,7 +18,7 @@ class ModelJson : public Json {
 
 class PlanJson : public Json {
    public:
-    PlanJson(const Json &json);
+    PlanJson(const Json &json = nullptr);
     std::string dump_pretty(int indent = 0, int indent_step = 2) const;
 };
 
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index 6cdba5d02..b5a0645c8 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -202,8 +202,11 @@ std::shared_ptr<ModelOp> ModelOp::deserialize(const Json &serialized) {
     } else if (!serialized.contains("Args")) {
         ERR(InvalidUsageError, "ModelOp deserialization failed: missing Args");
     }
+    // Run `ModelOpT::from_name` before `construct()` to ensure all operators
+    // are registered.
+    auto op_type = ModelOpT::from_name(serialized["Type"]);
     auto ret = model_op_factory()->construct(serialized["Type"]);
-    ret->type_ = ModelOpT::from_name(serialized["Type"]);
+    ret->type_ = op_type;
     ret->name_ = serialized["Name"];
     ret->is_virtual_ = serialized["IsVirtual"];
     for (const auto &t : serialized["ReadTensors"]) {
diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp
index 9e2c6f675..030146680 100644
--- a/ark/ops/ops_all_reduce_test.cpp
+++ b/ark/ops/ops_all_reduce_test.cpp
@@ -91,10 +91,9 @@ void test_all_reduce_internal(ark::DimType nelem) {
 
             std::vector<ark::half_t> ones_vec(ones.shape().nelems(),
                                               ark::half_t(1.0f));
-            auto result =
-                ark::op_test("all_reduce", m, {ones}, {output},
-                             baseline_all_reduce<ark::half_t, NumGpus>,
-                             {ones_vec.data()}, false, gpu_id, NumGpus);
+            auto result = ark::op_test(
+                "all_reduce", m, {ones}, {output},
+                baseline_all_reduce<ark::half_t, NumGpus>, {ones_vec.data()});
             UNITTEST_LOG(result);
             UNITTEST_EQ(result.max_diff[0], 0.0f);
             return ark::unittest::SUCCESS;
diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp
index 2b63642e6..f01de9789 100644
--- a/ark/ops/ops_communication_test.cpp
+++ b/ark/ops/ops_communication_test.cpp
@@ -229,9 +229,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() {
             ark::Tensor tns2 = model.identity(tns2_data, {tns});
             tns2 = model.recv(tns2_data, remote_gpu_id, tag);
 
-            ark::DefaultPlanner planner(model, gpu_id);
-            planner.install_config_rule(config_rule);
-            ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan());
+            ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule});
             exe.compile();
 
             std::vector<ark::half_t> data(1024);
@@ -275,9 +273,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() {
 
             ark::Tensor sum = model.add(tns2, tns_data);
 
-            ark::DefaultPlanner planner(model, gpu_id);
-            planner.install_config_rule(config_rule);
-            ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan());
+            ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule});
             exe.compile();
 
             std::vector<ark::half_t> data(1024);
diff --git a/ark/ops/ops_embedding_test.cpp b/ark/ops/ops_embedding_test.cpp
index 822973106..8cc95abd2 100644
--- a/ark/ops/ops_embedding_test.cpp
+++ b/ark/ops/ops_embedding_test.cpp
@@ -78,9 +78,9 @@ ark::unittest::State test_embedding() {
     } else if (std::is_same<T, ark::bfloat16_t>::value) {
         type_str = "bf16";
     }
-    auto result = ark::op_test("embedding_" + type_str, m, {ti, tw}, {to},
-                               baseline_embedding<T>,
-                               {ti_data.data(), tw_data.data()}, true);
+    auto result =
+        ark::op_test("embedding_" + type_str, m, {ti, tw}, {to},
+                     baseline_embedding<T>, {ti_data.data(), tw_data.data()});
     UNITTEST_LOG(result);
     UNITTEST_EQ(result.max_diff[0], 0.0f);
     return ark::unittest::SUCCESS;
diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp
index 50317fba7..60ffc9dc2 100644
--- a/ark/ops/ops_test_common.cpp
+++ b/ark/ops/ops_test_common.cpp
@@ -31,13 +31,13 @@ std::ostream &operator<<(std::ostream &os, const OpsTestResult &result) {
     return os;
 }
 
-OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
-                      const std::vector<Tensor> &inputs,
-                      const std::vector<Tensor> &outputs,
-                      OpsTestBaseline baseline,
-                      const std::vector<void *> &inputs_data,
-                      bool print_on_error, int rank, int world_size) {
-    DefaultExecutor exe(model);
+OpsTestResult op_test(
+    const std::string &test_name_prefix, const Model &model,
+    const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
+    OpsTestBaseline baseline, const std::vector<void *> &inputs_data,
+    const std::vector<DefaultPlanner::ConfigRule> &config_rules,
+    bool print_on_error) {
+    DefaultExecutor exe(model, -1, nullptr, config_rules);
     exe.compile();
 
     std::vector<std::shared_ptr<std::vector<char>>> inputs_data_storages;
@@ -133,7 +133,8 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
     for (auto t : gt) {
         gt_ptrs.push_back(t->data());
     }
-    baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, rank);
+    baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes,
+             model.rank());
 
     std::stringstream test_name;
     test_name << test_name_prefix;
@@ -147,6 +148,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
 
     OpsTestResult result;
     result.test_name = test_name.str();
+    result.plan = exe.plan();
 
     // Compare results with the ground truth.
     for (size_t i = 0; i < outputs.size(); i++) {
@@ -187,7 +189,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
     GLOG(gpuDeviceSynchronize());
 
     // Throughput test.
-    if (world_size > 1) {
+    if (model.world_size() > 1) {
         // For multi-GPU, we need to make sure that all GPUs run the same
         // number of iterations. Rather than doing allgather, we just
         // use a magic number here.
diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp
index 01e97dbb1..c5d640f3b 100644
--- a/ark/ops/ops_test_common.hpp
+++ b/ark/ops/ops_test_common.hpp
@@ -10,6 +10,7 @@
 
 #include "ark/model.hpp"
 #include "ark/model_ref.hpp"
+#include "ark/planner.hpp"
 #include "ark/random.hpp"
 #include "bfloat16.h"
 #include "half.h"
@@ -133,6 +134,7 @@ TensorCompareResult tensor_compare(T *ground_truth, T *res, Dims shape,
 
 struct OpsTestResult {
     std::string test_name;
+    std::string plan;
     int iter;
     float msec_per_iter;
     std::vector<float> mse;
@@ -165,13 +167,12 @@ using OpsTestBaseline = std::function<void(
 
 class Model;
 
-OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
-                      const std::vector<Tensor> &inputs,
-                      const std::vector<Tensor> &outputs,
-                      OpsTestBaseline baseline,
-                      const std::vector<void *> &inputs_data = {},
-                      bool print_on_error = false, int rank = 0,
-                      int world_size = 1);
+OpsTestResult op_test(
+    const std::string &test_name_prefix, const Model &model,
+    const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
+    OpsTestBaseline baseline, const std::vector<void *> &inputs_data = {},
+    const std::vector<DefaultPlanner::ConfigRule> &config_rules = {},
+    bool print_on_error = false);
 
 OpsTestGpuMem to_gpu(void *host_ptr, size_t size);
 
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 9bb83fb42..855cb824b 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -14,7 +14,7 @@ if(GIT_CLANG_FORMAT)
         COMMAND ${GIT_CLANG_FORMAT} --style=file --diff || true
     )
     add_custom_target(cpplint-autofix
-        COMMAND ${GIT_CLANG_FORMAT} --style=file || true
+        COMMAND ${GIT_CLANG_FORMAT} --style=file --force --extensions cc,cpp,h,hpp,cu,in,hip || true
     )
 else()
     message(STATUS "git-clang-format not found.")
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 7480ce7da..33db1fb5c 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -106,6 +106,7 @@ def launch(
         gpu_id: int = 0,
         plan: str = "",
         plan_path: str = "",
+        stream: int = 0,
     ):
         """
         Create an executor and schedule the ARK model. The scheduler will generate
@@ -130,9 +131,8 @@ def launch(
                     _RuntimeState.executor.destroy()
 
             _RuntimeState.executor = Executor(
-                rank,
-                world_size,
                 gpu_id,
+                stream,
                 "ArkRuntime",
                 plan,
             )
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 316d18566..d69f2aabc 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -48,7 +48,9 @@ def dtype(self) -> DataType:
         """
         return DataType.from_ctype(self._tensor.data_type())
 
-    def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray:
+    def to_numpy(
+        self, ndarray: np.ndarray = None, stream: int = 0
+    ) -> np.ndarray:
         """
         Copy a tensor from device to host. If `ndarray` is None,
         a new numpy array will be created. If the tensor is not allocated,
@@ -68,10 +70,10 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray:
             raise ValueError("ndarray dtype does not match the tensor")
         elif ndarray.nbytes != self.nelems() * self.dtype().element_size():
             raise ValueError("ndarray size does not match the tensor")
-        rt.executor.tensor_read(self._tensor, ndarray)
+        rt.executor.tensor_read(self._tensor, ndarray, stream)
         return ndarray
 
-    def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
+    def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor":
         """
         Copies the tensor from a host numpy array to the device.
         """
@@ -86,7 +88,7 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
             ndarray = np.ascontiguousarray(ndarray)
         if ndarray.nbytes != self.nelems() * self.dtype().element_size():
             raise ValueError("ndarray size does not match the tensor")
-        rt.executor.tensor_write(self._tensor, ndarray)
+        rt.executor.tensor_write(self._tensor, ndarray, stream)
         return self
 
 
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index dc2840329..979cb2952 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -11,25 +11,48 @@
 namespace py = pybind11;
 
 static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
-                         py::buffer host_buffer) {
+                         py::buffer host_buffer, uintptr_t stream) {
     py::buffer_info info = host_buffer.request();
     exe->tensor_write(tensor, reinterpret_cast<void *>(info.ptr),
-                      info.size * info.itemsize);
+                      info.size * info.itemsize,
+                      reinterpret_cast<ark::Stream>(stream), false);
+}
+
+static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
+                         size_t address, size_t bytes, uintptr_t stream,
+                         bool is_d2d) {
+    exe->tensor_write(tensor, reinterpret_cast<void *>(address), bytes,
+                      reinterpret_cast<ark::Stream>(stream), is_d2d);
 }
 
 static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
-                        py::buffer host_buffer) {
+                        py::buffer host_buffer, uintptr_t stream) {
     py::buffer_info info = host_buffer.request();
     exe->tensor_read(tensor, reinterpret_cast<void *>(info.ptr),
-                     info.size * info.itemsize);
+                     info.size * info.itemsize,
+                     reinterpret_cast<ark::Stream>(stream), false);
+}
+
+static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
+                        size_t address, size_t bytes, uintptr_t stream,
+                        bool is_d2d) {
+    exe->tensor_read(tensor, reinterpret_cast<void *>(address), bytes,
+                     reinterpret_cast<ark::Stream>(stream), is_d2d);
 }
 
 void register_executor(py::module &m) {
     py::class_<ark::Executor>(m, "_Executor")
-        .def(
-            py::init<int, int, int, const std::string &, const std::string &>(),
-            py::arg("rank"), py::arg("world_size"), py::arg("gpu_id"),
-            py::arg("name"), py::arg("plan"))
+        .def(py::init([](int device_id, uintptr_t stream,
+                         const std::string &name, const std::string &plan) {
+            return new ark::Executor(
+                device_id, reinterpret_cast<ark::Stream>(stream), name, plan);
+        }))
+        .def("device_id", &ark::Executor::device_id)
+        .def("stream",
+             [](ark::Executor *self) {
+                 return reinterpret_cast<uintptr_t>(self->stream());
+             })
+        .def("plan", &ark::Executor::plan)
         .def("compile", &ark::Executor::compile)
         .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1)
         .def("run", &ark::Executor::run, py::arg("iter"))
@@ -38,6 +61,22 @@ void register_executor(py::module &m) {
         .def("barrier", &ark::Executor::barrier)
         .def("destroy", &ark::Executor::destroy)
         .def("destroyed", &ark::Executor::destroyed)
-        .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data"))
-        .def("tensor_write", &tensor_write, py::arg("tensor"), py::arg("data"));
+        .def("tensor_read",
+             py::overload_cast<ark::Executor *, const ark::Tensor &, py::buffer,
+                               uintptr_t>(&tensor_read),
+             py::arg("tensor"), py::arg("data"), py::arg("stream"))
+        .def("tensor_read",
+             py::overload_cast<ark::Executor *, const ark::Tensor &, size_t,
+                               size_t, uintptr_t, bool>(&tensor_read),
+             py::arg("tensor"), py::arg("address"), py::arg("bytes"),
+             py::arg("stream"), py::arg("is_d2d"))
+        .def("tensor_write",
+             py::overload_cast<ark::Executor *, const ark::Tensor &, py::buffer,
+                               uintptr_t>(&tensor_write),
+             py::arg("tensor"), py::arg("data"), py::arg("stream"))
+        .def("tensor_write",
+             py::overload_cast<ark::Executor *, const ark::Tensor &, size_t,
+                               size_t, uintptr_t, bool>(&tensor_write),
+             py::arg("tensor"), py::arg("address"), py::arg("bytes"),
+             py::arg("stream"), py::arg("is_d2d"));
 }

From 215469044ae49a4a453f576b2a396a5c96992aec Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 10 Jul 2024 23:53:32 +0000
Subject: [PATCH 30/79] Update lint workflow

---
 .github/workflows/lint.yml | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 758eaf564..a918dcede 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -13,11 +13,8 @@ jobs:
     - name: Check out Git repository
       uses: actions/checkout@v4
 
-    - name: Install ClangFormat
-      run: sudo apt-get install -y clang-format
-
-    - name: Run clang-format
-      run: clang-format -style=file -Werror --dry-run `find ark python examples -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu`
+    - name: Run git-clang-format
+      run: git-clang-format --style=file --diff
 
     - name: Set up Python
       uses: actions/setup-python@v4

From 705f9f86d8bf8b70005a03fd875e8cc080c99af1 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 11 Jul 2024 00:02:45 +0000
Subject: [PATCH 31/79] Optimize operators

---
 ark/include/kernels/common/broadcast.h |  4 +-
 ark/include/kernels/common/sync.h      | 12 ++----
 ark/include/kernels/reduce.h           | 59 ++++++++++++++++++--------
 ark/ops/ops_broadcast.cpp              |  3 +-
 ark/ops/ops_matmul.cpp                 | 32 +++++++++-----
 5 files changed, 69 insertions(+), 41 deletions(-)

diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h
index 97b12e004..858938613 100644
--- a/ark/include/kernels/common/broadcast.h
+++ b/ark/include/kernels/common/broadcast.h
@@ -186,9 +186,9 @@ struct Broadcast2Intrinsic {
         (BroadcastInput0 && BroadcastInput1)
             ? OutNelemPerThread
             : BroadcastInput0
-                  ? math::gcd<OutNelemPerThread, In0NelemPerThread>::value
+                  ? math::gcd<OutNelemPerThread, In1NelemPerThread>::value
                   : BroadcastInput1
-                        ? math::gcd<OutNelemPerThread, In1NelemPerThread>::value
+                        ? math::gcd<OutNelemPerThread, In0NelemPerThread>::value
                         : math::gcd<OutNelemPerThread,
                                     math::gcd<In0NelemPerThread,
                                               In1NelemPerThread>::value>::value;
diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h
index 85f7639c9..cf22e357d 100644
--- a/ark/include/kernels/common/sync.h
+++ b/ark/include/kernels/common/sync.h
@@ -106,25 +106,21 @@ DEVICE void sync_warps() {
     static_assert(Arch::ThreadsPerWarp == 64, "");
     if constexpr (NumWarps == 1) {
         __builtin_amdgcn_wave_barrier();
-    } else if constexpr (NumWarps == 16) {
+    } else if constexpr (NumWarps == ARK_WARPS_PER_BLOCK) {
         __syncthreads();
     } else {
         static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState),
                       "");
-        int lane_id = threadIdx.x & 63;
-        if (lane_id == 0) {
+        if ((threadIdx.x & 63) == 0) {
             constexpr int MaxOldCnt = NumWarps - 1;
-            int warp_id = threadIdx.x >> 6;
-            int group_id = warp_id / NumWarps;
+            int group_id = (threadIdx.x >> 6) / NumWarps;
             sync::WarpGroupState *state =
                 reinterpret_cast<sync::WarpGroupState *>(_ARK_SMEM);
             unsigned int tmp = state->is_inc_flag[group_id] ^ 1;
             if (atomicInc(&state->cnt[group_id], MaxOldCnt) == MaxOldCnt) {
                 state->flag[group_id] = tmp;
             } else {
-                while (atomicAdd(&state->flag[group_id], 0) != tmp)
-                    __builtin_amdgcn_s_sleep(1);
-                __asm__ __volatile__("s_wakeup");
+                while (atomicAdd(&state->flag[group_id], 0) != tmp);
             }
             state->is_inc_flag[group_id] = tmp;
         }
diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h
index 30c8b7831..2dd79d2c3 100644
--- a/ark/include/kernels/reduce.h
+++ b/ark/include/kernels/reduce.h
@@ -53,7 +53,7 @@ DEVICE bf16 warpReduce(bf16 val) {
 template <typename ReduceType, typename UnitOp, int LanesNum, typename DataType>
 DEVICE DataType warpsReduce(DataType val, int tid, int smem_per_warp) {
     val = warpReduce<ReduceType, LanesNum>(val);
-    if (LanesNum > Arch::ThreadsPerWarp) {
+    if constexpr (LanesNum > Arch::ThreadsPerWarp) {
         ReduceSharedStorage<DataType> *shared =
             UnitOp::template shared_memory<ReduceSharedStorage<DataType>>(
                 smem_per_warp);
@@ -351,12 +351,19 @@ struct WwiseReduce {
     /// @param in Input tensor.
     /// @param uop_idx Index of the unit operator.
     template <typename DataType>
-    static DEVICE void runW(DataType *out, DataType *in, int uop_idx,
-                            int smem_per_warp) {
+    static DEVICE void run(DataType *out, DataType *in, int uop_idx,
+                           int smem_per_warp) {
         using ShapeChecker =
             ReduceShapeChecker<InShape, OutShape, UnitOutDims, Axis>;
+        constexpr int InConsecBytes = sizeof(DataType) * InShape::W;
         constexpr int NelemPerThread =
-            DefaultNelemPerThread<OutDims, DataType, UnitOutDims>::value;
+            (InConsecBytes % 16 == 0)
+                ? 16 / sizeof(DataType)
+                : (InConsecBytes % 8 == 0)
+                    ? 8 / sizeof(DataType)
+                    : (InConsecBytes % 4 == 0)
+                        ? 4 / sizeof(DataType)
+                        : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType) : 1;
 
         constexpr int NonReduceDimLength =
             UnitOutDims::N * UnitOutDims::C * UnitOutDims::H;
@@ -397,22 +404,38 @@ struct WwiseReduce {
                                                         &in[idx_in]);
         }
 
-        DataType finalSum;
-        ReduceType::template identity<1>(&finalSum);
+        static_assert(math::is_pow2<NelemPerThread>::value,
+                      "NelemPerThread must be power of 2");
+        if constexpr (NelemPerThread > 8) {
 #pragma unroll
-        for (int i = 0; i < NelemPerThread; ++i) {
-            ReduceType::template reduce<1>(&finalSum, &finalSum, &reduced[i]);
+            for (int i = 8; i < NelemPerThread; i += 8) {
+                ReduceType::template reduce<8>(&reduced[0], &reduced[0], &reduced[i]);
+            }
+            ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]);
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 8) {
+            ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]);
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 4) {
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 2) {
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
         }
 
-        UnitOp::sync_threads();
+        if constexpr (InShape::W % ThreadsPerRow != 0) {
+            UnitOp::sync_threads();
+        }
 
         // final reduction on shared memory using warp shuffle.
-        finalSum = warpsReduce<ReduceType, UnitOp, ThreadsPerRow>(
-            finalSum, tid, smem_per_warp);
+        reduced[0] = warpsReduce<ReduceType, UnitOp, UnitOp::NumThreads>(
+            reduced[0], tid, smem_per_warp);
 
         // write the result to output.
         if (tid % ThreadsPerRow == 0) {
-            ReduceType::template postReduce<1>(&out[idx_out], &finalSum,
+            ReduceType::template postReduce<1>(&out[idx_out], &reduced[0],
                                                InShape::W);
         }
 
@@ -450,8 +473,8 @@ template <typename InDims, typename InShape, typename OutDims,
 DEVICE void reduce_w_sum(DataType *out, DataType *in, int uop_idx,
                          int smem_per_warp) {
     WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeSum, Axis>::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeSum, Axis>::run(out, in, uop_idx,
+                                                     smem_per_warp);
 }
 
 template <typename InDims, typename InShape, typename OutDims,
@@ -460,8 +483,8 @@ template <typename InDims, typename InShape, typename OutDims,
 DEVICE void reduce_w_mean(DataType *out, DataType *in, int uop_idx,
                           int smem_per_warp) {
     WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeMean, Axis>::runW(out, in, uop_idx,
-                                                       smem_per_warp);
+                SmemBytes, ReduceTypeMean, Axis>::run(out, in, uop_idx,
+                                                      smem_per_warp);
 }
 
 template <typename InDims, typename InShape, typename OutDims,
@@ -470,8 +493,8 @@ template <typename InDims, typename InShape, typename OutDims,
 DEVICE void reduce_w_max(DataType *out, DataType *in, int uop_idx,
                          int smem_per_warp) {
     WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeMax, Axis>::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeMax, Axis>::run(out, in, uop_idx,
+                                                     smem_per_warp);
 }
 
 }  // namespace ark
diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp
index 3985a0500..e5559fc32 100644
--- a/ark/ops/ops_broadcast.cpp
+++ b/ark/ops/ops_broadcast.cpp
@@ -27,8 +27,7 @@ ModelOpBroadcast1::ModelOpBroadcast1(const std::string &type_name,
 std::string ModelOpBroadcast1::impl_name(const Json &config) const {
     check_fields_config(config, {"NumWarps", "Tile"});
     int num_warps = config.at("NumWarps");
-    auto &tile_shape = config.at("Tile");
-    Dims unit_out_dims{tile_shape[0], tile_shape[1]};
+    Dims unit_out_dims(config.at("Tile").get<std::vector<DimType>>());
 
     return function_name_string(
         pascal_to_snake(type()->type_name()),
diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp
index b259f99c8..a24b95d72 100644
--- a/ark/ops/ops_matmul.cpp
+++ b/ark/ops/ops_matmul.cpp
@@ -189,45 +189,55 @@ std::vector<ModelOpArg> ModelOpMatmul::impl_args([
 }
 
 static const Json get_default_config(const ArchRef arch,
-                                     const ModelDataType &data_type) {
+                                     const ModelDataType &data_type,
+                                     const Dims &mnk) {
+    if (data_type != FP32.ref() && data_type != FP16.ref() &&
+        data_type != BF16.ref()) {
+        ERR(InvalidUsageError,
+            "Unsupported data type: ", data_type->type_name());
+    }
+    if (!arch->belongs_to(ARCH_CUDA) && !arch->belongs_to(ARCH_ROCM)) {
+        ERR(InvalidUsageError, "Unsupported architecture: ", arch->name());
+    }
+    DimType tm = (mnk[0] > mnk[1]) ? 256 : 128;
+    DimType tn = (mnk[0] > mnk[1]) ? 128 : 256;
     if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP32.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP16.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 64}}};
+                {"TileShapeMNK", {tm, tn, 64}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == BF16.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 64}}};
+                {"TileShapeMNK", {tm, tn, 64}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP32.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 16}}};
+                {"TileShapeMNK", {tm, tn, 16}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP16.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) {
         return {{"NumWarps", 4},
-                {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"SramBytes", 24624},
+                {"TileShapeMNK", {tm, tn, 32}}};
     }
-    ERR(InvalidUsageError, "Unsupported arch and data type: ", arch->name(),
-        " and ", data_type->type_name());
+    ERR(InternalError, "Unexpected error");
     return {};
 }
 
 Json ModelOpMatmul::default_config(const ArchRef arch) const {
     auto result = result_tensors_[0];
-    Json config = get_default_config(arch, result->data_type());
     check_fields_args(args_, {"TransposeInput", "TransposeOther"});
     Dims mnk = calc_problem_size(read_tensors_[0]->padded_shape(),
                                  read_tensors_[1]->padded_shape(),
                                  args_.at("TransposeInput").value<bool>(),
                                  args_.at("TransposeOther").value<bool>());
+    Json config = get_default_config(arch, result->data_type(), mnk);
     size_t tile_x = config.at("TileShapeMNK")[0];
     size_t tile_y = config.at("TileShapeMNK")[1];
     if (mnk[0] % tile_x != 0 || mnk[1] % tile_y != 0) {

From a3114e45eea5d8c7929915e7ca1b1f9cc6ef1591 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 11 Jul 2024 00:04:40 +0000
Subject: [PATCH 32/79] fix

---
 ark/error.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ark/error.hpp b/ark/error.hpp
index e08acd975..5ad21824b 100644
--- a/ark/error.hpp
+++ b/ark/error.hpp
@@ -20,6 +20,7 @@ class BaseError : public std::runtime_error {
         _name(const std::string &msg) : BaseError(msg) {} \
     };
 
+REGISTER_ERROR_TYPE(InternalError)
 REGISTER_ERROR_TYPE(InvalidUsageError)
 REGISTER_ERROR_TYPE(NotFoundError)
 REGISTER_ERROR_TYPE(ModelError)

From 6116424e2a692a3cec2eb749565f1ae03637e5e6 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 11 Jul 2024 00:28:47 +0000
Subject: [PATCH 33/79] delete an unused file

---
 plan_gpu0.json | 2423 ------------------------------------------------
 1 file changed, 2423 deletions(-)
 delete mode 100644 plan_gpu0.json

diff --git a/plan_gpu0.json b/plan_gpu0.json
deleted file mode 100644
index cad05f774..000000000
--- a/plan_gpu0.json
+++ /dev/null
@@ -1,2423 +0,0 @@
-{
-  "Rank": 0,
-  "WorldSize": 1,
-  "Architecture": "ROCM_942",
-  "NumProcessors": 304,
-  "NumWarpsPerProcessor": 4,
-  "TaskInfos": [
-    {
-      "Id": 0,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [256,128,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 1,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Rope",
-          "Name": "rope",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,1,128],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 2,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Transpose",
-          "Name": "transpose",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,128],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 3,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [256,128,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 4,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Rope",
-          "Name": "rope_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,1,128],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 5,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Transpose",
-          "Name": "transpose_2",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":23,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,128],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 6,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_2",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [256,128,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 7,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Transpose",
-          "Name": "transpose_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,128],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 8,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_3",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [256,128,32],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 9,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "ScalarMul",
-          "Name": "mul",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Factor": {"FLOAT":0.0883883461356163}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [256,128],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 10,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "ReduceMax",
-          "Name": "reduce_max",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Axis": {"INT":3},
-            "KeepDim": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "ImplType": "WarpWise",
-            "SramBytes": 0,
-            "NumTasks": 65536
-          }
-        }
-      ]
-    },
-    {
-      "Id": 11,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Sub",
-          "Name": "sub",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 12,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Exp",
-          "Name": "exp",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 13,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "ReduceSum",
-          "Name": "reduce_sum",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Axis": {"INT":3},
-            "KeepDim": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "ImplType": "WarpWise",
-            "SramBytes": 0,
-            "NumTasks": 65536
-          }
-        }
-      ]
-    },
-    {
-      "Id": 14,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Div",
-          "Name": "div",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 15,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_4",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":false}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [256,128,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 16,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Transpose",
-          "Name": "transpose_3",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [8,128],
-            "NumTasks": 8192
-          }
-        }
-      ]
-    },
-    {
-      "Id": 17,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_5",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 18,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Cast",
-          "Name": "cast",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 19,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Mul",
-          "Name": "mul_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 20,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "ReduceMean",
-          "Name": "reduce_mean",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Axis": {"INT":2},
-            "KeepDim": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "ImplType": "WarpWise",
-            "SramBytes": 0,
-            "NumTasks": 2048
-          }
-        }
-      ]
-    },
-    {
-      "Id": 21,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Rsqrt",
-          "Name": "rsqrt",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [64,1],
-            "NumTasks": 32
-          }
-        }
-      ]
-    },
-    {
-      "Id": 22,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Mul",
-          "Name": "mul_2",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 23,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Mul",
-          "Name": "mul_3",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 24,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Cast",
-          "Name": "cast_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 25,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_6",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 26,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Rope",
-          "Name": "rope_2",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
-          }
-        }
-      ]
-    },
-    {
-      "Id": 27,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Transpose",
-          "Name": "transpose_4",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [8,128],
-            "NumTasks": 8192
-          }
-        }
-      ]
-    },
-    {
-      "Id": 28,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_7",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 29,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Rope",
-          "Name": "rope_3",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [1,64],
-            "NumTasks": 131072
-          }
-        }
-      ]
-    },
-    {
-      "Id": 30,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Transpose",
-          "Name": "transpose_6",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,3,1]}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [8,8],
-            "NumTasks": 131072
-          }
-        }
-      ]
-    },
-    {
-      "Id": 31,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_8",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 32,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Transpose",
-          "Name": "transpose_5",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [8,128],
-            "NumTasks": 8192
-          }
-        }
-      ]
-    },
-    {
-      "Id": 33,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_9",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":false}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 34,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "ScalarMul",
-          "Name": "mul_4",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Factor": {"FLOAT":0.0883883461356163}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 35,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "ReduceMax",
-          "Name": "reduce_max_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Axis": {"INT":3},
-            "KeepDim": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "ImplType": "WarpWise",
-            "SramBytes": 0,
-            "NumTasks": 65536
-          }
-        }
-      ]
-    },
-    {
-      "Id": 36,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Sub",
-          "Name": "sub_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 37,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Exp",
-          "Name": "exp_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 38,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "ReduceSum",
-          "Name": "reduce_sum_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Axis": {"INT":3},
-            "KeepDim": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "ImplType": "WarpWise",
-            "SramBytes": 0,
-            "NumTasks": 65536
-          }
-        }
-      ]
-    },
-    {
-      "Id": 39,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Div",
-          "Name": "div_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 4096
-          }
-        }
-      ]
-    },
-    {
-      "Id": 40,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_10",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":false}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [256,128,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 41,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Transpose",
-          "Name": "transpose_7",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Permutation": {"DIMS":[0,2,1,3]}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [8,128],
-            "NumTasks": 8192
-          }
-        }
-      ]
-    },
-    {
-      "Id": 42,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_11",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 43,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Add",
-          "Name": "add",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 44,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Cast",
-          "Name": "cast_2",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 45,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Mul",
-          "Name": "mul_5",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 46,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "ReduceMean",
-          "Name": "reduce_mean_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "Axis": {"INT":2},
-            "KeepDim": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 1,
-            "ImplType": "WarpWise",
-            "SramBytes": 0,
-            "NumTasks": 2048
-          }
-        }
-      ]
-    },
-    {
-      "Id": 47,
-      "NumWarps": 1,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Rsqrt",
-          "Name": "rsqrt_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 1,
-            "SramBytes": 0,
-            "Tile": [64,1],
-            "NumTasks": 32
-          }
-        }
-      ]
-    },
-    {
-      "Id": 48,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Mul",
-          "Name": "mul_6",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 49,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Mul",
-          "Name": "mul_7",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 50,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Cast",
-          "Name": "cast_3",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 51,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_12",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 688
-          }
-        }
-      ]
-    },
-    {
-      "Id": 52,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Sigmoid",
-          "Name": "sigmoid",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 688
-          }
-        }
-      ]
-    },
-    {
-      "Id": 53,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Mul",
-          "Name": "mul_8",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 688
-          }
-        }
-      ]
-    },
-    {
-      "Id": 54,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_13",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 688
-          }
-        }
-      ]
-    },
-    {
-      "Id": 55,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Mul",
-          "Name": "mul_9",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 688
-          }
-        }
-      ]
-    },
-    {
-      "Id": 56,
-      "NumWarps": 4,
-      "SramBytes": 24672,
-      "Ops": [
-        {
-          "Type": "Matmul",
-          "Name": "matmul_14",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {
-            "TransposeInput": {"BOOL":false},
-            "TransposeOther": {"BOOL":true}
-          },
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "TileShapeMNK": [128,256,32],
-            "NumTasks": 256
-          }
-        }
-      ]
-    },
-    {
-      "Id": 57,
-      "NumWarps": 4,
-      "SramBytes": 0,
-      "Ops": [
-        {
-          "Type": "Add",
-          "Name": "add_1",
-          "IsVirtual": false,
-          "ReadTensors": [
-            {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-            {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "WriteTensors": [
-            {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "ResultTensors": [
-            {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-          ],
-          "Args": {},
-          "Config": {
-            "NumWarps": 4,
-            "SramBytes": 0,
-            "Tile": [128,256],
-            "NumTasks": 256
-          }
-        }
-      ]
-    }
-  ],
-  "ProcessorGroups": [
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,86],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":0,"TaskRange":[0,256],"Granularity":1},
-            {"TaskId":1,"TaskRange":[0,256],"Granularity":1},
-            {"TaskId":2,"TaskRange":[0,256],"Granularity":1}
-          ]
-        },
-        {
-          "ProcessorRange": [86,172],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":3,"TaskRange":[0,256],"Granularity":1},
-            {"TaskId":4,"TaskRange":[0,256],"Granularity":1},
-            {"TaskId":5,"TaskRange":[0,256],"Granularity":1}
-          ]
-        },
-        {
-          "ProcessorRange": [172,258],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":6,"TaskRange":[0,256],"Granularity":1},
-            {"TaskId":7,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":8,"TaskRange":[0,4096],"Granularity":1},
-            {"TaskId":9,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":10,"TaskRange":[0,65536],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":11,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":12,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":13,"TaskRange":[0,65536],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":14,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":15,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":16,"TaskRange":[0,8192],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":17,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":18,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":19,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":20,"TaskRange":[0,2048],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,32],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,32],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":21,"TaskRange":[0,32],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":22,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":23,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":24,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":25,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":26,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":27,"TaskRange":[0,8192],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":28,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":29,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":30,"TaskRange":[0,131072],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":31,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":32,"TaskRange":[0,8192],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":33,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":34,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":35,"TaskRange":[0,65536],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":36,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":37,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":38,"TaskRange":[0,65536],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":39,"TaskRange":[0,4096],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":40,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":41,"TaskRange":[0,8192],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":42,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":43,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":44,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":45,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":46,"TaskRange":[0,2048],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,32],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,32],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":47,"TaskRange":[0,32],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":48,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":49,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":50,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":51,"TaskRange":[0,688],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":52,"TaskRange":[0,688],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":53,"TaskRange":[0,688],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":54,"TaskRange":[0,688],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":55,"TaskRange":[0,688],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,256],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,256],
-          "WarpRange": [0,4],
-          "SramRange": [0,24672],
-          "TaskGroups": [
-            {"TaskId":56,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    },
-    {
-      "ProcessorRange": [0,304],
-      "ResourceGroups": [
-        {
-          "ProcessorRange": [0,304],
-          "WarpRange": [0,4],
-          "SramRange": [0,0],
-          "TaskGroups": [
-            {"TaskId":57,"TaskRange":[0,256],"Granularity":1}
-          ]
-        }
-      ]
-    }
-  ]
-}

From 67e3b2601f00997d6debe8f9dd3e7c633ceee08b Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 11 Jul 2024 01:44:53 +0000
Subject: [PATCH 34/79] update test

---
 ark/ops/ops_scalar_test.cpp | 43 +++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/ark/ops/ops_scalar_test.cpp b/ark/ops/ops_scalar_test.cpp
index 9e9e635b8..6ae0022f0 100644
--- a/ark/ops/ops_scalar_test.cpp
+++ b/ark/ops/ops_scalar_test.cpp
@@ -263,31 +263,28 @@ ark::unittest::State test_scalar_mul_fp16_offset() {
     {
         ark::Model m;
         ark::Tensor buf = m.tensor({1024}, ark::FP16);
-        ark::Tensor tns = m.refer(buf, {2}, {1024}, {3});
-        ark::Tensor out = m.mul(tns, 2, tns);
-
-        ark::DefaultExecutor exe(m);
-        exe.compile();
+        ark::Tensor tns = m.refer(buf, {2}, {1024}, {6});
+        ark::Tensor doubled = m.mul(tns, 2, tns);
+        ark::Tensor out = m.identity(buf, {doubled});
 
         std::vector<ark::half_t> data(1024, ark::half_t(2));
-        exe.tensor_write(buf, data);
-
-        exe.launch();
-        exe.run(1);
-        exe.stop();
-
-        data.clear();
-        data.resize(1024);
-
-        exe.tensor_read(buf, data);
-
-        for (size_t i = 0; i < data.size(); ++i) {
-            if (i == 3 || i == 4) {
-                UNITTEST_EQ(data[i], 4);
-            } else {
-                UNITTEST_EQ(data[i], 2);
-            }
-        }
+        auto result = ark::op_test(
+            "scalar_mul_fp16_offset", m, {buf}, {out},
+            [](std::vector<void *> &outputs, const std::vector<ark::Dims> &,
+               const std::vector<void *> &, const std::vector<ark::Dims> &,
+               int) {
+                ark::half_t *out = static_cast<ark::half_t *>(outputs[0]);
+                for (size_t i = 0; i < 1024; ++i) {
+                    if (i == 6 || i == 7) {
+                        out[i] = 4;
+                    } else {
+                        out[i] = 2;
+                    }
+                }
+            },
+            {data.data()});
+        UNITTEST_LOG(result);
+        UNITTEST_EQ(result.max_diff[0], 0.0f);
     }
     return ark::unittest::SUCCESS;
 }

From e1f178bd3c7bbb0023e1ffc3eceee72564116d10 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 12 Jul 2024 04:37:51 +0000
Subject: [PATCH 35/79] fix merge & updates

---
 ark/api/executor.cpp               |  3 +--
 python/ark/runtime.py              |  8 ++++----
 python/ark/tensor.py               | 17 ++++++++++-------
 python/executor_py.cpp             |  2 +-
 python/unittest/unittest_common.py | 22 ++++++++++++++++++++++
 5 files changed, 38 insertions(+), 14 deletions(-)
 create mode 100644 python/unittest/unittest_common.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 1af298e89..ad6cb8550 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -233,7 +233,6 @@ void Executor::Impl::init(const PlanJson &plan_json) {
     if (world_size_ > 1) {
         init_communicator();
     }
-}
 
     auto gpu_manager = GpuManager::get_instance(device_id_);
 
@@ -384,7 +383,7 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             continue;
         }
         if (buf_info->buffer->is_external()) {
-            if (buf_info->buffer->device_id() != gpu_id_) {
+            if (buf_info->buffer->device_id() != device_id_) {
                 ERR(InvalidUsageError,
                     "PyTorch tensor and model execution are on different GPUs");
             }
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 93acb6bf8..1e56fe1ca 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -142,7 +142,7 @@ def launch(
         initialized. The executor will compile the cuda kernels and launch the ARK runtime.
         """
         if self.launched():
-            logging.warn(
+            logging.warning(
                 f"Runtime {self.runtime_id} is already launched, skip launching"
             )
             return
@@ -153,7 +153,7 @@ def launch(
         if self.state == Runtime.State.Init:
             if self.executor is not None:
                 if not self.executor.destroyed():
-                    logging.warn(
+                    logging.warning(
                         f"Runtime {self.runtime_id}, has already been launched. Destroying the old executor"
                     )
                     self.executor.destroy()
@@ -184,7 +184,7 @@ def wait(self):
         Wait for the kernel to finish.
         """
         if self.state != Runtime.State.Running:
-            logging.warn(
+            logging.warning(
                 f"ARK runtime {self.runtime_id} is not running, skip waiting"
             )
             return
@@ -197,7 +197,7 @@ def stop(self) -> float:
         Once this is called, we need to call `launch()` again to run the model again.
         """
         if not self.launched():
-            logging.warn(
+            logging.warning(
                 f"ARK runtime {self.runtime_id} is never launched, skip stopping"
             )
             return
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index e377cf852..335020769 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -103,7 +103,7 @@ def to_numpy(
         return ndarray
 
     def to_torch(
-        self, tensor: torch.Tensor = None, runtime_id: int = -1
+        self, tensor: torch.Tensor = None, stream: int = 0
     ) -> torch.Tensor:
         """ """
         if _no_torch:
@@ -116,21 +116,24 @@ def to_torch(
             )
         torch_type = self.dtype().to_torch()
         if tensor is None:
-            dev_name = f"cuda:{rt.executor.gpu_id()}"
+            dev_name = f"cuda:{rt.executor.device_id()}"
             tensor = torch.zeros(
                 self.shape(), dtype=torch_type, device=torch.device(dev_name)
             )
-        elif tensor.shape != self.shape():
-            raise ValueError("torch tensor shape does not match the tensor")
+        elif list(tensor.shape) != self.shape():
+            raise ValueError(f"torch tensor shape {list(tensor.shape)} "
+                             f"does not match the tensor {self.shape()}")
         elif tensor.dtype != torch_type:
-            raise ValueError("torch tensor dtype does not match the tensor")
+            raise ValueError(f"torch tensor dtype {tensor.dtype} "
+                             f"does not match the tensor {torch_type}")
         elif not tensor.is_contiguous():
             raise ValueError("torch tensor is not contiguous in memory")
         elif tensor.numel() != self.nelems():
-            raise ValueError("torch tensor size does not match the tensor")
+            raise ValueError(f"torch tensor size {tensor.numel()} "
+                             f"does not match the tensor {self.nelems()}")
         tensor_bytes = self.nelems() * self.dtype().element_size()
         rt.executor.tensor_read(
-            self._tensor, tensor.data_ptr(), tensor_bytes, True
+            self._tensor, tensor.data_ptr(), tensor_bytes, stream, True
         )
         return tensor
 
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index fffbb2c30..8455fa585 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -93,7 +93,7 @@ static DLManagedTensor *to_dlpack(ark::Executor &exe,
         tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0];
     dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes();
     dl_tensor.device.device_type = get_device_type();
-    dl_tensor.device.device_id = static_cast<int32_t>(exe.gpu_id());
+    dl_tensor.device.device_id = static_cast<int32_t>(exe.device_id());
     dl_tensor.ndim = static_cast<int32_t>(tensor.shape().ndims());
     dl_tensor.dtype = get_dl_dtype(tensor.data_type());
 
diff --git a/python/unittest/unittest_common.py b/python/unittest/unittest_common.py
new file mode 100644
index 000000000..9548410b5
--- /dev/null
+++ b/python/unittest/unittest_common.py
@@ -0,0 +1,22 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import pytest
+import ark
+
+
+def pytest_ark(need_torch: bool = False):
+    """
+    Decorator for ARK unit tests.
+    """
+    def decorator(test_func):
+        if need_torch:
+            try:
+                import torch
+            except ImportError:
+                return pytest.mark.skip(reason="torch is not installed")(test_func)
+        def wrapper(*args, **kwargs):
+            ark.init()
+            test_func(*args, **kwargs)
+        return wrapper
+    return decorator

From ce1959ecb5fb064b4e653b3cad7cf3dcba63a9d7 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Fri, 12 Jul 2024 06:49:30 +0000
Subject: [PATCH 36/79] Add `loop_mode` argument

---
 ark/api/executor.cpp                       | 116 ++++++++++++++-------
 ark/api/planner.cpp                        |   2 +-
 ark/codegen.cpp                            |   2 +-
 ark/gpu/{gpu.h => gpu.hpp}                 |   7 +-
 ark/gpu/gpu_compile.cpp                    |   4 +-
 ark/gpu/{gpu_compile.h => gpu_compile.hpp} |   6 +-
 ark/gpu/gpu_event.cpp                      |   6 +-
 ark/gpu/{gpu_event.h => gpu_event.hpp}     |   8 +-
 ark/gpu/gpu_kernel.cpp                     |  33 ++----
 ark/gpu/{gpu_kernel.h => gpu_kernel.hpp}   |  19 ++--
 ark/gpu/gpu_kernel_test.cpp                |   8 +-
 ark/gpu/{gpu_logging.h => gpu_logging.hpp} |   8 +-
 ark/gpu/gpu_manager.cpp                    |   4 +-
 ark/gpu/{gpu_manager.h => gpu_manager.hpp} |  14 +--
 ark/gpu/gpu_memory.cpp                     |   8 +-
 ark/gpu/{gpu_memory.h => gpu_memory.hpp}   |  10 +-
 ark/gpu/gpu_stream.cpp                     |   6 +-
 ark/gpu/{gpu_stream.h => gpu_stream.hpp}   |   8 +-
 ark/include/ark/executor.hpp               |   4 +-
 ark/include/kernels/kernel_template.in     |  17 ++-
 ark/ops/ops_matmul_test.cpp                |   2 +-
 ark/ops/ops_test_common.cpp                |   2 +-
 python/ark/runtime.py                      |   4 +-
 python/executor_py.cpp                     |   8 +-
 24 files changed, 173 insertions(+), 133 deletions(-)
 rename ark/gpu/{gpu.h => gpu.hpp} (98%)
 rename ark/gpu/{gpu_compile.h => gpu_compile.hpp} (78%)
 rename ark/gpu/{gpu_event.h => gpu_event.hpp} (84%)
 rename ark/gpu/{gpu_kernel.h => gpu_kernel.hpp} (68%)
 rename ark/gpu/{gpu_logging.h => gpu_logging.hpp} (92%)
 rename ark/gpu/{gpu_manager.h => gpu_manager.hpp} (88%)
 rename ark/gpu/{gpu_memory.h => gpu_memory.hpp} (87%)
 rename ark/gpu/{gpu_stream.h => gpu_stream.hpp} (79%)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 2f50a4280..91c8e39de 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -14,11 +14,11 @@
 #include "codegen.hpp"
 #include "env.h"
 #include "file_io.h"
-#include "gpu/gpu.h"
-#include "gpu/gpu_event.h"
-#include "gpu/gpu_kernel.h"
-#include "gpu/gpu_logging.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu.hpp"
+#include "gpu/gpu_event.hpp"
+#include "gpu/gpu_kernel.hpp"
+#include "gpu/gpu_logging.hpp"
+#include "gpu/gpu_manager.hpp"
 #include "logging.h"
 #include "model/model_buffer.hpp"
 #include "model/model_data_type.hpp"
@@ -140,7 +140,7 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 
 class Executor::Impl {
    public:
-    Impl(int device_id, Stream stream, const std::string &name);
+    Impl(int device_id, Stream stream, const std::string &name, bool loop_mode);
     ~Impl() = default;
 
     void init(const PlanJson& plan);
@@ -174,6 +174,8 @@ class Executor::Impl {
    protected:
     int device_id_;
     std::string name_;
+    bool loop_mode_;
+
     gpuStream stream_raw_;
 
     int rank_;
@@ -203,8 +205,9 @@ class Executor::Impl {
         rank_to_sm_channels_;
 };
 
-Executor::Impl::Impl(int device_id, Stream stream, const std::string &name)
-    : device_id_(device_id), name_(name) {
+Executor::Impl::Impl(int device_id, Stream stream, const std::string &name,
+                     bool loop_mode)
+    : device_id_(device_id), name_(name), loop_mode_(loop_mode) {
     if (device_id < 0) {
         ERR(InvalidUsageError, "Invalid device ID ", device_id);
     }
@@ -251,7 +254,6 @@ void Executor::Impl::init(const PlanJson &plan_json) {
     int threads_per_block = static_cast<int>(
         codegen_->num_warps_per_proc() * gpu_manager->info().threads_per_warp);
     int num_sm = static_cast<int>(codegen_->num_procs());
-    int *flag = flag_->ref<int>();
     size_t smem_block_total =
         static_cast<size_t>(gpu_manager->info().smem_block_total);
 
@@ -260,11 +262,19 @@ void Executor::Impl::init(const PlanJson &plan_json) {
         init_channels(remote_ranks);
     }
 
+    std::string kernel_name;
+    if (loop_mode_) {
+        kernel_name = "ark_loop_kernel";
+    } else {
+        kernel_name = "ark_kernel";
+    }
+    if (!name_.empty()) {
+        kernel_name += "_" + name_;
+    }
+
     kernel_ = std::shared_ptr<GpuKernel>(new GpuKernel(
         device_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1},
-        std::max(smem_block_total, size_t(4)), name_,
-        {std::pair<void *, size_t>{buffer_->ref(), sizeof(buffer_->ref())},
-         std::pair<void *, size_t>{flag, sizeof(flag)}}));
+        std::max(smem_block_total, size_t(4)), kernel_name));
 }
 
 void Executor::Impl::init_communicator() {
@@ -669,51 +679,76 @@ void Executor::Impl::launch(int64_t max_spin_count) {
         proxy_service_->startProxy();
     }
 
-    // Initialize loop flags.
-    atomicStoreRelaxed(flag_->ref<int>(), 0);
-    kernel_->launch(stream_raw_);
-    timer_end_->record(stream_raw_);
+    if (loop_mode_) {
+        // Initialize loop flags.
+        atomicStoreRelaxed(flag_->ref<int>(), 0);
+        void *buf_ptr = buffer_->ref();
+        void *flag_ptr = flag_->ref();
+        std::vector<void *> args = {&buf_ptr, &flag_ptr};
+        kernel_->launch(stream_raw_, args);
+    }
     is_recording_ = true;
     is_launched_ = true;
 }
 
 void Executor::Impl::run(int iter) {
-    if (iter > 0) {
+    if (iter <= 0) return;
+    if (loop_mode_) {
         while (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
         }
         atomicStoreRelaxed(flag_->ref<int>(), iter);
+    } else {
+        void *buf_ptr = buffer_->ref();
+        int i = 0;
+        std::vector<void *> args = {&buf_ptr, reinterpret_cast<void *>(&i)};
+        for (; i < iter; i++) {
+            kernel_->launch(stream_raw_, args);
+        }
     }
 }
 
 void Executor::Impl::wait(int64_t max_spin_count) {
     int64_t cnt = max_spin_count;
-    while (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
-        if (cnt-- > 0) {
-            continue;
-        }
-        // Check if the kernel encountered an error.
-        gpuError res = gpuStreamQuery(stream_raw_);
-        if (res == gpuSuccess) {
-            if (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
-                LOG(WARN, "Stream is finished but the loop flag is still set.");
-                break;
+    if (loop_mode_) {
+        while (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
+            if (cnt-- > 0) {
+                continue;
+            }
+            // Check if the kernel encountered an error.
+            gpuError res = gpuStreamQuery(stream_raw_);
+            if (res == gpuSuccess) {
+                if (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
+                    LOG(WARN,
+                        "Stream is finished but the loop flag is still set.");
+                    break;
+                } else {
+                    LOG(WARN,
+                        "wait() is delayed by a stream query. Regarding "
+                        "timing measurements may be inaccurate.");
+                    break;
+                }
+            } else if (res == gpuErrorNotReady) {
+                cnt = max_spin_count;
             } else {
-                LOG(WARN,
-                    "wait() is delayed by a stream query. Regarding "
-                    "timing measurements may be inaccurate.");
-                break;
+                GLOG(res);
             }
-        } else if (res == gpuErrorNotReady) {
-            cnt = max_spin_count;
-        } else {
-            GLOG(res);
         }
+    } else {
+        if (max_spin_count >= 0) {
+            LOG(WARN, "max_spin_count is ignored in non-loop mode.");
+        }
+        GLOG(gpuStreamSynchronize(stream_raw_));
     }
 }
 
 float Executor::Impl::stop(int64_t max_spin_count) {
     this->wait(max_spin_count);
-    atomicStoreRelaxed(flag_->ref<int>(), -1);
+    if (is_recording_) {
+        timer_end_->record(stream_raw_);
+    }
+    if (loop_mode_) {
+        atomicStoreRelaxed(flag_->ref<int>(), -1);
+    }
     GLOG(gpuStreamSynchronize(stream_raw_));
     if (is_recording_) {
         elapsed_msec_ = timer_end_->elapsed_msec(*timer_begin_);
@@ -847,8 +882,9 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
 }
 
 Executor::Executor(int device_id, Stream stream, const std::string &name,
-                   const std::string &plan)
-    : impl_(std::make_unique<Executor::Impl>(device_id, stream, name)) {
+                   const std::string &plan, bool loop_mode)
+    : impl_(std::make_unique<Executor::Impl>(device_id, stream, name,
+                                             loop_mode)) {
     auto &plan_path = get_env().enforce_plan_path;
     if (!plan_path.empty()) {
         LOG(INFO, "Enforce executor plan path: ", plan_path);
@@ -901,10 +937,10 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes,
 DefaultExecutor::DefaultExecutor(
     const Model &model, int device_id, Stream stream,
     const std::vector<DefaultPlanner::ConfigRule> &config_rules,
-    const std::string &name)
+    const std::string &name, bool loop_mode)
     : Executor((device_id < 0) ? (model.rank() % get_env().num_ranks_per_host)
                                : device_id,
-               stream, name, "") {
+               stream, name, "", loop_mode) {
     DefaultPlanner planner(model, impl_->device_id());
     for (const auto &rule : config_rules) {
         planner.install_config_rule(rule);
diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index 5c9d09f2e..d7fdbf807 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -6,7 +6,7 @@
 #include "ark/model.hpp"
 #include "env.h"
 #include "file_io.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu_manager.hpp"
 #include "model/model_json.hpp"
 #include "model/model_node.hpp"
 #include "model/model_op.hpp"
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index cd6206284..02a5d9ad9 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -174,7 +174,7 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
         {"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)},
         {"@DEFINITIONS@", definitions_ss.str()},
         {"@BODY@", body_ss.str()},
-        {"@NAME@", name_},
+        {"@NAME@", (name_.empty() ? "" : "_" + name_)},
     };
     code_ = replace(template_code, replacements);
 }
diff --git a/ark/gpu/gpu.h b/ark/gpu/gpu.hpp
similarity index 98%
rename from ark/gpu/gpu.h
rename to ark/gpu/gpu.hpp
index 2f1eba3ba..531d6c7ee 100644
--- a/ark/gpu/gpu.h
+++ b/ark/gpu/gpu.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_H_
-#define ARK_GPU_H_
+#ifndef ARK_GPU_HPP_
+#define ARK_GPU_HPP_
 
 #include <functional>
 
@@ -125,6 +125,7 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops,
 // runtime API
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString,
                           hipGetErrorString);
+ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute,
                           hipDeviceGetAttribute);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize,
@@ -183,4 +184,4 @@ ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerSetAttribute, cuPointerSetAttribute,
 
 }  // namespace ark
 
-#endif  // ARK_GPU_H_
+#endif  // ARK_GPU_HPP_
diff --git a/ark/gpu/gpu_compile.cpp b/ark/gpu/gpu_compile.cpp
index b1c078af4..11e172f07 100644
--- a/ark/gpu/gpu_compile.cpp
+++ b/ark/gpu/gpu_compile.cpp
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu/gpu_compile.h"
+#include "gpu/gpu_compile.hpp"
 
 #include <sys/types.h>
 #include <sys/wait.h>
@@ -22,7 +22,7 @@
 #include "cpu_timer.h"
 #include "env.h"
 #include "file_io.h"
-#include "gpu/gpu_logging.h"
+#include "gpu/gpu_logging.hpp"
 #include "utils/utils_string.hpp"
 
 #define ARK_DEBUG_KERNEL 0
diff --git a/ark/gpu/gpu_compile.h b/ark/gpu/gpu_compile.hpp
similarity index 78%
rename from ark/gpu/gpu_compile.h
rename to ark/gpu/gpu_compile.hpp
index 58048e78c..8b9e1a9fd 100644
--- a/ark/gpu/gpu_compile.h
+++ b/ark/gpu/gpu_compile.hpp
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_COMPILE_H_
-#define ARK_GPU_COMPILE_H_
+#ifndef ARK_GPU_COMPILE_HPP_
+#define ARK_GPU_COMPILE_HPP_
 
 #include <string>
 #include <vector>
@@ -16,4 +16,4 @@ const std::string gpu_compile(const std::vector<std::string> &codes,
 
 }  // namespace ark
 
-#endif  // ARK_GPU_COMPILE_H_
+#endif  // ARK_GPU_COMPILE_HPP_
diff --git a/ark/gpu/gpu_event.cpp b/ark/gpu/gpu_event.cpp
index cbc45d9a6..06779b91a 100644
--- a/ark/gpu/gpu_event.cpp
+++ b/ark/gpu/gpu_event.cpp
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu/gpu_event.h"
+#include "gpu/gpu_event.hpp"
 
-#include "gpu/gpu_logging.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu_logging.hpp"
+#include "gpu/gpu_manager.hpp"
 
 namespace ark {
 class GpuEvent::Impl {
diff --git a/ark/gpu/gpu_event.h b/ark/gpu/gpu_event.hpp
similarity index 84%
rename from ark/gpu/gpu_event.h
rename to ark/gpu/gpu_event.hpp
index 081f0203b..bd2a7c952 100644
--- a/ark/gpu/gpu_event.h
+++ b/ark/gpu/gpu_event.hpp
@@ -1,12 +1,12 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_EVENT_H_
-#define ARK_GPU_EVENT_H_
+#ifndef ARK_GPU_EVENT_HPP_
+#define ARK_GPU_EVENT_HPP_
 
 #include <memory>
 
-#include "gpu/gpu.h"
+#include "gpu/gpu.hpp"
 
 namespace ark {
 
@@ -33,4 +33,4 @@ class GpuEvent {
 };
 }  // namespace ark
 
-#endif  // ARK_GPU_EVENT_H_
+#endif  // ARK_GPU_EVENT_HPP_
diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp
index 46f467f51..d4412f80e 100644
--- a/ark/gpu/gpu_kernel.cpp
+++ b/ark/gpu/gpu_kernel.cpp
@@ -1,50 +1,38 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu_kernel.h"
+#include "gpu_kernel.hpp"
 
 #include <cassert>
 #include <cstring>
 
-#include "gpu.h"
-#include "gpu_compile.h"
-#include "gpu_logging.h"
-#include "gpu_manager.h"
+#include "gpu.hpp"
+#include "gpu_compile.hpp"
+#include "gpu_logging.hpp"
+#include "gpu_manager.hpp"
 
 namespace ark {
 
 GpuKernel::GpuKernel(int gpu_id, const std::string& code,
                      const std::array<int, 3>& block_dim,
                      const std::array<int, 3>& grid_dim, size_t smem_bytes,
-                     const std::string& kernel_name,
-                     std::initializer_list<std::pair<void*, size_t>> args) {
-    this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name,
-               args);
+                     const std::string& kernel_name) {
+    this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name);
 }
 
 void GpuKernel::init(int gpu_id, const std::string& code,
                      const std::array<int, 3>& block_dim,
                      const std::array<int, 3>& grid_dim, size_t smem_bytes,
-                     const std::string& kernel_name,
-                     std::initializer_list<std::pair<void*, size_t>> args) {
+                     const std::string& kernel_name) {
     gpu_manager_ = GpuManager::get_instance(gpu_id);
     code_ = code;
     block_dim_ = block_dim;
     grid_dim_ = grid_dim;
     smem_bytes_ = smem_bytes;
     kernel_name_ = kernel_name;
-    params_ptr_.resize(args.size());
-    args_.resize(args.size());
     if (kernel_name_.size() == 0) {
         ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_);
     }
-    size_t idx = 0;
-    for (auto& pair : args) {
-        args_[idx].reset(new uint8_t[pair.second]);
-        std::memcpy(args_[idx].get(), &(pair.first), pair.second);
-        params_ptr_[idx] = static_cast<void*>(args_[idx].get());
-        idx++;
-    }
 }
 
 void GpuKernel::compile() {
@@ -68,12 +56,13 @@ void GpuKernel::compile() {
                                  dynamic_smem_size_bytes));
 }
 
-void GpuKernel::launch(gpuStream stream) {
+void GpuKernel::launch(gpuStream stream, std::vector<void*>& args) {
     if (!this->is_compiled()) {
         ERR(InvalidUsageError, "Kernel is not compiled yet.");
     }
     gpu_manager_->launch(function_, grid_dim_, block_dim_, smem_bytes_, stream,
-                         params_ptr_.data(), nullptr);
+                         args.data(), nullptr);
+    GLOG(gpuGetLastError());
 }
 
 gpuDeviceptr GpuKernel::get_global(const std::string& name,
diff --git a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.hpp
similarity index 68%
rename from ark/gpu/gpu_kernel.h
rename to ark/gpu/gpu_kernel.hpp
index b3be79071..5308cfead 100644
--- a/ark/gpu/gpu_kernel.h
+++ b/ark/gpu/gpu_kernel.hpp
@@ -1,13 +1,14 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_KERNEL_H_
-#define ARK_GPU_KERNEL_H_
+#ifndef ARK_GPU_KERNEL_HPP_
+#define ARK_GPU_KERNEL_HPP_
 
 #include <memory>
 #include <string>
+#include <vector>
 
-#include "gpu_stream.h"
+#include "gpu_stream.hpp"
 
 namespace ark {
 
@@ -18,16 +19,14 @@ class GpuKernel {
     GpuKernel(int gpu_id, const std::string& codes,
               const std::array<int, 3>& block_dim,
               const std::array<int, 3>& grid_dim, size_t smem_bytes,
-              const std::string& kernel_name,
-              std::initializer_list<std::pair<void*, size_t>> args = {});
+              const std::string& kernel_name);
 
     void init(int gpu_id, const std::string& codes,
               const std::array<int, 3>& block_dim,
               const std::array<int, 3>& grid_dim, size_t smem_bytes,
-              const std::string& kernel_name,
-              std::initializer_list<std::pair<void*, size_t>> args = {});
+              const std::string& kernel_name);
     void compile();
-    void launch(gpuStream stream);
+    void launch(gpuStream stream, std::vector<void*>& args);
 
     gpuDeviceptr get_global(const std::string& name,
                             bool ignore_not_found = false) const;
@@ -43,10 +42,8 @@ class GpuKernel {
     std::string bin_;
     gpuModule module_;
     gpuFunction function_ = nullptr;
-    std::vector<void*> params_ptr_;
-    std::vector<std::shared_ptr<uint8_t[]>> args_;
 };
 
 }  // namespace ark
 
-#endif  // ARK_GPU_KERNEL_H_
+#endif  // ARK_GPU_KERNEL_HPP_
diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp
index 870ad7ab9..342ef9656 100644
--- a/ark/gpu/gpu_kernel_test.cpp
+++ b/ark/gpu/gpu_kernel_test.cpp
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu/gpu_kernel.h"
+#include "gpu/gpu_kernel.hpp"
 
 #include "unittest/unittest_utils.h"
 
@@ -9,7 +9,13 @@ const std::string void_kernel = "extern \"C\" __global__ void kernel() {}";
 
 ark::unittest::State test_gpu_kernel() {
     ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel");
+    UNITTEST_TRUE(!kernel.is_compiled());
     kernel.compile();
+    UNITTEST_TRUE(kernel.is_compiled());
+    std::vector<void*> args;
+    for (int i = 0; i < 10; i++) {
+        kernel.launch(nullptr, args);
+    }
     return ark::unittest::SUCCESS;
 }
 
diff --git a/ark/gpu/gpu_logging.h b/ark/gpu/gpu_logging.hpp
similarity index 92%
rename from ark/gpu/gpu_logging.h
rename to ark/gpu/gpu_logging.hpp
index b14435b8b..5e35cc003 100644
--- a/ark/gpu/gpu_logging.h
+++ b/ark/gpu/gpu_logging.hpp
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_LOGGING_H_
-#define ARK_GPU_LOGGING_H_
+#ifndef ARK_GPU_LOGGING_HPP_
+#define ARK_GPU_LOGGING_HPP_
 
-#include "gpu/gpu.h"
+#include "gpu/gpu.hpp"
 #include "logging.h"
 
 #define GLOG(cmd)                                           \
@@ -29,4 +29,4 @@
         }                                                                      \
     } while (0)
 
-#endif  // ARK_GPU_LOGGING_H_
+#endif  // ARK_GPU_LOGGING_HPP_
diff --git a/ark/gpu/gpu_manager.cpp b/ark/gpu/gpu_manager.cpp
index fc841fa32..572932e35 100644
--- a/ark/gpu/gpu_manager.cpp
+++ b/ark/gpu/gpu_manager.cpp
@@ -1,11 +1,11 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu_manager.hpp"
 
 #include <unordered_map>
 
-#include "gpu/gpu_logging.h"
+#include "gpu/gpu_logging.hpp"
 #include "utils/utils_string.hpp"
 
 namespace ark {
diff --git a/ark/gpu/gpu_manager.h b/ark/gpu/gpu_manager.hpp
similarity index 88%
rename from ark/gpu/gpu_manager.h
rename to ark/gpu/gpu_manager.hpp
index 93a48cf7b..eeeda4d94 100644
--- a/ark/gpu/gpu_manager.h
+++ b/ark/gpu/gpu_manager.hpp
@@ -1,16 +1,16 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_MANAGER_H_
-#define ARK_GPU_MANAGER_H_
+#ifndef ARK_GPU_MANAGER_HPP_
+#define ARK_GPU_MANAGER_HPP_
 
 #include <memory>
 
 #include "arch.hpp"
-#include "gpu/gpu.h"
-#include "gpu/gpu_event.h"
-#include "gpu/gpu_memory.h"
-#include "gpu/gpu_stream.h"
+#include "gpu/gpu.hpp"
+#include "gpu/gpu_event.hpp"
+#include "gpu/gpu_memory.hpp"
+#include "gpu/gpu_stream.hpp"
 
 namespace ark {
 
@@ -62,4 +62,4 @@ class GpuManager {
 
 }  // namespace ark
 
-#endif  // ARK_GPU_MANAGER_H_
+#endif  // ARK_GPU_MANAGER_HPP_
diff --git a/ark/gpu/gpu_memory.cpp b/ark/gpu/gpu_memory.cpp
index 184db457c..9a854f521 100644
--- a/ark/gpu/gpu_memory.cpp
+++ b/ark/gpu/gpu_memory.cpp
@@ -1,11 +1,11 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu/gpu_memory.h"
+#include "gpu/gpu_memory.hpp"
 
-#include "gpu/gpu.h"
-#include "gpu/gpu_logging.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu.hpp"
+#include "gpu/gpu_logging.hpp"
+#include "gpu/gpu_manager.hpp"
 
 namespace ark {
 
diff --git a/ark/gpu/gpu_memory.h b/ark/gpu/gpu_memory.hpp
similarity index 87%
rename from ark/gpu/gpu_memory.h
rename to ark/gpu/gpu_memory.hpp
index cd7a6f04f..6b277d40b 100644
--- a/ark/gpu/gpu_memory.h
+++ b/ark/gpu/gpu_memory.hpp
@@ -1,13 +1,13 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_MEMORY_H_
-#define ARK_GPU_MEMORY_H_
+#ifndef ARK_GPU_MEMORY_HPP_
+#define ARK_GPU_MEMORY_HPP_
 
 #include <memory>
 #include <vector>
 
-#include "gpu/gpu.h"
+#include "gpu/gpu.hpp"
 
 namespace ark {
 
@@ -40,7 +40,7 @@ class GpuHostMemory {
     GpuHostMemory(const GpuHostMemory&) = delete;
     GpuHostMemory& operator=(const GpuHostMemory&) = delete;
 
-    template <typename T>
+    template <typename T = void>
     T* ref() const {
         return reinterpret_cast<T*>(ptr_);
     }
@@ -54,4 +54,4 @@ class GpuHostMemory {
 
 }  // namespace ark
 
-#endif  // ARK_GPU_MEMORY_H_
+#endif  // ARK_GPU_MEMORY_HPP_
diff --git a/ark/gpu/gpu_stream.cpp b/ark/gpu/gpu_stream.cpp
index 52502365a..17d4e21f5 100644
--- a/ark/gpu/gpu_stream.cpp
+++ b/ark/gpu/gpu_stream.cpp
@@ -1,10 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "gpu/gpu_stream.h"
+#include "gpu/gpu_stream.hpp"
 
-#include "gpu/gpu_logging.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu_logging.hpp"
+#include "gpu/gpu_manager.hpp"
 
 namespace ark {
 class GpuStream::Impl {
diff --git a/ark/gpu/gpu_stream.h b/ark/gpu/gpu_stream.hpp
similarity index 79%
rename from ark/gpu/gpu_stream.h
rename to ark/gpu/gpu_stream.hpp
index e76f01827..9d8775f95 100644
--- a/ark/gpu/gpu_stream.h
+++ b/ark/gpu/gpu_stream.hpp
@@ -1,12 +1,12 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#ifndef ARK_GPU_STREAM_H_
-#define ARK_GPU_STREAM_H_
+#ifndef ARK_GPU_STREAM_HPP_
+#define ARK_GPU_STREAM_HPP_
 
 #include <memory>
 
-#include "gpu/gpu.h"
+#include "gpu/gpu.hpp"
 
 namespace ark {
 
@@ -30,4 +30,4 @@ class GpuStream {
 };
 }  // namespace ark
 
-#endif  // ARK_GPU_STREAM_H_
+#endif  // ARK_GPU_STREAM_HPP_
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 75dc81c17..f0a108a1f 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -20,7 +20,7 @@ class Executor {
    public:
     /// Constructor.
     Executor(int device_id, Stream stream, const std::string &name,
-             const std::string &plan);
+             const std::string &plan, bool loop_mode = true);
 
     /// Destructor.
     ~Executor();
@@ -96,7 +96,7 @@ class DefaultExecutor : public Executor {
     DefaultExecutor(
         const Model &model, int device_id = -1, Stream stream = nullptr,
         const std::vector<DefaultPlanner::ConfigRule> &config_rules = {},
-        const std::string &name = "DefaultExecutor");
+        const std::string &name = "DefaultExecutor", bool loop_mode = true);
 };
 
 }  // namespace ark
diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in
index ea1862920..a8a56f141 100644
--- a/ark/include/kernels/kernel_template.in
+++ b/ark/include/kernels/kernel_template.in
@@ -33,12 +33,12 @@ __device__ sync::State ARK_LOOP_SYNC_STATE;
 
 @DEFINITIONS@
 
-__device__ void ark_loop_body(char *_buf, int _iter) {
+__device__ void ark_body(char *_buf, int _iter) {
 @BODY@
 }
 
 extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1)
-void @NAME@(char *_buf, int *_iter) {
+void ark_loop_kernel@NAME@(char *_buf, int *_iter) {
   int *shared_mem = (int *)_ARK_SMEM;
   for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {
     shared_mem[i] = 0;
@@ -52,10 +52,10 @@ void @NAME@(char *_buf, int *_iter) {
     sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
     if (ARK_ITER < 0) return;
 
-    ark_loop_body(_buf, 0);
+    ark_body(_buf, 0);
     for (int _i = 1; _i < ARK_ITER; ++_i) {
       sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
-      ark_loop_body(_buf, _i);
+      ark_body(_buf, _i);
     }
     if (threadIdx.x == 0) {
       __threadfence_system();
@@ -67,3 +67,12 @@ void @NAME@(char *_buf, int *_iter) {
     sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
   }
 }
+
+extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1)
+void ark_kernel@NAME@(char *_buf, int _iter) {
+  int *shared_mem = (int *)_ARK_SMEM;
+  for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {
+    shared_mem[i] = 0;
+  }
+  ark_body(_buf, _iter);
+}
diff --git a/ark/ops/ops_matmul_test.cpp b/ark/ops/ops_matmul_test.cpp
index 4304a19e2..6d09b54d6 100644
--- a/ark/ops/ops_matmul_test.cpp
+++ b/ark/ops/ops_matmul_test.cpp
@@ -3,7 +3,7 @@
 
 #include <algorithm>
 
-#include "gpu/gpu.h"
+#include "gpu/gpu.hpp"
 #include "logging.h"
 #include "model/model_node.hpp"
 #include "model/model_op.hpp"
diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp
index 60ffc9dc2..bec69c456 100644
--- a/ark/ops/ops_test_common.cpp
+++ b/ark/ops/ops_test_common.cpp
@@ -10,7 +10,7 @@
 #include "ark/planner.hpp"
 #include "ark/random.hpp"
 #include "env.h"
-#include "gpu/gpu_logging.h"
+#include "gpu/gpu_logging.hpp"
 #include "logging.h"
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 33db1fb5c..d54f85c36 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -101,12 +101,11 @@ def running(self) -> bool:
 
     def launch(
         self,
-        rank: int = 0,
-        world_size: int = 1,
         gpu_id: int = 0,
         plan: str = "",
         plan_path: str = "",
         stream: int = 0,
+        loop_mode: bool = True,
     ):
         """
         Create an executor and schedule the ARK model. The scheduler will generate
@@ -135,6 +134,7 @@ def launch(
                 stream,
                 "ArkRuntime",
                 plan,
+                loop_mode,
             )
             self.executor = _RuntimeState.executor
             self.executor.compile()
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 979cb2952..e782a99fe 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -43,9 +43,11 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
 void register_executor(py::module &m) {
     py::class_<ark::Executor>(m, "_Executor")
         .def(py::init([](int device_id, uintptr_t stream,
-                         const std::string &name, const std::string &plan) {
-            return new ark::Executor(
-                device_id, reinterpret_cast<ark::Stream>(stream), name, plan);
+                         const std::string &name, const std::string &plan,
+                         bool loop_mode) {
+            return new ark::Executor(device_id,
+                                     reinterpret_cast<ark::Stream>(stream),
+                                     name, plan, loop_mode);
         }))
         .def("device_id", &ark::Executor::device_id)
         .def("stream",

From 55755bbe2e2fbc36195f7786280689bde3170ec2 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 14 Jul 2024 14:19:35 -0700
Subject: [PATCH 37/79] do not force noinline

---
 ark/codegen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index cd6206284..0d4b14a09 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -213,7 +213,7 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
     for (auto &op_json : task_json["Ops"]) {
         ss << this->def_op(op_json, task_json["Id"], op_idx++);
     }
-    ss << "__noinline__ __device__ void t" << task_json["Id"]
+    ss << "__device__ void t" << task_json["Id"]
        << "(char* _buf, int _idx, int _spw) {\n";
     op_idx = 0;
     for (auto &op_json : task_json["Ops"]) {

From b29eaaefb5b969a8e0ec8b8e3813e5e3245e7825 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 14 Jul 2024 21:25:20 +0000
Subject: [PATCH 38/79] wip

---
 arkprof.py                         |  4 +++-
 python/ark/profiler.py             | 10 +++++-----
 python/ark/runtime.py              | 11 +++++++++--
 python/ark/tensor.py               | 18 ++++++++++++------
 python/unittest/unittest_common.py |  8 +++++++-
 5 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/arkprof.py b/arkprof.py
index 782bba560..9e67c2dfc 100644
--- a/arkprof.py
+++ b/arkprof.py
@@ -1,4 +1,6 @@
 import ark
 import sys
 
-ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(iter=1000, profile_processor_groups=False)
+ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(
+    iter=1000, profile_processor_groups=False
+)
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
index 56233247c..c161b24e6 100644
--- a/python/ark/profiler.py
+++ b/python/ark/profiler.py
@@ -8,9 +8,9 @@
 from .planner import Plan
 
 
-def timeit(plan: Plan, iter: int):
+def timeit(plan: Plan, iter: int, loop_mode: bool):
     with Runtime() as rt:
-        rt.launch(plan=plan)
+        rt.launch(plan=plan, loop_mode=loop_mode)
         start_time = time.time()
         rt.run(iter=iter)
         end_time = time.time()
@@ -21,8 +21,8 @@ class Profiler:
     def __init__(self, plan: Plan):
         self.plan = plan
 
-    def run(self, iter: int = 1000, profile_processor_groups: bool = False):
-        sys.stderr.write(f"End-to-end: {timeit(self.plan, iter):.6f} seconds/iter\n")
+    def run(self, iter: int = 1000, loop_mode: bool = True, profile_processor_groups: bool = False):
+        sys.stderr.write(f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n")
 
         if not profile_processor_groups:
             return
@@ -38,7 +38,7 @@ def run(self, iter: int = 1000, profile_processor_groups: bool = False):
         }
         for i in range(num_processor_groups):
             new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i]
-            lat_per_iter = timeit(Plan(new_plan), iter)
+            lat_per_iter = timeit(Plan(new_plan), iter, loop_mode)
             sys.stderr.write(
                 f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n"
             )
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index b3dbe7887..51a5b7905 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -48,8 +48,15 @@ def print_runtime_states():
 
 
 class Executor(_Executor):
-    def __init__(self, device_id: int, stream: int, name: str, plan: Plan):
-        super().__init__(device_id, stream, name, str(plan))
+    def __init__(
+        self,
+        device_id: int,
+        stream: int,
+        name: str,
+        plan: Plan,
+        loop_mode: bool = True,
+    ):
+        super().__init__(device_id, stream, name, str(plan), loop_mode)
 
 
 class Runtime:
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 335020769..657da1065 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -121,16 +121,22 @@ def to_torch(
                 self.shape(), dtype=torch_type, device=torch.device(dev_name)
             )
         elif list(tensor.shape) != self.shape():
-            raise ValueError(f"torch tensor shape {list(tensor.shape)} "
-                             f"does not match the tensor {self.shape()}")
+            raise ValueError(
+                f"torch tensor shape {list(tensor.shape)} "
+                f"does not match the tensor {self.shape()}"
+            )
         elif tensor.dtype != torch_type:
-            raise ValueError(f"torch tensor dtype {tensor.dtype} "
-                             f"does not match the tensor {torch_type}")
+            raise ValueError(
+                f"torch tensor dtype {tensor.dtype} "
+                f"does not match the tensor {torch_type}"
+            )
         elif not tensor.is_contiguous():
             raise ValueError("torch tensor is not contiguous in memory")
         elif tensor.numel() != self.nelems():
-            raise ValueError(f"torch tensor size {tensor.numel()} "
-                             f"does not match the tensor {self.nelems()}")
+            raise ValueError(
+                f"torch tensor size {tensor.numel()} "
+                f"does not match the tensor {self.nelems()}"
+            )
         tensor_bytes = self.nelems() * self.dtype().element_size()
         rt.executor.tensor_read(
             self._tensor, tensor.data_ptr(), tensor_bytes, stream, True
diff --git a/python/unittest/unittest_common.py b/python/unittest/unittest_common.py
index 9548410b5..0c385e89a 100644
--- a/python/unittest/unittest_common.py
+++ b/python/unittest/unittest_common.py
@@ -9,14 +9,20 @@ def pytest_ark(need_torch: bool = False):
     """
     Decorator for ARK unit tests.
     """
+
     def decorator(test_func):
         if need_torch:
             try:
                 import torch
             except ImportError:
-                return pytest.mark.skip(reason="torch is not installed")(test_func)
+                return pytest.mark.skip(reason="torch is not installed")(
+                    test_func
+                )
+
         def wrapper(*args, **kwargs):
             ark.init()
             test_func(*args, **kwargs)
+
         return wrapper
+
     return decorator

From a7a5d46c001b143781022e2d28aaa3eee0c502b3 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 14 Jul 2024 23:56:21 +0000
Subject: [PATCH 39/79] Fix CK tile indexing

---
 third_party/patches/composable_kernel.patch | 89 +++++++++++++++++++--
 1 file changed, 83 insertions(+), 6 deletions(-)

diff --git a/third_party/patches/composable_kernel.patch b/third_party/patches/composable_kernel.patch
index 43b1afcaa..e12f19332 100644
--- a/third_party/patches/composable_kernel.patch
+++ b/third_party/patches/composable_kernel.patch
@@ -561,7 +561,7 @@ index 2d5dc90bf..160eef036 100644
          });
 
 diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
-index 7bb47e9d3..2b2e8c604 100644
+index 7bb47e9d3..d495c7297 100644
 --- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
 +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
 @@ -60,7 +60,7 @@ struct BlockToCTileMap_M00_N0_M01
@@ -582,7 +582,84 @@ index 7bb47e9d3..2b2e8c604 100644
      {
          return true;
      }
-@@ -315,7 +315,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
+@@ -177,58 +177,7 @@ struct BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock, void>
+         index_t idx_N0 = block_1d_id % N0;
+         index_t idx_M0 = block_1d_id / N0;
+
+-        const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_;
+-
+-        index_t idx_M00          = idx_M0 / M01_;
+-        index_t idx_M01          = idx_M0 % M01_;
+-        index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;
+-
+-        /**
+-         *                        idxN0
+-         *
+-         *           |<               mtx   N                 >|
+-         *
+-         *             NPerBlock   NPerBlock   NPerBlock   NPerBlock
+-         *                N_0         N_1        N_2         N_3
+-         *       -   |-----------|-----------|-----------|-----|-----|-
+-         *       ^   | -   -  0  |/---->  2  |           |     |     |
+-         *           | |   |     /     |     |           |     |     |  M_0  MPerBlock
+-         *           | M   |    /|     |     |           |     |     |
+-         *           |-0---|---/-|-----|-----|-----------|-----|-----|-
+-         *           | 1   |  /  |     |     |  blockid  |     |     |
+-         * idxM0     | |   | /   |     V     |     5     |     |     |  M_1  MPerBlock
+-         *           | -   V   1 |     -  3  |           |     |     |
+-         *           |-----------|-----------|-----------|-----|-----|-
+-         *    mtx M  |           |           |           |     |     |
+-         *           |           |           |           |     |     |  M_2  MPerBlock
+-         *           |           |           |           |     |     |
+-         *           |-----------|-----------|-----------|-----|-----|-
+-         *           |           |           |           |     |     |
+-         *           |           |           |           |     |     |  M_3  MPerBlock
+-         *           |           |           |           |     |     |
+-         *           |-----------|-----------|-----------|-----|-----|-
+-         *       V   |           |           |           |     |     |
+-         *       -   |-----------|-----------|-----------|-----|-----|- M_4  MPerBlock
+-         *           |           |           |           |     |     |
+-         *           |-----------|-----------|-----------|-----|-----|-
+-         *  Example:
+-         *   assume:
+-         *      M0 = 5
+-         *      N0 = 4
+-         *      block_1d_id = 5
+-         *      M01 = 2
+-         *
+-         *   idx_N0 = 1
+-         *   idx_M0 = 1
+-         *   M01_adapt = 2
+-         *   idx_M00 = 0
+-         *   idx_M01 = 1
+-         *   idx_N0_M01_local = 5
+-         *   output {1, 2}
+-         */
+-
+-        return make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
+-                          idx_N0_M01_local / M01_adapt);
++        return make_tuple(idx_M0, idx_N0);
+     }
+
+     template <typename CTileIdx, typename CTileDim>
+@@ -297,15 +246,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
+         index_t idx_N0 = block_1d_id % N0;
+         index_t idx_M0 = block_1d_id / N0;
+
+-        const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_;
+-
+-        index_t idx_M00          = idx_M0 / M01_;
+-        index_t idx_M01          = idx_M0 % M01_;
+-        index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0;
+-
+-        return make_tuple(idx_ksplit,
+-                          idx_N0_M01_local % M01_adapt + idx_M00 * M01_,
+-                          idx_N0_M01_local / M01_adapt);
++        return make_tuple(idx_ksplit, idx_M0, idx_N0);
+     }
+
+     template <typename CTileIdx, typename CTileDim>
+@@ -315,7 +256,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt
          return true; // always valid provided that user gets grid size from CalculateGridSize()
      }
 
@@ -591,7 +668,7 @@ index 7bb47e9d3..2b2e8c604 100644
 
      private:
      index_t M01_;
-@@ -373,7 +373,7 @@ struct BlockToCTileMap_M00_N00_M01_N01
+@@ -373,7 +314,7 @@ struct BlockToCTileMap_M00_N00_M01_N01
              return true;
      }
 
@@ -600,7 +677,7 @@ index 7bb47e9d3..2b2e8c604 100644
      {
          if constexpr(DeviceCTileIndexCheck)
              return true; // validity check moved to kernel
-@@ -485,7 +485,7 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01
+@@ -485,7 +426,7 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01
              return true;
      }
 
@@ -609,7 +686,7 @@ index 7bb47e9d3..2b2e8c604 100644
      {
          if constexpr(DeviceCTileIndexCheck)
              return true; // validity check moved to kernel
-@@ -609,7 +609,7 @@ struct OffsettedBlockToCTileMap
+@@ -609,7 +550,7 @@ struct OffsettedBlockToCTileMap
      }
 
      template <typename CGridDesc_M_N>
@@ -618,7 +695,7 @@ index 7bb47e9d3..2b2e8c604 100644
      {
          return block_to_ctile_map_.CheckValidity(c_grid_desc_m_n);
      }
-@@ -666,7 +666,7 @@ struct BlockToCTileMap_3DGrid_KSplit
+@@ -666,7 +607,7 @@ struct BlockToCTileMap_3DGrid_KSplit
      }
 
      template <typename CGridDesc_M_N>

From 9c19a5ec8543863d159c96f05c007b63943c2566 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 29 Jul 2024 02:56:23 +0000
Subject: [PATCH 40/79] wip

---
 .vscode/settings.json                 |   2 -
 ark/api/context_manager.cpp           |  42 +++++++++
 ark/api/context_manager_test.cpp      |  54 +++++++++++
 ark/api/model.cpp                     |   4 +-
 ark/api/model_graph.cpp               |   4 +-
 ark/api/model_test.cpp                |  24 ++---
 ark/api/planner.cpp                   |   4 +-
 ark/include/ark.hpp                   |   1 +
 ark/include/ark/context_manager.hpp   |  24 +++++
 ark/include/ark/model.hpp             |  64 +++++++------
 ark/include/ark/model_graph.hpp       |   3 +-
 ark/model/model_graph_impl.cpp        |  40 ++++++++-
 ark/model/model_graph_impl.hpp        |  36 +++++++-
 ark/model/model_node.hpp              |   3 +
 ark/model/model_op.cpp                |  11 +++
 ark/model/model_op.hpp                |   9 +-
 ark/ops/ops_arithmetic.cpp            |  20 +++--
 ark/ops/ops_cast.cpp                  |  10 +--
 ark/ops/ops_communication.cpp         |  14 +--
 ark/ops/ops_copy.cpp                  |   5 +-
 ark/ops/ops_embedding.cpp             |   4 +-
 ark/ops/ops_identity.cpp              |   2 +-
 ark/ops/ops_math.cpp                  |  31 ++++---
 ark/ops/ops_matmul.cpp                |   6 +-
 ark/ops/ops_noop.cpp                  |   2 +-
 ark/ops/ops_reduce.cpp                |  12 +--
 ark/ops/ops_refer.cpp                 |   2 +-
 ark/ops/ops_reshape.cpp               |   4 +-
 ark/ops/ops_rope.cpp                  |   5 +-
 ark/ops/ops_scalar.cpp                |  31 ++++---
 ark/ops/ops_tensor.cpp                |   2 +-
 ark/ops/ops_transpose.cpp             |   5 +-
 arkprof.py                            |   1 +
 examples/tutorial/context_tutorial.py | 117 ++++++++++++++++++++++++
 python/ark/__init__.py                |   2 +-
 python/ark/context_manager.py         |  24 +++++
 python/ark/ops.py                     | 125 ++++++++++++++++++++------
 python/ark/profiler.py                |  11 ++-
 python/ark_py.cpp                     |   2 +
 python/context_manager_py.cpp         |  15 ++++
 python/model_py.cpp                   |  86 ++++++++++--------
 41 files changed, 676 insertions(+), 187 deletions(-)
 create mode 100644 ark/api/context_manager.cpp
 create mode 100644 ark/api/context_manager_test.cpp
 create mode 100644 ark/include/ark/context_manager.hpp
 create mode 100644 examples/tutorial/context_tutorial.py
 create mode 100644 python/ark/context_manager.py
 create mode 100644 python/context_manager_py.cpp

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 640196a66..00260f078 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,8 +3,6 @@
     "cmake.environment": {
         "ARK_ROOT": "${workspaceFolder}/build",
         "ARK_IGNORE_BINARY_CACHE": "1",
-        "ARK_DISABLE_GRAPH_OPT": "0",
-        "ARK_IPC_LISTEN_PORT_BASE": "42000",
         // "ARK_LOG_LEVEL": "DEBUG"
     },
     "cmake.ctestArgs": [
diff --git a/ark/api/context_manager.cpp b/ark/api/context_manager.cpp
new file mode 100644
index 000000000..6d16d9e79
--- /dev/null
+++ b/ark/api/context_manager.cpp
@@ -0,0 +1,42 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/context_manager.hpp"
+
+#include "model/model_graph_impl.hpp"
+
+namespace ark {
+
+class ContextManager::Impl {
+   public:
+    Impl(std::shared_ptr<ModelGraphContextStack> context_stack,
+         const std::map<std::string, std::string>& context_map);
+
+    ~Impl();
+
+   private:
+    std::shared_ptr<ModelGraphContextStack> context_stack_;
+    std::vector<std::string> keys_;
+};
+
+ContextManager::Impl::Impl(
+    std::shared_ptr<ModelGraphContextStack> context_stack,
+    const std::map<std::string, std::string>& context_map)
+    : context_stack_(context_stack) {
+    for (const auto& [key, value] : context_map) {
+        context_stack_->push(key, value);
+        keys_.push_back(key);
+    }
+}
+
+ContextManager::Impl::~Impl() {
+    for (auto it = keys_.rbegin(); it != keys_.rend(); ++it) {
+        context_stack_->pop(*it);
+    }
+}
+
+ContextManager::ContextManager(
+    Model& model, const std::map<std::string, std::string>& context_map)
+    : impl_(std::make_shared<Impl>(model.impl_->context_stack_, context_map)) {}
+
+}  // namespace ark
diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp
new file mode 100644
index 000000000..ff60b43bf
--- /dev/null
+++ b/ark/api/context_manager_test.cpp
@@ -0,0 +1,54 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/model.hpp"
+#include "ark/context_manager.hpp"
+
+#include "model/model_node.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_context_manager() {
+    ark::Model model;
+    ark::Tensor t0 = model.tensor({1}, ark::FP32);
+    ark::Tensor t1 = model.tensor({1}, ark::FP32);
+    ark::Tensor t2 = model.add(t0, t1);
+
+    ark::Tensor t3;
+    ark::Tensor t4;
+    ark::Tensor t5;
+    {
+        ark::ContextManager cm0_1(model, {{"key0", "val1"}});
+        t3 = model.relu(t2);
+
+        ark::ContextManager cm1_1(model, {{"key1", "val2"}});
+        t4 = model.sqrt(t3);
+    }
+    {
+        ark::ContextManager cm0_2(model, {{"key0", "val3"}});
+        t5 = model.exp(t2);
+    }
+
+    UNITTEST_TRUE(model.verify());
+
+    auto compressed = model.compress(false);
+    UNITTEST_TRUE(compressed.verify());
+
+    auto nodes = compressed.nodes();
+    UNITTEST_EQ(nodes.size(), 4);
+
+    UNITTEST_EQ(nodes[0]->context.size(), 0);
+    UNITTEST_EQ(nodes[1]->context.size(), 1);
+    UNITTEST_EQ(nodes[1]->context.at("key0"), "val1");
+    UNITTEST_EQ(nodes[2]->context.size(), 2);
+    UNITTEST_EQ(nodes[2]->context.at("key0"), "val1");
+    UNITTEST_EQ(nodes[2]->context.at("key1"), "val2");
+    UNITTEST_EQ(nodes[3]->context.size(), 1);
+    UNITTEST_EQ(nodes[3]->context.at("key0"), "val3");
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_context_manager);
+    return 0;
+}
diff --git a/ark/api/model.cpp b/ark/api/model.cpp
index ab536a33c..a5a258f71 100644
--- a/ark/api/model.cpp
+++ b/ark/api/model.cpp
@@ -9,9 +9,9 @@
 
 namespace ark {
 
-Model Model::compress() const {
+Model Model::compress(bool merge_nodes) const {
     Model model(*this);
-    model.compress_nodes();
+    model.compress_nodes(merge_nodes);
     return model;
 }
 
diff --git a/ark/api/model_graph.cpp b/ark/api/model_graph.cpp
index b6061a34e..d11808467 100644
--- a/ark/api/model_graph.cpp
+++ b/ark/api/model_graph.cpp
@@ -33,7 +33,9 @@ int ModelGraph::rank() const { return impl_->rank(); }
 
 int ModelGraph::world_size() const { return impl_->world_size(); }
 
-void ModelGraph::compress_nodes() { impl_->compress_nodes(); }
+void ModelGraph::compress_nodes(bool merge_nodes) {
+    impl_->compress_nodes(merge_nodes);
+}
 
 bool ModelGraph::compressed() const { return impl_->compressed(); }
 
diff --git a/ark/api/model_test.cpp b/ark/api/model_test.cpp
index a9d332a97..785bfcd7b 100644
--- a/ark/api/model_test.cpp
+++ b/ark/api/model_test.cpp
@@ -36,7 +36,7 @@ ark::unittest::State test_model_basics() {
     //   (AddOp,)
     //
 
-    compressed = model.compress();
+    compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
     UNITTEST_TRUE(compressed.compressed());
     UNITTEST_EQ(compressed.nodes().size(), 1);
@@ -70,7 +70,7 @@ ark::unittest::State test_model_basics() {
     //   (AddOp,AddOp,)
     //
 
-    compressed = model.compress();
+    compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
     UNITTEST_EQ(compressed.nodes().size(), 1);
 
@@ -104,7 +104,7 @@ ark::unittest::State test_model_basics() {
     //   (AddOp,AddOp,ReluOp,)
     //
 
-    compressed = model.compress();
+    compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
     UNITTEST_EQ(compressed.nodes().size(), 1);
 
@@ -143,7 +143,7 @@ ark::unittest::State test_model_basics() {
     //   (AddOp,AddOp,ReluOp,AddOp,)
     //
 
-    compressed = model.compress();
+    compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
 
     auto nodes = compressed.nodes();
@@ -190,7 +190,7 @@ ark::unittest::State test_model_basics() {
     //                      (AddOp,) --+--> (AddOp,)
     //
 
-    compressed = model.compress();
+    compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
 
     nodes = compressed.nodes();
@@ -250,7 +250,7 @@ ark::unittest::State test_model_basics() {
     //                                      (AddOp,)
     //
 
-    compressed = model.compress();
+    compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
 
     nodes = compressed.nodes();
@@ -312,7 +312,7 @@ ark::unittest::State test_model_basics() {
     //                                      (AddOp,)
     //
 
-    compressed = model.compress();
+    compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
 
     nodes = compressed.nodes();
@@ -353,7 +353,7 @@ ark::unittest::State test_model_dependent_inputs() {
     ark::Tensor x4 = m.mul(x2, x3);
     ark::Tensor y = m.add(x0, x4);
 
-    auto compressed = m.compress();
+    auto compressed = m.compress(true);
     auto nodes = compressed.nodes();
     UNITTEST_EQ(nodes.size(), 4);
     auto nodes_iter = nodes.begin();
@@ -399,7 +399,7 @@ ark::unittest::State test_model_noop() {
 
     UNITTEST_TRUE(model.verify());
 
-    auto compressed = model.compress();
+    auto compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
     UNITTEST_EQ(compressed.nodes().size(), 0);
     return ark::unittest::SUCCESS;
@@ -425,7 +425,7 @@ ark::unittest::State test_model_identity() {
     ark::Tensor t4 = model.relu(t3);
     UNITTEST_TRUE(model.verify());
 
-    auto compressed = model.compress();
+    auto compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
     auto nodes = compressed.nodes();
     UNITTEST_EQ(nodes.size(), 3);
@@ -478,7 +478,7 @@ ark::unittest::State test_model_sharding() {
     ark::Tensor t5 = model.relu(t4);
     UNITTEST_TRUE(model.verify());
 
-    auto compressed = model.compress();
+    auto compressed = model.compress(true);
     UNITTEST_TRUE(compressed.verify());
     auto nodes = compressed.nodes();
     UNITTEST_EQ(nodes.size(), 4);
@@ -526,7 +526,7 @@ ark::unittest::State test_model_cumulate() {
 
     UNITTEST_TRUE(model.verify());
 
-    auto compressed = model.compress();
+    auto compressed = model.compress(true);
     auto nodes = compressed.nodes();
     UNITTEST_EQ(nodes.size(), 5);
 
diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index f4e7fa8ee..dba149a1e 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -69,7 +69,9 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
             task_info["Id"] = next_node_id++;
 
             Json config;
-            if (!config_rules_.empty()) {
+            if (!op->config().empty()) {
+                config = op->config();
+            } else if (!config_rules_.empty()) {
                 const std::string op_str = op->serialize().dump();
                 for (auto &rule : config_rules_) {
                     auto config_str = rule(op_str, gpu_info.arch->name());
diff --git a/ark/include/ark.hpp b/ark/include/ark.hpp
index 2ca796172..e76687bce 100644
--- a/ark/include/ark.hpp
+++ b/ark/include/ark.hpp
@@ -8,6 +8,7 @@
 #include <ark/version.hpp>
 // clang-format on
 
+#include <ark/context_manager.hpp>
 #include <ark/data_type.hpp>
 #include <ark/dims.hpp>
 #include <ark/error.hpp>
diff --git a/ark/include/ark/context_manager.hpp b/ark/include/ark/context_manager.hpp
new file mode 100644
index 000000000..58271ea8c
--- /dev/null
+++ b/ark/include/ark/context_manager.hpp
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_CONTEXT_MANAGER_HPP
+#define ARK_CONTEXT_MANAGER_HPP
+
+#include <ark/model.hpp>
+#include <map>
+
+namespace ark {
+
+class ContextManager {
+   public:
+    ContextManager(Model& model,
+                   const std::map<std::string, std::string>& context_map);
+
+   private:
+    class Impl;
+    std::shared_ptr<Impl> impl_;
+};
+
+}  // namespace ark
+
+#endif  // ARK_CONTEXT_MANAGER_HPP
diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp
index 66551a037..35efe53d5 100644
--- a/ark/include/ark/model.hpp
+++ b/ark/include/ark/model.hpp
@@ -26,7 +26,7 @@ class Model : public ModelGraph {
 
     Model &operator=(const Model &other) = default;
 
-    Model compress() const;
+    Model compress(bool merge_nodes = false) const;
 
     int unique_tag();
 
@@ -87,23 +87,29 @@ class Model : public ModelGraph {
     // result in `output`.
     // Currently, only reduction along the last dimension is supported.
     Tensor reduce_sum(Tensor input, int axis, bool keepdims = true,
-                      Tensor output = NullTensor, const std::string &name = "");
+                      Tensor output = NullTensor,
+                      const std::string &config = "",
+                      const std::string &name = "");
     Tensor reduce_mean(Tensor input, int axis, bool keepdims = true,
                        Tensor output = NullTensor,
+                       const std::string &config = "",
                        const std::string &name = "");
     Tensor reduce_max(Tensor input, int axis, bool keepdims = true,
-                      Tensor output = NullTensor, const std::string &name = "");
+                      Tensor output = NullTensor,
+                      const std::string &config = "",
+                      const std::string &name = "");
 
     // Transposes the `input` tensor according to the given `permutation`.
     // For example, transpose(input, {0, 1 ,3, 2}) will swap the last two
     // dimensions of the input tensor. Currently, only 4D tensors are supported.
     Tensor transpose(Tensor input, const std::vector<int64_t> &permutation,
-                     Tensor output = NullTensor, const std::string &name = "");
+                     Tensor output = NullTensor, const std::string &config = "",
+                     const std::string &name = "");
     // Performs matrix multiplication between the `input` tensor and another
     // `other` tensor, storing the result in `output`.
     Tensor matmul(Tensor input, Tensor other, Tensor output = NullTensor,
                   bool trans_input = false, bool trans_other = false,
-                  const std::string &name = "");
+                  const std::string &config = "", const std::string &name = "");
     // Implements the 'im2col' method for 2D convolution layers, which takes an
     // `input` tensor and reshapes it to a 2D matrix by extracting image patches
     // from the input tensor based on the provided parameters.
@@ -120,72 +126,76 @@ class Model : public ModelGraph {
                     Tensor output = NullTensor, const std::string &name = "");
     // Calculates the exponential of the `input` tensor, element-wise.
     Tensor exp(Tensor input, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
     // Calculates the square root of the `input` tensor, element-wise.
     Tensor sqrt(Tensor input, Tensor output = NullTensor,
-                const std::string &name = "");
+                const std::string &config = "", const std::string &name = "");
     // Calculates the reverse square root of the `input` tensor, element-wise.
     Tensor rsqrt(Tensor input, Tensor output = NullTensor,
-                 const std::string &name = "");
+                 const std::string &config = "", const std::string &name = "");
     // ReLU activation
     Tensor relu(Tensor input, Tensor output = NullTensor,
-                const std::string &name = "");
+                const std::string &config = "", const std::string &name = "");
     // Copy the `input` tensor to `output` tensor
     Tensor copy(Tensor input, Tensor output = NullTensor,
-                const std::string &name = "");
+                const std::string &config = "", const std::string &name = "");
     Tensor copy(float val, Tensor output = NullTensor,
-                const std::string &name = "");
+                const std::string &config = "", const std::string &name = "");
     // Applies the Gaussian Error Linear Unit (GELU) activation function to the
     // `input` tensor, element-wise. GELU is a smooth approximation of the
     // rectifier function and is widely used in deep learning models.
     Tensor gelu(Tensor input, Tensor output = NullTensor,
-                const std::string &name = "");
+                const std::string &config = "", const std::string &name = "");
     // Sigmoid activation
     Tensor sigmoid(Tensor input, Tensor output = NullTensor,
+                   const std::string &config = "",
                    const std::string &name = "");
     // Performs rotary position embedding (RoPE) on the `input` tensor
     Tensor rope(Tensor input, Tensor other, Tensor output = NullTensor,
-                const std::string &name = "");
+                const std::string &config = "", const std::string &name = "");
 
     // Performs an element-wise addition operator between the `input` tensor
     // and the `other` tensor
     Tensor add(Tensor input, Tensor other, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
     Tensor add(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
     // Performs an element-wise subtraction operator between the `input` tensor
     // and the `other` tensor
     Tensor sub(Tensor input, Tensor other, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
     Tensor sub(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
     // Performs an element-wise multiplication operator between the `input`
     // tensor and the `other` tensor,
     Tensor mul(Tensor input, Tensor other, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
     Tensor mul(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
     // Performs an element-wise division operator between the `input`
     // tensor and the `other` tensor,
     Tensor div(Tensor input, Tensor other, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
     Tensor div(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
 
     Tensor send(Tensor input, int remote_rank, int tag,
-                Tensor output = NullTensor, const std::string &name = "");
+                Tensor output = NullTensor, const std::string &config = "",
+                const std::string &name = "");
     // Blocks the execution until the corresponding 'send' operator with the
     // specified `id` is completed.
-    Tensor send_done(Tensor input, const std::string &name = "");
+    Tensor send_done(Tensor input, const std::string &config = "",
+                     const std::string &name = "");
     // Receives a tensor from a source rank (@p src_rank), identified by the
     // `id` parameter. Blocks the execution until the corresponding 'recv'
     // operator is completed.
     Tensor recv(Tensor output, int remote_rank, int tag,
-                const std::string &name = "");
+                const std::string &config = "", const std::string &name = "");
     //
     Tensor put_packet(Tensor input, Tensor local_tmp_buf, Tensor recv_buf,
                       int id, int rank, int dst_rank, size_t dst_offset,
-                      int flag, const std::string &name = "");
+                      int flag, const std::string &config = "",
+                      const std::string &name = "");
     // Performs an all-reduce operator across all ranks, aggregating the input
     // tensors. Takes the `input` tensor, the current GPU's rank, and the
     // total number of ranks `rank_num`.
@@ -200,10 +210,12 @@ class Model : public ModelGraph {
                                    const std::string &name = "");
     /// Embedding layer.
     Tensor embedding(Tensor input, Tensor weight, Tensor output = NullTensor,
+                     const std::string &config = "",
                      const std::string &name = "");
     /// Tensor type casting.
     Tensor cast(Tensor input, const DataType &data_type,
-                Tensor output = NullTensor, const std::string &name = "");
+                Tensor output = NullTensor, const std::string &config = "",
+                const std::string &name = "");
 
     // sync across multi devices
     Tensor device_sync(Tensor input, int npeers, const std::string &name = "");
diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp
index bd7c59033..f6390a2a9 100644
--- a/ark/include/ark/model_graph.hpp
+++ b/ark/include/ark/model_graph.hpp
@@ -25,7 +25,7 @@ class ModelGraph {
 
     int world_size() const;
 
-    void compress_nodes();
+    void compress_nodes(bool merge_nodes = false);
 
     bool compressed() const;
 
@@ -38,6 +38,7 @@ class ModelGraph {
 
    protected:
     friend class Model;
+    friend class ContextManager;
 
     class Impl;
     std::unique_ptr<Impl> impl_;
diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp
index 17410d23f..53a7fa851 100644
--- a/ark/model/model_graph_impl.cpp
+++ b/ark/model/model_graph_impl.cpp
@@ -17,6 +17,36 @@
 
 namespace ark {
 
+ModelGraphContextStack::ModelGraphContextStack(const ModelGraphContextStack &other) {
+    for (const auto &pair : other.storage_) {
+        for (const auto &value : pair.second) {
+            this->storage_[pair.first].push_back(value);
+        }
+    }
+}
+
+void ModelGraphContextStack::push(const std::string &key, const std::string &value) {
+    this->storage_[key].push_back(std::make_shared<std::string>(value));
+}
+
+void ModelGraphContextStack::pop(const std::string &key) {
+    auto it = this->storage_.find(key);
+    if (it == this->storage_.end() || it->second.empty()) {
+        ERR(ModelError, "context stack is empty");
+    }
+    it->second.pop_back();
+}
+
+std::map<std::string, std::string> ModelGraphContextStack::current_context() const {
+    std::map<std::string, std::string> cur;
+    for (const auto &pair : this->storage_) {
+        if (!pair.second.empty()) {
+            cur[pair.first] = *pair.second.back();
+        }
+    }
+    return cur;
+}
+
 ModelGraph::Impl::Impl(const ModelGraph::Impl &other) { *this = other; }
 
 ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) {
@@ -25,6 +55,7 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) {
     for (const auto &node : other.nodes_) {
         ModelNodeRef new_node = std::make_shared<ModelNode>();
         new_node->ops = node->ops;
+        new_node->context = node->context;
         node_map.emplace(node, new_node);
         nodes_.push_back(new_node);
     }
@@ -61,13 +92,16 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) {
     rank_ = other.rank_;
     world_size_ = other.world_size_;
     compressed_ = other.compressed_;
+    context_stack_ = std::make_shared<ModelGraphContextStack>(*(other.context_stack_));
     return *this;
 }
 
-void ModelGraph::Impl::compress_nodes() {
+void ModelGraph::Impl::compress_nodes(bool merge_nodes) {
     if (!compressed_) {
         this->recursive_remove_virtual_nodes();
-        this->recursive_merge_nodes();
+        if (merge_nodes) {
+            this->recursive_merge_nodes();
+        }
         compressed_ = true;
     }
 }
@@ -171,6 +205,8 @@ ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) {
         producer->consumers.push_back(node);
     }
 
+    node->context = context_stack_->current_context();
+
     nodes_.push_back(node);
     return node;
 }
diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp
index 6c109b51e..fbfc54c7e 100644
--- a/ark/model/model_graph_impl.hpp
+++ b/ark/model/model_graph_impl.hpp
@@ -4,6 +4,7 @@
 #ifndef ARK_MODEL_GRAPH_IMPL_HPP_
 #define ARK_MODEL_GRAPH_IMPL_HPP_
 
+#include <list>
 #include <map>
 #include <set>
 #include <tuple>
@@ -18,17 +19,39 @@
 
 namespace ark {
 
+class ModelGraphContextStack {
+   private:
+    std::map<std::string, std::list<std::shared_ptr<std::string>>> storage_;
+
+   public:
+    ModelGraphContextStack() = default;
+
+    ModelGraphContextStack(const ModelGraphContextStack &other);
+
+    ~ModelGraphContextStack() = default;
+
+    void push(const std::string &key, const std::string &value);
+
+    void pop(const std::string &key);
+
+    std::map<std::string, std::string> current_context() const;
+};
+
 class ModelGraph::Impl {
    public:
     Impl(int rank, int world_size)
-        : rank_(rank), world_size_(world_size), compressed_(false){};
+        : rank_(rank),
+          world_size_(world_size),
+          compressed_(false),
+          context_stack_(std::make_shared<ModelGraphContextStack>()) {};
 
     Impl(const Impl &other);
 
     Impl &operator=(const Impl &other);
 
     template <typename T, typename... Args>
-    ModelOpRef create_op(const std::string &name, Args &&... args) {
+    ModelOpRef create_op(const std::string &config, const std::string &name,
+                         Args &&...args) {
         ModelOpRef op = std::make_shared<T>(std::forward<Args>(args)...);
         std::string name_copy;
         if (name.empty()) {
@@ -41,6 +64,7 @@ class ModelGraph::Impl {
         if (count > 0) {
             name_copy += "_" + std::to_string(count);
         }
+        op->set_config(config);
         op->set_name(name_copy);
         add_op(op);
         return op;
@@ -50,7 +74,7 @@ class ModelGraph::Impl {
 
     int world_size() const { return world_size_; }
 
-    void compress_nodes();
+    void compress_nodes(bool merge_nodes = false);
 
     bool compressed() const { return compressed_; }
 
@@ -100,6 +124,12 @@ class ModelGraph::Impl {
 
     /// True if `compress_nodes` has been called.
     bool compressed_;
+
+   protected:
+    friend class ContextManager;
+
+    /// Graph context stack.
+    std::shared_ptr<ModelGraphContextStack> context_stack_;
 };
 
 }  // namespace ark
diff --git a/ark/model/model_node.hpp b/ark/model/model_node.hpp
index 7838ca120..c86b4d29a 100644
--- a/ark/model/model_node.hpp
+++ b/ark/model/model_node.hpp
@@ -26,6 +26,9 @@ class ModelNode {
 
     /// The list of @ref ModelNode that this @ref ModelNode depends on.
     UniqueList<ModelNodeRef> producers;
+
+    /// Graph context of this node.
+    std::map<std::string, std::string> context;
 };
 
 }  // namespace ark
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index b5a0645c8..e9689cdcb 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -87,6 +87,14 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) {
     return it->second;
 }
 
+void ModelOp::set_config(const std::string &config) {
+    if (!config.empty()) {
+        config_ = Json::parse(config);
+    } else {
+        config_.clear();
+    }
+}
+
 std::vector<ModelTensorRef> ModelOp::input_tensors() const {
     // input_tensors = read_tensors || write_tensors
     std::set<ModelTensorRef> input_tensors;
@@ -179,6 +187,9 @@ Json ModelOp::serialize() const {
     for (auto &arg : args_) {
         j["Args"][arg.first] = arg.second.serialize();
     }
+    if (!config_.empty()) {
+        j["Config"] = config_;
+    }
     return j;
 }
 
diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp
index e8c220258..091a9f163 100644
--- a/ark/model/model_op.hpp
+++ b/ark/model/model_op.hpp
@@ -50,8 +50,8 @@ class ModelOp {
         return "";
     }
 
-    virtual std::vector<ModelOpArg> impl_args([
-        [maybe_unused]] const Json &config) const {
+    virtual std::vector<ModelOpArg> impl_args(
+        [[maybe_unused]] const Json &config) const {
         return {};
     }
 
@@ -60,10 +60,14 @@ class ModelOp {
         return {{"NumTasks", 0}, {"NumWarps", 0}, {"SramBytes", 0}};
     }
 
+    void set_config(const std::string &config);
+
     void set_name(const std::string &name) { name_ = name; }
 
     ModelOpType type() const { return type_; }
 
+    const Json &config() const { return config_; }
+
     const std::string &name() const { return name_; }
 
     bool is_virtual() const { return is_virtual_; }
@@ -100,6 +104,7 @@ class ModelOp {
         const std::vector<std::string> &template_args = {});
 
     ModelOpType type_;
+    Json config_;
     std::string name_;
     bool is_virtual_;
     std::vector<ModelTensorRef> read_tensors_;
diff --git a/ark/ops/ops_arithmetic.cpp b/ark/ops/ops_arithmetic.cpp
index aeece0d77..ef85b5d22 100644
--- a/ark/ops/ops_arithmetic.cpp
+++ b/ark/ops/ops_arithmetic.cpp
@@ -12,9 +12,10 @@ ModelOpAdd::ModelOpAdd(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Add", input, other, output) {}
 
 Tensor Model::add(Tensor input, Tensor other, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpAdd>(name, input.ref_, other.ref_, output.ref_)
+        ->create_op<ModelOpAdd>(config, name, input.ref_, other.ref_,
+                                output.ref_)
         ->result_tensors()[0];
 }
 
@@ -23,9 +24,10 @@ ModelOpMul::ModelOpMul(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Mul", input, other, output) {}
 
 Tensor Model::mul(Tensor input, Tensor other, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpMul>(name, input.ref_, other.ref_, output.ref_)
+        ->create_op<ModelOpMul>(config, name, input.ref_, other.ref_,
+                                output.ref_)
         ->result_tensors()[0];
 }
 
@@ -34,9 +36,10 @@ ModelOpSub::ModelOpSub(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Sub", input, other, output) {}
 
 Tensor Model::sub(Tensor input, Tensor other, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpSub>(name, input.ref_, other.ref_, output.ref_)
+        ->create_op<ModelOpSub>(config, name, input.ref_, other.ref_,
+                                output.ref_)
         ->result_tensors()[0];
 }
 
@@ -45,9 +48,10 @@ ModelOpDiv::ModelOpDiv(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Div", input, other, output) {}
 
 Tensor Model::div(Tensor input, Tensor other, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpDiv>(name, input.ref_, other.ref_, output.ref_)
+        ->create_op<ModelOpDiv>(config, name, input.ref_, other.ref_,
+                                output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_cast.cpp b/ark/ops/ops_cast.cpp
index 9873c8367..e9527ad8c 100644
--- a/ark/ops/ops_cast.cpp
+++ b/ark/ops/ops_cast.cpp
@@ -105,7 +105,7 @@ ModelOpByteCast::ModelOpByteCast(ModelTensorRef input, ModelDataType data_type,
 }
 
 Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output,
-                   const std::string &name) {
+                   const std::string &config, const std::string &name) {
     check_null(input.ref());
     if (output.is_null()) {
         if (input.data_type() == data_type) {
@@ -119,14 +119,14 @@ Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output,
             byte_cast_helper(input.ref(), data_type.ref(), new_shape,
                              new_strides, new_offsets, new_padded_shape);
             return impl_
-                ->create_op<ModelOpByteCast>(name, input.ref(), data_type.ref(),
-                                             new_shape, new_strides,
-                                             new_offsets, new_padded_shape)
+                ->create_op<ModelOpByteCast>(
+                    config, name, input.ref(), data_type.ref(), new_shape,
+                    new_strides, new_offsets, new_padded_shape)
                 ->result_tensors()[0];
         }
     }
     return impl_
-        ->create_op<ModelOpCast>(name, input.ref(), data_type.ref(),
+        ->create_op<ModelOpCast>(config, name, input.ref(), data_type.ref(),
                                  output.ref())
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp
index e335f869e..4e76d2ede 100644
--- a/ark/ops/ops_communication.cpp
+++ b/ark/ops/ops_communication.cpp
@@ -157,23 +157,25 @@ Json ModelOpRecv::default_config([[maybe_unused]] const ArchRef arch) const {
 }
 
 Tensor Model::send(Tensor input, int remote_rank, int tag, Tensor output,
-                   const std::string &name) {
+                   const std::string &config, const std::string &name) {
     tags_.insert(tag);
     return impl_
-        ->create_op<ModelOpSend>(name, input.ref(), remote_rank, tag,
+        ->create_op<ModelOpSend>(config, name, input.ref(), remote_rank, tag,
                                  output.ref())
         ->result_tensors()[0];
 }
 
-Tensor Model::send_done(Tensor input, const std::string &name) {
-    return impl_->create_op<ModelOpSendDone>(name, input.ref())
+Tensor Model::send_done(Tensor input, const std::string &config,
+                        const std::string &name) {
+    return impl_->create_op<ModelOpSendDone>(config, name, input.ref())
         ->result_tensors()[0];
 }
 
 Tensor Model::recv(Tensor output, int remote_rank, int tag,
-                   const std::string &name) {
+                   const std::string &config, const std::string &name) {
     tags_.insert(tag);
-    return impl_->create_op<ModelOpRecv>(name, output.ref(), remote_rank, tag)
+    return impl_
+        ->create_op<ModelOpRecv>(config, name, output.ref(), remote_rank, tag)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_copy.cpp b/ark/ops/ops_copy.cpp
index 4f32966b8..4914c34a4 100644
--- a/ark/ops/ops_copy.cpp
+++ b/ark/ops/ops_copy.cpp
@@ -20,8 +20,9 @@ ModelOpCopy::ModelOpCopy(ModelTensorRef input, ModelTensorRef output)
     verify();
 }
 
-Tensor Model::copy(Tensor input, Tensor output, const std::string &name) {
-    return impl_->create_op<ModelOpCopy>(name, input.ref_, output.ref_)
+Tensor Model::copy(Tensor input, Tensor output, const std::string &config,
+                   const std::string &name) {
+    return impl_->create_op<ModelOpCopy>(config, name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp
index 542c0fcac..466b9a4e5 100644
--- a/ark/ops/ops_embedding.cpp
+++ b/ark/ops/ops_embedding.cpp
@@ -70,9 +70,9 @@ Json ModelOpEmbedding::default_config([
 }
 
 Tensor Model::embedding(Tensor input, Tensor weight, Tensor output,
-                        const std::string &name) {
+                        const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpEmbedding>(name, input.ref_, weight.ref_,
+        ->create_op<ModelOpEmbedding>(config, name, input.ref_, weight.ref_,
                                       output.ref_)
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_identity.cpp b/ark/ops/ops_identity.cpp
index 065cd9a52..dd398d8a5 100644
--- a/ark/ops/ops_identity.cpp
+++ b/ark/ops/ops_identity.cpp
@@ -31,7 +31,7 @@ Tensor Model::identity(Tensor input, const std::vector<Tensor> &deps,
     for (auto &dep : deps) {
         deps_ref.emplace_back(dep.ref_);
     }
-    return impl_->create_op<ModelOpIdentity>(name, input.ref_, deps_ref)
+    return impl_->create_op<ModelOpIdentity>("", name, input.ref_, deps_ref)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_math.cpp b/ark/ops/ops_math.cpp
index 1067c561a..b2833dcca 100644
--- a/ark/ops/ops_math.cpp
+++ b/ark/ops/ops_math.cpp
@@ -24,48 +24,55 @@ ModelOpMath::ModelOpMath(const std::string &type_name, ModelTensorRef input,
 ModelOpExp::ModelOpExp(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Exp", input, output) {}
 
-Tensor Model::exp(Tensor input, Tensor output, const std::string &name) {
-    return impl_->create_op<ModelOpExp>(name, input.ref_, output.ref_)
+Tensor Model::exp(Tensor input, Tensor output, const std::string &config,
+                  const std::string &name) {
+    return impl_->create_op<ModelOpExp>(config, name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpGelu::ModelOpGelu(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Gelu", input, output) {}
 
-Tensor Model::gelu(Tensor input, Tensor output, const std::string &name) {
-    return impl_->create_op<ModelOpGelu>(name, input.ref_, output.ref_)
+Tensor Model::gelu(Tensor input, Tensor output, const std::string &config,
+                   const std::string &name) {
+    return impl_->create_op<ModelOpGelu>(config, name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpRelu::ModelOpRelu(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Relu", input, output) {}
 
-Tensor Model::relu(Tensor input, Tensor output, const std::string &name) {
-    return impl_->create_op<ModelOpRelu>(name, input.ref_, output.ref_)
+Tensor Model::relu(Tensor input, Tensor output, const std::string &config,
+                   const std::string &name) {
+    return impl_->create_op<ModelOpRelu>(config, name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpRsqrt::ModelOpRsqrt(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Rsqrt", input, output) {}
 
-Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &name) {
-    return impl_->create_op<ModelOpRsqrt>(name, input.ref_, output.ref_)
+Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &config,
+                    const std::string &name) {
+    return impl_->create_op<ModelOpRsqrt>(config, name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpSigmoid::ModelOpSigmoid(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Sigmoid", input, output) {}
 
-Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &name) {
-    return impl_->create_op<ModelOpSigmoid>(name, input.ref_, output.ref_)
+Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &config,
+                      const std::string &name) {
+    return impl_
+        ->create_op<ModelOpSigmoid>(config, name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpSqrt::ModelOpSqrt(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Sqrt", input, output) {}
 
-Tensor Model::sqrt(Tensor input, Tensor output, const std::string &name) {
-    return impl_->create_op<ModelOpSqrt>(name, input.ref_, output.ref_)
+Tensor Model::sqrt(Tensor input, Tensor output, const std::string &config,
+                   const std::string &name) {
+    return impl_->create_op<ModelOpSqrt>(config, name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp
index a24b95d72..1976699a1 100644
--- a/ark/ops/ops_matmul.cpp
+++ b/ark/ops/ops_matmul.cpp
@@ -255,10 +255,10 @@ Json ModelOpMatmul::default_config(const ArchRef arch) const {
 
 Tensor Model::matmul(Tensor input, Tensor other, Tensor output,
                      bool trans_input, bool trans_other,
-                     const std::string &name) {
+                     const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpMatmul>(name, input.ref(), other.ref(), output.ref(),
-                                   trans_input, trans_other)
+        ->create_op<ModelOpMatmul>(config, name, input.ref(), other.ref(),
+                                   output.ref(), trans_input, trans_other)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_noop.cpp b/ark/ops/ops_noop.cpp
index 894ab29be..42fe5fdf5 100644
--- a/ark/ops/ops_noop.cpp
+++ b/ark/ops/ops_noop.cpp
@@ -30,7 +30,7 @@ Json ModelOpNoop::default_config([[maybe_unused]] const ArchRef arch) const {
 }
 
 void Model::noop(Tensor input, const std::string &name) {
-    impl_->create_op<ModelOpNoop>(name, input.ref_);
+    impl_->create_op<ModelOpNoop>("", name, input.ref_);
 }
 
 }  // namespace ark
diff --git a/ark/ops/ops_reduce.cpp b/ark/ops/ops_reduce.cpp
index 1c91a2f0b..dadd049d2 100644
--- a/ark/ops/ops_reduce.cpp
+++ b/ark/ops/ops_reduce.cpp
@@ -128,25 +128,25 @@ Json ModelOpReduce::default_config([[maybe_unused]] const ArchRef arch) const {
 }
 
 Tensor Model::reduce_max(Tensor input, int axis, bool keepdims, Tensor output,
-                         const std::string &name) {
+                         const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpReduceMax>(name, input.ref_, axis, keepdims,
+        ->create_op<ModelOpReduceMax>(config, name, input.ref_, axis, keepdims,
                                       output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::reduce_mean(Tensor input, int axis, bool keepdims, Tensor output,
-                          const std::string &name) {
+                          const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpReduceMean>(name, input.ref_, axis, keepdims,
+        ->create_op<ModelOpReduceMean>(config, name, input.ref_, axis, keepdims,
                                        output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::reduce_sum(Tensor input, int axis, bool keepdims, Tensor output,
-                         const std::string &name) {
+                         const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpReduceSum>(name, input.ref_, axis, keepdims,
+        ->create_op<ModelOpReduceSum>(config, name, input.ref_, axis, keepdims,
                                       output.ref_)
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_refer.cpp b/ark/ops/ops_refer.cpp
index 782d6708c..68c61b30f 100644
--- a/ark/ops/ops_refer.cpp
+++ b/ark/ops/ops_refer.cpp
@@ -20,7 +20,7 @@ Tensor Model::refer(Tensor input, const Dims &shape, const Dims &strides,
                     const Dims &offsets, const Dims &padded_shape,
                     const std::string &name) {
     return impl_
-        ->create_op<ModelOpRefer>(name, input.ref_, shape, strides, offsets,
+        ->create_op<ModelOpRefer>("", name, input.ref_, shape, strides, offsets,
                                   padded_shape)
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_reshape.cpp b/ark/ops/ops_reshape.cpp
index c4e192908..6ecbba466 100644
--- a/ark/ops/ops_reshape.cpp
+++ b/ark/ops/ops_reshape.cpp
@@ -199,8 +199,8 @@ Tensor Model::reshape(Tensor input, const Dims &shape, bool allowzero,
     reshape_helper(input.ref_, Dims{inferred_shape}, allowzero, new_shape,
                    new_strides, new_offs);
     return impl_
-        ->create_op<ModelOpReshape>(name, input.ref_, new_shape, new_strides,
-                                    new_offs)
+        ->create_op<ModelOpReshape>("", name, input.ref_, new_shape,
+                                    new_strides, new_offs)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_rope.cpp b/ark/ops/ops_rope.cpp
index 06c1c915e..36015aae5 100644
--- a/ark/ops/ops_rope.cpp
+++ b/ark/ops/ops_rope.cpp
@@ -12,9 +12,10 @@ ModelOpRope::ModelOpRope(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Rope", input, other, output) {}
 
 Tensor Model::rope(Tensor input, Tensor other, Tensor output,
-                   const std::string &name) {
+                   const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpRope>(name, input.ref_, other.ref_, output.ref_)
+        ->create_op<ModelOpRope>(config, name, input.ref_, other.ref_,
+                                 output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_scalar.cpp b/ark/ops/ops_scalar.cpp
index 944a7247c..b5c10f1c3 100644
--- a/ark/ops/ops_scalar.cpp
+++ b/ark/ops/ops_scalar.cpp
@@ -115,20 +115,21 @@ std::vector<ModelOpArg> ModelOpScalarMul::impl_args([
 Tensor Model::constant(float val, const Dims &shape, DataType data_type,
                        const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarAssign>(name, val, shape, data_type.ref(),
+        ->create_op<ModelOpScalarAssign>("", name, val, shape, data_type.ref(),
                                          nullptr)
         ->result_tensors()[0];
 }
 
-Tensor Model::copy(float val, Tensor output, const std::string &name) {
+Tensor Model::copy(float val, Tensor output, const std::string &config,
+                   const std::string &name) {
     if (output == NullTensor) {
         return impl_
-            ->create_op<ModelOpScalarAssign>(name, val, Dims{1}, FP32.ref(),
-                                             output.ref())
+            ->create_op<ModelOpScalarAssign>(config, name, val, Dims{1},
+                                             FP32.ref(), output.ref())
             ->result_tensors()[0];
     } else {
         return impl_
-            ->create_op<ModelOpScalarAssign>(name, val, output.shape(),
+            ->create_op<ModelOpScalarAssign>(config, name, val, output.shape(),
                                              output.data_type().ref(),
                                              output.ref())
             ->result_tensors()[0];
@@ -136,30 +137,34 @@ Tensor Model::copy(float val, Tensor output, const std::string &name) {
 }
 
 Tensor Model::add(Tensor input, float value, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarAdd>(name, input.ref_, value, output.ref_)
+        ->create_op<ModelOpScalarAdd>(config, name, input.ref_, value,
+                                      output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::sub(Tensor input, float value, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarAdd>(name, input.ref_, -value, output.ref_)
+        ->create_op<ModelOpScalarAdd>(config, name, input.ref_, -value,
+                                      output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::mul(Tensor input, float value, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarMul>(name, input.ref_, value, output.ref_)
+        ->create_op<ModelOpScalarMul>(config, name, input.ref_, value,
+                                      output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::div(Tensor input, float value, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarMul>(name, input.ref_, 1 / value, output.ref_)
+        ->create_op<ModelOpScalarMul>(config, name, input.ref_, 1 / value,
+                                      output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_tensor.cpp b/ark/ops/ops_tensor.cpp
index 0279ab311..77091fa57 100644
--- a/ark/ops/ops_tensor.cpp
+++ b/ark/ops/ops_tensor.cpp
@@ -27,7 +27,7 @@ Tensor Model::tensor(const Dims &shape, const DataType &data_type,
                      const Dims &strides, const Dims &offsets,
                      const Dims &padded_shape, const std::string &name) {
     return impl_
-        ->create_op<ModelOpTensor>(name, nullptr, shape, data_type.ref(),
+        ->create_op<ModelOpTensor>("", name, nullptr, shape, data_type.ref(),
                                    strides, offsets, padded_shape)
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp
index 3f0ed0131..f099c7fb7 100644
--- a/ark/ops/ops_transpose.cpp
+++ b/ark/ops/ops_transpose.cpp
@@ -124,9 +124,10 @@ Json ModelOpTranspose::default_config([
 }
 
 Tensor Model::transpose(Tensor input, const std::vector<int64_t> &permutation,
-                        Tensor output, const std::string &name) {
+                        Tensor output, const std::string &config,
+                        const std::string &name) {
     return impl_
-        ->create_op<ModelOpTranspose>(name, input.ref_, permutation,
+        ->create_op<ModelOpTranspose>(config, name, input.ref_, permutation,
                                       output.ref_)
         ->result_tensors()[0];
 }
diff --git a/arkprof.py b/arkprof.py
index 9e67c2dfc..5fb62e118 100644
--- a/arkprof.py
+++ b/arkprof.py
@@ -1,6 +1,7 @@
 import ark
 import sys
 
+ark.init()
 ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(
     iter=1000, profile_processor_groups=False
 )
diff --git a/examples/tutorial/context_tutorial.py b/examples/tutorial/context_tutorial.py
new file mode 100644
index 000000000..fb01f0a0c
--- /dev/null
+++ b/examples/tutorial/context_tutorial.py
@@ -0,0 +1,117 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import ark
+import time
+import torch
+import torch.nn.functional as F
+
+
+class VanillaSoftmax(ark.Module):
+    def __init__(self):
+        super(Softmax, self).__init__()
+
+    def forward(self, input):
+        max = ark.reduce_max(input, axis=-1)
+        output = ark.sub(input, max)
+        output = ark.exp(output)
+        sum = ark.reduce_sum(output, axis=-1)
+        output = ark.div(output, sum)
+        return output
+
+
+class Softmax(ark.Module):
+    def __init__(self):
+        super(Softmax, self).__init__()
+
+    def forward(self, input):
+        with ark.ContextManager(
+            processor_range=[0, 304],
+            warp_range=[0, 8],
+            sram_range=[0, 0],
+            task_id=0,
+        ):
+            max = ark.reduce_max(
+                input,
+                axis=-1,
+                config={
+                    "NumWarps": 1,
+                    "ImplType": "WarpWise",
+                    "SramBytes": 0,
+                    "NumTasks": 65536,
+                },
+            )
+            output = ark.sub(
+                input,
+                max,
+                config={
+                    "NumWarps": 1,
+                    "SramBytes": 0,
+                    "Tile": [1, 2048],
+                    "NumTasks": 65536,
+                },
+            )
+            output = ark.exp(
+                output,
+                config={
+                    "NumWarps": 1,
+                    "SramBytes": 0,
+                    "Tile": [1, 2048],
+                    "NumTasks": 65536,
+                },
+            )
+            sum = ark.reduce_sum(
+                output,
+                axis=-1,
+                config={
+                    "NumWarps": 1,
+                    "ImplType": "WarpWise",
+                    "SramBytes": 0,
+                    "NumTasks": 65536,
+                },
+            )
+            output = ark.div(
+                output,
+                sum,
+                config={
+                    "NumWarps": 1,
+                    "SramBytes": 0,
+                    "Tile": [1, 2048],
+                    "NumTasks": 65536,
+                },
+            )
+            return output
+
+
+def eval(tensor: ark.Tensor):
+    with ark.Runtime() as rt:
+        rt.launch()
+        rt.run()
+        return tensor.to_torch()
+
+
+def perf():
+    with ark.Runtime() as rt:
+        rt.launch()
+
+        start = time.time()
+        rt.run(iter=1000)
+        end = time.time()
+        return (end - start) / 1000
+
+
+if __name__ == "__main__":
+    ark.init()
+
+    shape = (32, 2048, 2048)
+
+    input = torch.randn(*shape).to("cuda:0")
+
+    output = Softmax()(ark.Tensor.from_torch(input))
+
+    if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5):
+        print("Correct result")
+    else:
+        print("Incorrect result")
+
+    print(f"Performance: {(perf() * 1e3):.3f} ms/iter")
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index e96972906..00370e683 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -1,7 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import sys
 import os
 
 if os.environ.get("ARK_ROOT", None) is None:
@@ -102,3 +101,4 @@ def set_world_size(world_size):
 )
 from .planner import DefaultPlanner, Plan
 from .profiler import Profiler
+from .context_manager import ContextManager
diff --git a/python/ark/context_manager.py b/python/ark/context_manager.py
new file mode 100644
index 000000000..443e1ca5d
--- /dev/null
+++ b/python/ark/context_manager.py
@@ -0,0 +1,24 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import json
+from .model import Model
+from ._ark_core import _ContextManager
+
+
+class ContextManager(_ContextManager):
+    def __init__(self, **kwargs):
+        context_map = {key: json.dumps(value) for key, value in kwargs.items()}
+        super().__init__(Model.get_model(), context_map)
+
+    def __enter__(self) -> "ContextManager":
+        """
+        Enter the context manager.
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        """
+        Exit the context manager.
+        """
+        del self
diff --git a/python/ark/ops.py b/python/ark/ops.py
index 86b021aef..509e3c891 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -1,7 +1,8 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from typing import List, Iterable, Union
+import json
+from typing import Any, Dict, List, Iterable, Union
 
 from .tensor import Dims, Tensor, Parameter, NullTensor
 from .data_type import DataType, fp32
@@ -12,6 +13,12 @@ def _is_list_or_tuple(obj):
     return isinstance(obj, list) or isinstance(obj, tuple)
 
 
+def _config_to_str(config: Union[str, Dict[str, Any]]) -> str:
+    if isinstance(config, str):
+        return config
+    return json.dumps(config)
+
+
 def _tensor(
     shape: Iterable[int],
     dtype: DataType = fp32,
@@ -50,6 +57,7 @@ def add(
     input: Union[Tensor, float],
     other: Union[Tensor, float],
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "add",
 ) -> Union[Tensor, float]:
     """
@@ -73,12 +81,15 @@ def add(
         return input + other
     else:
         return Tensor(
-            Model.get_model().copy(input + other, output._tensor, name)
+            Model.get_model().copy(
+                input + other, output._tensor, _config_to_str(config), name
+            )
         )
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().add(a, b, output, name), runtime_id=input.runtime_id
+        Model.get_model().add(a, b, output, _config_to_str(config), name),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -86,13 +97,16 @@ def cast(
     input: Tensor,
     dtype: DataType,
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "cast",
 ) -> Tensor:
     """Type casting."""
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().cast(input._tensor, dtype.ctype(), output, name),
+        Model.get_model().cast(
+            input._tensor, dtype.ctype(), output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -112,7 +126,10 @@ def constant(
 
 
 def copy(
-    input: Union[Tensor, float], output: Tensor = NullTensor, name: str = "copy"
+    input: Union[Tensor, float],
+    output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
+    name: str = "copy",
 ) -> Tensor:
     """Data caopy."""
     if output is not NullTensor:
@@ -120,7 +137,7 @@ def copy(
     if isinstance(input, Tensor):
         intput = intput._tensor
     return Tensor(
-        Model.get_model().copy(intput, output, name),
+        Model.get_model().copy(intput, output, _config_to_str(config), name),
         runtime_id=input.runtime_id,
     )
 
@@ -129,6 +146,7 @@ def div(
     input: Tensor,
     other: Union[Tensor, float],
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "div",
 ) -> Tensor:
     """
@@ -144,7 +162,9 @@ def div(
             raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
     return Tensor(
-        Model.get_model().div(input._tensor, other, output, name),
+        Model.get_model().div(
+            input._tensor, other, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -153,6 +173,7 @@ def embedding(
     input: Tensor,
     weight: Tensor,
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "embedding",
 ) -> Tensor:
     """Embedding layer."""
@@ -162,14 +183,17 @@ def embedding(
         output = output._tensor
     return Tensor(
         Model.get_model().embedding(
-            input._tensor, weight._tensor, output, name
+            input._tensor, weight._tensor, output, _config_to_str(config), name
         ),
         runtime_id=input.runtime_id,
     )
 
 
 def exp(
-    input: Tensor, output: Tensor = NullTensor, name: str = "exp"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
+    name: str = "exp",
 ) -> Tensor:
     """
     Calculates the exponential of the `input` tensor, element-wise.
@@ -179,13 +203,18 @@ def exp(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().exp(input._tensor, output, name),
+        Model.get_model().exp(
+            input._tensor, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
 
 def gelu(
-    input: Tensor, output: Tensor = NullTensor, name: str = "gelu"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
+    name: str = "gelu",
 ) -> Tensor:
     """
     Applies the Gaussian Error Linear Unit (GELU) activation
@@ -198,7 +227,9 @@ def gelu(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().gelu(input._tensor, output, name),
+        Model.get_model().gelu(
+            input._tensor, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -230,6 +261,7 @@ def matmul(
     output: Tensor = NullTensor,
     transpose_input: bool = False,
     transpose_other: bool = False,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "matmul",
 ) -> Tensor:
     """
@@ -252,6 +284,7 @@ def matmul(
             output,
             transpose_input,
             transpose_other,
+            _config_to_str(config),
             name,
         ),
         runtime_id=input.runtime_id,
@@ -262,6 +295,7 @@ def mul(
     input: Tensor,
     other: Union[Tensor, float],
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "mul",
 ) -> Tensor:
     """
@@ -277,7 +311,9 @@ def mul(
             raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
     return Tensor(
-        Model.get_model().mul(input._tensor, other, output, name),
+        Model.get_model().mul(
+            input._tensor, other, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -294,6 +330,7 @@ def reduce_max(
     axis: int,
     keepdims: bool = True,
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "reduce_max",
 ) -> Tensor:
     """
@@ -306,7 +343,7 @@ def reduce_max(
         output = output._tensor
     return Tensor(
         Model.get_model().reduce_max(
-            input._tensor, axis, keepdims, output, name
+            input._tensor, axis, keepdims, output, _config_to_str(config), name
         ),
         runtime_id=input.runtime_id,
     )
@@ -317,6 +354,7 @@ def reduce_mean(
     axis: int,
     keepdims: bool = True,
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "reduce_mean",
 ) -> Tensor:
     """
@@ -329,7 +367,7 @@ def reduce_mean(
         output = output._tensor
     return Tensor(
         Model.get_model().reduce_mean(
-            input._tensor, axis, keepdims, output, name
+            input._tensor, axis, keepdims, output, _config_to_str(config), name
         ),
         runtime_id=input.runtime_id,
     )
@@ -340,6 +378,7 @@ def reduce_sum(
     axis: int,
     keepdims: bool = True,
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "reduce_sum",
 ) -> Tensor:
     """
@@ -354,14 +393,17 @@ def reduce_sum(
         output = output._tensor
     return Tensor(
         Model.get_model().reduce_sum(
-            input._tensor, axis, keepdims, output, name
+            input._tensor, axis, keepdims, output, _config_to_str(config), name
         ),
         runtime_id=input.runtime_id,
     )
 
 
 def relu(
-    input: Tensor, output: Tensor = NullTensor, name: str = "relu"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
+    name: str = "relu",
 ) -> Tensor:
     """
     Applies the ReLU activation function to the `input` tensor,
@@ -372,7 +414,9 @@ def relu(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().relu(input._tensor, output, name),
+        Model.get_model().relu(
+            input._tensor, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -411,6 +455,7 @@ def rope(
     input: Tensor,
     other: Tensor,
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "rope",
 ) -> Tensor:
     """
@@ -423,13 +468,18 @@ def rope(
     if input.runtime_id != other.runtime_id:
         raise ValueError("Tensors must be on the same runtime")
     return Tensor(
-        Model.get_model().rope(input._tensor, other._tensor, output, name),
+        Model.get_model().rope(
+            input._tensor, other._tensor, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
 
 def rsqrt(
-    input: Tensor, output: Tensor = NullTensor, name: str = "rsqrt"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
+    name: str = "rsqrt",
 ) -> Tensor:
     """
     Calculates the square root of the `input` tensor, element-wise.
@@ -439,7 +489,9 @@ def rsqrt(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().rsqrt(input._tensor, output, name),
+        Model.get_model().rsqrt(
+            input._tensor, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -465,7 +517,10 @@ def sharding(
 
 
 def sigmoid(
-    input: Tensor, output: Tensor = NullTensor, name: str = "sigmoid"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
+    name: str = "sigmoid",
 ) -> Tensor:
     """
     Applies the Sigmoid activation function to the `input` tensor,
@@ -476,13 +531,18 @@ def sigmoid(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().sigmoid(input._tensor, output, name),
+        Model.get_model().sigmoid(
+            input._tensor, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
 
 def sqrt(
-    input: Tensor, output: Tensor = NullTensor, name: str = "sqrt"
+    input: Tensor,
+    output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
+    name: str = "sqrt",
 ) -> Tensor:
     """
     Calculates the square root of the `input` tensor, element-wise.
@@ -492,7 +552,9 @@ def sqrt(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().sqrt(input._tensor, output, name),
+        Model.get_model().sqrt(
+            input._tensor, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -501,6 +563,7 @@ def sub(
     input: Tensor,
     other: Union[Tensor, float],
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "sub",
 ) -> Tensor:
     """
@@ -516,7 +579,9 @@ def sub(
             raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
     return Tensor(
-        Model.get_model().sub(input._tensor, other, output, name),
+        Model.get_model().sub(
+            input._tensor, other, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -546,6 +611,7 @@ def transpose(
     input: Tensor,
     perm: Iterable[int],
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "transpose",
 ) -> Tensor:
     """
@@ -565,7 +631,9 @@ def transpose(
     if len(perm) > 4:
         raise ValueError("Only support perm up to 4 dimensions")
     return Tensor(
-        Model.get_model().transpose(input._tensor, perm, output, name),
+        Model.get_model().transpose(
+            input._tensor, perm, output, _config_to_str(config), name
+        ),
         runtime_id=input.runtime_id,
     )
 
@@ -578,10 +646,11 @@ def mean(
     axis: int,
     keepdims: bool = True,
     output: Tensor = NullTensor,
+    config: Union[str, Dict[str, Any]] = "",
     name: str = "mean",
 ) -> Tensor:
     """Alias of reduce_mean."""
-    return reduce_mean(input, axis, keepdims, output, name)
+    return reduce_mean(input, axis, keepdims, output, config, name)
 
 
 def ones(
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
index c161b24e6..e47f5b7aa 100644
--- a/python/ark/profiler.py
+++ b/python/ark/profiler.py
@@ -21,8 +21,15 @@ class Profiler:
     def __init__(self, plan: Plan):
         self.plan = plan
 
-    def run(self, iter: int = 1000, loop_mode: bool = True, profile_processor_groups: bool = False):
-        sys.stderr.write(f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n")
+    def run(
+        self,
+        iter: int = 1000,
+        loop_mode: bool = True,
+        profile_processor_groups: bool = False,
+    ):
+        sys.stderr.write(
+            f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n"
+        )
 
         if not profile_processor_groups:
             return
diff --git a/python/ark_py.cpp b/python/ark_py.cpp
index 1bc4255d6..7acd4ad1a 100644
--- a/python/ark_py.cpp
+++ b/python/ark_py.cpp
@@ -7,6 +7,7 @@
 
 namespace py = pybind11;
 
+extern void register_context_manager(py::module &m);
 extern void register_data_type(py::module &m);
 extern void register_dims(py::module &m);
 extern void register_error(py::module &m);
@@ -22,6 +23,7 @@ extern void register_version(py::module &m);
 PYBIND11_MODULE(_ark_core, m) {
     m.doc() = "Bind ARK C++ APIs to Python";
 
+    register_context_manager(m);
     register_data_type(m);
     register_dims(m);
     register_error(m);
diff --git a/python/context_manager_py.cpp b/python/context_manager_py.cpp
new file mode 100644
index 000000000..3d703a4bc
--- /dev/null
+++ b/python/context_manager_py.cpp
@@ -0,0 +1,15 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <pybind11/operators.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <ark/context_manager.hpp>
+
+namespace py = pybind11;
+
+void register_context_manager(py::module &m) {
+    py::class_<ark::ContextManager>(m, "_ContextManager")
+        .def(py::init<ark::Model&, const std::map<std::string, std::string>&>());
+}
diff --git a/python/model_py.cpp b/python/model_py.cpp
index 2d1e5f634..ba17251d8 100644
--- a/python/model_py.cpp
+++ b/python/model_py.cpp
@@ -15,97 +15,109 @@ void register_model(py::module &m) {
         .def(py::init<int, int>(), py::arg("rank"), py::arg("world_size"))
         .def("rank", &ark::Model::rank)
         .def("world_size", &ark::Model::world_size)
-        .def("compress", &ark::Model::compress)
+        .def("compress", &ark::Model::compress, py::arg("merge_nodes") = false)
         .def("add",
              py::overload_cast<ark::Tensor, ark::Tensor, ark::Tensor,
-                               const std::string &>(&ark::Model::add),
+                               const std::string &, const std::string &>(
+                 &ark::Model::add),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("add",
              py::overload_cast<ark::Tensor, float, ark::Tensor,
-                               const std::string &>(&ark::Model::add),
+                               const std::string &, const std::string &>(
+                 &ark::Model::add),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("cast", &ark::Model::cast, py::arg("input"), py::arg("data_type"),
-             py::arg("output"), py::arg("name"))
+             py::arg("output"), py::arg("config"), py::arg("name"))
         .def("constant", &ark::Model::constant, py::arg("value"),
              py::arg("shape"), py::arg("data_type"), py::arg("name"))
         .def("copy",
-             py::overload_cast<ark::Tensor, ark::Tensor, const std::string &>(
-                 &ark::Model::copy),
-             py::arg("input"), py::arg("output"), py::arg("name"))
+             py::overload_cast<ark::Tensor, ark::Tensor, const std::string &,
+                               const std::string &>(&ark::Model::copy),
+             py::arg("input"), py::arg("output"), py::arg("config"),
+             py::arg("name"))
         .def("copy",
-             py::overload_cast<float, ark::Tensor, const std::string &>(
-                 &ark::Model::copy),
-             py::arg("input"), py::arg("output"), py::arg("name"))
+             py::overload_cast<float, ark::Tensor, const std::string &,
+                               const std::string &>(&ark::Model::copy),
+             py::arg("input"), py::arg("output"), py::arg("config"),
+             py::arg("name"))
         .def("div",
              py::overload_cast<ark::Tensor, ark::Tensor, ark::Tensor,
-                               const std::string &>(&ark::Model::div),
+                               const std::string &, const std::string &>(
+                 &ark::Model::div),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("div",
              py::overload_cast<ark::Tensor, float, ark::Tensor,
-                               const std::string &>(&ark::Model::div),
+                               const std::string &, const std::string &>(
+                 &ark::Model::div),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("embedding", &ark::Model::embedding, py::arg("input"),
-             py::arg("weight"), py::arg("output"), py::arg("name"))
-        .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"),
+             py::arg("weight"), py::arg("output"), py::arg("config"),
              py::arg("name"))
+        .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"),
+             py::arg("config"), py::arg("name"))
         .def("gelu", &ark::Model::gelu, py::arg("input"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("identity", &ark::Model::identity, py::arg("input"),
              py::arg("deps"), py::arg("name"))
         .def("matmul", &ark::Model::matmul, py::arg("input"), py::arg("other"),
              py::arg("output"), py::arg("trans_input"), py::arg("trans_other"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("mul",
              py::overload_cast<ark::Tensor, ark::Tensor, ark::Tensor,
-                               const std::string &>(&ark::Model::mul),
+                               const std::string &, const std::string &>(
+                 &ark::Model::mul),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("mul",
              py::overload_cast<ark::Tensor, float, ark::Tensor,
-                               const std::string &>(&ark::Model::mul),
+                               const std::string &, const std::string &>(
+                 &ark::Model::mul),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("noop", &ark::Model::noop, py::arg("input"), py::arg("name"))
         .def("reduce_max", &ark::Model::reduce_max, py::arg("input"),
              py::arg("axis"), py::arg("keepdims"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("reduce_mean", &ark::Model::reduce_mean, py::arg("input"),
              py::arg("axis"), py::arg("keepdims"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("reduce_sum", &ark::Model::reduce_sum, py::arg("input"),
              py::arg("axis"), py::arg("keepdims"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("relu", &ark::Model::relu, py::arg("input"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("reshape", &ark::Model::reshape, py::arg("input"),
              py::arg("shape"), py::arg("allowzero"), py::arg("name"))
         .def("rope", &ark::Model::rope, py::arg("input"), py::arg("other"),
-             py::arg("output"), py::arg("name"))
+             py::arg("output"), py::arg("config"), py::arg("name"))
         .def("rsqrt", &ark::Model::rsqrt, py::arg("input"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("sharding", &ark::Model::sharding, py::arg("input"),
              py::arg("axis"), py::arg("dim_per_shard"), py::arg("name"))
         .def("sigmoid", &ark::Model::sigmoid, py::arg("input"),
-             py::arg("output"), py::arg("name"))
+             py::arg("output"), py::arg("config"), py::arg("name"))
         .def("sqrt", &ark::Model::sqrt, py::arg("input"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("sub",
              py::overload_cast<ark::Tensor, ark::Tensor, ark::Tensor,
-                               const std::string &>(&ark::Model::sub),
+                               const std::string &, const std::string &>(
+                 &ark::Model::sub),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("sub",
              py::overload_cast<ark::Tensor, float, ark::Tensor,
-                               const std::string &>(&ark::Model::sub),
+                               const std::string &, const std::string &>(
+                 &ark::Model::sub),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("name"))
+             py::arg("config"), py::arg("name"))
         .def("tensor", &ark::Model::tensor, py::arg("shape"),
              py::arg("data_type"), py::arg("strides"), py::arg("offsets"),
              py::arg("padded_shape"), py::arg("name"))
         .def("transpose", &ark::Model::transpose, py::arg("input"),
-             py::arg("permutation"), py::arg("output"), py::arg("name"));
+             py::arg("permutation"), py::arg("output"), py::arg("config"),
+             py::arg("name"));
 }

From ef3bb84e8ebb3bb86e256767802401e39d617a85 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 29 Jul 2024 20:31:14 +0000
Subject: [PATCH 41/79] plan manager

---
 ark/api/context_manager_test.cpp           |   1 -
 ark/api/model.cpp                          |   9 ++
 ark/api/plan_manager.cpp                   |  97 ++++++++++++++++
 ark/api/plan_manager_test.cpp              |  58 ++++++++++
 ark/api/planner.cpp                        | 125 +++++++++++++++------
 ark/include/ark/model.hpp                  |   9 +-
 ark/include/ark/model_graph.hpp            |   1 +
 ark/include/ark/plan_manager.hpp           |  25 +++++
 ark/model/model_graph_impl.cpp             |  16 ++-
 ark/model/model_graph_impl.hpp             |   6 +-
 examples/tutorial/context_tutorial.py      | 117 -------------------
 examples/tutorial/plan_manager_tutorial.py |  82 ++++++++++++++
 python/ark/__init__.py                     |   2 +-
 python/ark/context_manager.py              |  24 ----
 python/ark/plan_manager.py                 |  34 ++++++
 python/ark_py.cpp                          |   4 +-
 python/context_manager_py.cpp              |  15 ---
 python/plan_manager_py.cpp                 |  15 +++
 18 files changed, 440 insertions(+), 200 deletions(-)
 create mode 100644 ark/api/plan_manager.cpp
 create mode 100644 ark/api/plan_manager_test.cpp
 create mode 100644 ark/include/ark/plan_manager.hpp
 delete mode 100644 examples/tutorial/context_tutorial.py
 create mode 100644 examples/tutorial/plan_manager_tutorial.py
 delete mode 100644 python/ark/context_manager.py
 create mode 100644 python/ark/plan_manager.py
 delete mode 100644 python/context_manager_py.cpp
 create mode 100644 python/plan_manager_py.cpp

diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp
index ff60b43bf..5fff94f34 100644
--- a/ark/api/context_manager_test.cpp
+++ b/ark/api/context_manager_test.cpp
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include "ark/model.hpp"
 #include "ark/context_manager.hpp"
 
 #include "model/model_node.hpp"
diff --git a/ark/api/model.cpp b/ark/api/model.cpp
index a5a258f71..e9604c341 100644
--- a/ark/api/model.cpp
+++ b/ark/api/model.cpp
@@ -9,6 +9,15 @@
 
 namespace ark {
 
+Model::Model(int rank, int world_size) : ModelGraph(rank, world_size) {
+    static size_t next_id = 0;
+    id_ = next_id++;
+}
+
+Model::Model(const Model &other) : ModelGraph(other), id_(other.id()) {}
+
+size_t Model::id() const { return id_; }
+
 Model Model::compress(bool merge_nodes) const {
     Model model(*this);
     model.compress_nodes(merge_nodes);
diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp
new file mode 100644
index 000000000..aee8d4f7b
--- /dev/null
+++ b/ark/api/plan_manager.cpp
@@ -0,0 +1,97 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/plan_manager.hpp"
+
+#include "logging.h"
+#include "model/model_json.hpp"
+#include "model/model_graph_impl.hpp"
+
+namespace ark {
+
+class PlanManagerState {
+   public:
+    PlanManagerState() : sync(true) {}
+    bool sync;
+};
+
+static std::map<size_t, PlanManagerState> gPlanManagerStates;
+
+PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_id_(model.id()), stop_sync_(false) {
+    auto ctx = Json::parse(plan_context);
+    if (!ctx.is_object()) {
+        ERR(ModelError, "plan context must be a JSON object");
+    }
+    if (gPlanManagerStates.find(model_id_) == gPlanManagerStates.end()) {
+        gPlanManagerStates.emplace(model_id_, PlanManagerState());
+    }
+    auto& state = gPlanManagerStates[model_id_];
+    bool async = !state.sync;
+    std::map<std::string, std::string> context_map;
+    for (const auto& [key, value] : ctx.items()) {
+        if (key == "sync") {
+            if (!value.is_boolean()) {
+                ERR(ModelError, "sync must be a boolean");
+            }
+            if (state.sync && !value.get<bool>()) {
+                stop_sync_ = true;
+                state.sync = false;
+                context_map["AppendTask"] = "true";
+            } else if (!state.sync) {
+                context_map["AppendTask"] = "true";
+            }
+        } else if (key == "processor_range") {
+            if (!value.is_array()) {
+                ERR(ModelError, "processor_range must be an array");
+            }
+            if (async) {
+                LOG(WARN, "Ignoring processor_range under sync=false context");
+                continue;
+            }
+            context_map["ProcessorRange"] = value.dump();
+        } else if (key == "warp_range") {
+            if (!value.is_array()) {
+                ERR(ModelError, "warp_range must be an array");
+            }
+            if (async) {
+                LOG(WARN, "Ignoring warp_range under sync=false context");
+                continue;
+            }
+            context_map["WarpRange"] = value.dump();
+        } else if (key == "sram_range") {
+            if (!value.is_array()) {
+                ERR(ModelError, "sram_range must be an array");
+            }
+            if (async) {
+                LOG(WARN, "Ignoring sram_range under sync=false context");
+                continue;
+            }
+            context_map["SramRange"] = value.dump();
+        } else if (key == "config") {
+            if (!value.is_object()) {
+                ERR(ModelError, "config must be an object");
+            }
+            auto cfg = model.impl_->get_context("Config");
+            if (cfg.empty()) {
+                context_map["Config"] = value.dump();
+            } else {
+                auto cfg_obj = Json::parse(cfg);
+                for (const auto& [k, v] : value.items()) {
+                    cfg_obj[k] = v;
+                }
+                context_map["Config"] = cfg_obj.dump();
+            }
+        } else {
+            LOG(WARN, "Ignoring unknown plan context key: ", key);
+        }
+    }
+    context_manager_ = std::make_shared<ContextManager>(model, context_map);
+}
+
+PlanManager::~PlanManager() {
+    if (stop_sync_) {
+        gPlanManagerStates[model_id_].sync = true;
+    }
+}
+
+}  // namespace ark
diff --git a/ark/api/plan_manager_test.cpp b/ark/api/plan_manager_test.cpp
new file mode 100644
index 000000000..78f5d4cb8
--- /dev/null
+++ b/ark/api/plan_manager_test.cpp
@@ -0,0 +1,58 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/plan_manager.hpp"
+#include "ark/planner.hpp"
+
+#include "model/model_json.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_plan_manager() {
+    ark::Model model;
+    ark::Tensor t0 = model.tensor({1}, ark::FP32);
+    ark::Tensor t1 = model.tensor({1}, ark::FP32);
+    ark::Tensor t2 = model.add(t0, t1);
+
+    ark::Tensor t3;
+    ark::Tensor t4;
+    ark::Tensor t5;
+    ark::Tensor t6;
+    {
+        ark::PlanManager pm_0(model, ark::Json({
+            {"processor_range", {0, 2}},
+            {"warp_range", {0, 4}},
+            {"sram_range", {0, 0}},
+            {"sync", false}
+        }).dump());
+        t3 = model.relu(t2);
+        t4 = model.sqrt(t3);
+    }
+    {
+        ark::PlanManager pm_0(model, ark::Json({
+            {"processor_range", {2, 4}},
+            {"warp_range", {0, 4}},
+            {"sram_range", {0, 0}}
+        }).dump());
+        t5 = model.exp(t2);
+
+        ark::PlanManager pm_1(model, ark::Json({
+            {"processor_range", {2, 3}}
+        }).dump());
+        t6 = model.rsqrt(t5);
+    }
+
+    UNITTEST_TRUE(model.verify());
+
+    ark::DefaultPlanner planner(model, 0);
+    auto plan_str = planner.plan();
+    ark::Json plan = ark::Json::parse(plan_str);
+
+    UNITTEST_LOG(plan_str);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_plan_manager);
+    return 0;
+}
diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index dba149a1e..1c40e5301 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -58,19 +58,35 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
     size_t num_sm = gpu_info.num_sm;
     Json task_infos = Json::array();
     Json processor_groups = Json::array();
-    size_t max_num_warps = 1;
-    size_t max_num_processors = 1;
-    size_t next_node_id = 0;
+    size_t max_processor_id = 1;
+    size_t max_warp_id = 1;
+    size_t next_task_id = 0;
+    bool prev_append_task = false;
+    bool first_op = true;
+
+    auto get_context = [&](const ModelNodeRef &node,
+                           const std::string &key) -> Json {
+        if (node->context.find(key) != node->context.end()) {
+            return Json::parse(node->context.at(key));
+        }
+        return Json();
+    };
+
     for (const auto &node : model_.nodes()) {
+        std::string context = "";
+        for (const auto &[key, value] : node->context) {
+            context += key + "=" + value + ",";
+        }
+        context += "prev_append_task=" + std::to_string(prev_append_task);
+        LOG(INFO, context);
+
         for (const auto &op : node->ops) {
             if (op->is_virtual()) continue;
 
-            Json task_info;
-            task_info["Id"] = next_node_id++;
-
+            auto ctx_config = get_context(node, "Config");
             Json config;
-            if (!op->config().empty()) {
-                config = op->config();
+            if (!ctx_config.empty()) {
+                config = ctx_config;
             } else if (!config_rules_.empty()) {
                 const std::string op_str = op->serialize().dump();
                 for (auto &rule : config_rules_) {
@@ -90,31 +106,70 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
             size_t num_warps = config["NumWarps"];
             size_t num_tasks = config["NumTasks"];
             size_t sram_bytes = config["SramBytes"];
-            task_info["NumWarps"] = num_warps;
-            task_info["SramBytes"] = sram_bytes;
-
-            max_num_warps = std::max(max_num_warps, num_warps);
-
-            task_info["Ops"] = Json::array();
-            task_info["Ops"].push_back(op->serialize());
-            task_info["Ops"][0]["Config"] = config;
-            task_infos.push_back(task_info);
-
-            Json resource_group;
-            size_t num_processors = std::min(num_sm, num_tasks);
-            max_num_processors = std::max(max_num_processors, num_processors);
-            resource_group["ProcessorRange"] = {0, num_processors};
-            resource_group["WarpRange"] = {0, num_warps};
-            resource_group["SramRange"] = {0, sram_bytes};
-            resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]},
-                                             {"TaskRange", {0, num_tasks}},
-                                             {"Granularity", 1}}};
-
-            Json processor_group;
-            processor_group["ProcessorRange"] = {0, num_processors};
-            processor_group["ResourceGroups"] = Json::array();
-            processor_group["ResourceGroups"].push_back(resource_group);
-            processor_groups.push_back(processor_group);
+
+            auto ctx_append_task = get_context(node, "AppendTask");
+            if (!ctx_append_task.empty() && ctx_append_task.get<bool>() &&
+                prev_append_task) {
+                auto &task_info = task_infos.back();
+                task_info["NumWarps"] =
+                    std::max(task_info["NumWarps"].get<size_t>(), num_warps);
+                task_info["SramBytes"] =
+                    std::max(task_info["SramBytes"].get<size_t>(), sram_bytes);
+                task_info["Ops"].push_back(op->serialize());
+                task_info["Ops"].back()["Config"] = config;
+            } else {
+                Json task_info;
+                task_info["Id"] = first_op ? next_task_id : ++next_task_id;
+                task_info["NumWarps"] = num_warps;
+                task_info["SramBytes"] = sram_bytes;
+                task_info["Ops"] = Json::array();
+                task_info["Ops"].push_back(op->serialize());
+                task_info["Ops"][0]["Config"] = config;
+                task_infos.push_back(task_info);
+
+                auto ctx_processor_range = get_context(node, "ProcessorRange");
+                auto ctx_warp_range = get_context(node, "WarpRange");
+                auto ctx_sram_range = get_context(node, "SramRange");
+
+                Json processor_group;
+                if (!ctx_processor_range.empty()) {
+                    processor_group["ProcessorRange"] = ctx_processor_range;
+                    max_processor_id = std::max(
+                        max_processor_id, ctx_processor_range[1].get<size_t>());
+                } else {
+                    size_t num_processors = std::min(num_sm, num_tasks);
+                    processor_group["ProcessorRange"] = {0, num_processors};
+                    max_processor_id =
+                        std::max(max_processor_id, num_processors);
+                }
+
+                Json resource_group;
+                resource_group["ProcessorRange"] =
+                    processor_group["ProcessorRange"];
+                if (!ctx_warp_range.empty()) {
+                    resource_group["WarpRange"] = ctx_warp_range;
+                    max_warp_id =
+                        std::max(max_warp_id, ctx_warp_range[1].get<size_t>());
+                } else {
+                    resource_group["WarpRange"] = {0, num_warps};
+                    max_warp_id = std::max(max_warp_id, num_warps);
+                }
+                if (!ctx_sram_range.empty()) {
+                    resource_group["SramRange"] = ctx_sram_range;
+                } else {
+                    resource_group["SramRange"] = {0, sram_bytes};
+                }
+                resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]},
+                                                 {"TaskRange", {0, num_tasks}},
+                                                 {"Granularity", 1}}};
+
+                processor_group["ResourceGroups"] = Json::array();
+                processor_group["ResourceGroups"].push_back(resource_group);
+                processor_groups.push_back(processor_group);
+            }
+            prev_append_task =
+                !ctx_append_task.empty() && ctx_append_task.get<bool>();
+            first_op = false;
         }
     }
 
@@ -122,8 +177,8 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
     plan["Rank"] = model_.rank();
     plan["WorldSize"] = model_.world_size();
     plan["Architecture"] = gpu_info.arch->name();
-    plan["NumProcessors"] = max_num_processors;
-    plan["NumWarpsPerProcessor"] = max_num_warps;
+    plan["NumProcessors"] = max_processor_id;
+    plan["NumWarpsPerProcessor"] = max_warp_id;
     plan["TaskInfos"] = task_infos;
     plan["ProcessorGroups"] = processor_groups;
 
diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp
index 35efe53d5..e0b17be52 100644
--- a/ark/include/ark/model.hpp
+++ b/ark/include/ark/model.hpp
@@ -17,15 +17,20 @@ namespace ark {
 
 class Model : public ModelGraph {
    private:
+    size_t id_;
     std::set<int> tags_;
 
    public:
-    Model(int rank = 0, int world_size = 1) : ModelGraph(rank, world_size) {}
-    Model(const Model &other) : ModelGraph(other) {}
+    Model(int rank = 0, int world_size = 1);
+
+    Model(const Model &other);
+
     ~Model() {}
 
     Model &operator=(const Model &other) = default;
 
+    size_t id() const;
+
     Model compress(bool merge_nodes = false) const;
 
     int unique_tag();
diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp
index f6390a2a9..c53c98c3a 100644
--- a/ark/include/ark/model_graph.hpp
+++ b/ark/include/ark/model_graph.hpp
@@ -38,6 +38,7 @@ class ModelGraph {
 
    protected:
     friend class Model;
+    friend class PlanManager;
     friend class ContextManager;
 
     class Impl;
diff --git a/ark/include/ark/plan_manager.hpp b/ark/include/ark/plan_manager.hpp
new file mode 100644
index 000000000..3952a1c06
--- /dev/null
+++ b/ark/include/ark/plan_manager.hpp
@@ -0,0 +1,25 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_PLAN_MANAGER_HPP
+#define ARK_PLAN_MANAGER_HPP
+
+#include <ark/context_manager.hpp>
+
+namespace ark {
+
+class PlanManager {
+   public:
+    PlanManager(Model& model, const std::string& plan_context);
+
+    ~PlanManager();
+
+   private:
+    size_t model_id_;
+    bool stop_sync_;
+    std::shared_ptr<ContextManager> context_manager_;
+};
+
+}  // namespace ark
+
+#endif  // ARK_PLAN_MANAGER_HPP
diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp
index 53a7fa851..385424e57 100644
--- a/ark/model/model_graph_impl.cpp
+++ b/ark/model/model_graph_impl.cpp
@@ -37,7 +37,15 @@ void ModelGraphContextStack::pop(const std::string &key) {
     it->second.pop_back();
 }
 
-std::map<std::string, std::string> ModelGraphContextStack::current_context() const {
+std::string ModelGraphContextStack::get_context(const std::string &key) const {
+    if (this->storage_.find(key) == this->storage_.end() ||
+        this->storage_.at(key).empty()) {
+        return "";
+    }
+    return *this->storage_.at(key).back();
+}
+
+std::map<std::string, std::string> ModelGraphContextStack::get_context_all() const {
     std::map<std::string, std::string> cur;
     for (const auto &pair : this->storage_) {
         if (!pair.second.empty()) {
@@ -167,6 +175,10 @@ bool ModelGraph::Impl::verify() const {
     return true;
 }
 
+std::string ModelGraph::Impl::get_context(const std::string &key) const {
+    return context_stack_->get_context(key);
+}
+
 ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) {
     for (auto &tns : op->input_tensors()) {
         if (tensor_to_producer_op_.find(tns) == tensor_to_producer_op_.end()) {
@@ -205,7 +217,7 @@ ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) {
         producer->consumers.push_back(node);
     }
 
-    node->context = context_stack_->current_context();
+    node->context = context_stack_->get_context_all();
 
     nodes_.push_back(node);
     return node;
diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp
index fbfc54c7e..ec255423e 100644
--- a/ark/model/model_graph_impl.hpp
+++ b/ark/model/model_graph_impl.hpp
@@ -34,7 +34,9 @@ class ModelGraphContextStack {
 
     void pop(const std::string &key);
 
-    std::map<std::string, std::string> current_context() const;
+    std::string get_context(const std::string &key) const;
+
+    std::map<std::string, std::string> get_context_all() const;
 };
 
 class ModelGraph::Impl {
@@ -80,6 +82,8 @@ class ModelGraph::Impl {
 
     bool verify() const;
 
+    std::string get_context(const std::string &key) const;
+
     std::string serialize(bool pretty = true) const;
 
     std::vector<ModelNodeRef> nodes() const;
diff --git a/examples/tutorial/context_tutorial.py b/examples/tutorial/context_tutorial.py
deleted file mode 100644
index fb01f0a0c..000000000
--- a/examples/tutorial/context_tutorial.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import ark
-import time
-import torch
-import torch.nn.functional as F
-
-
-class VanillaSoftmax(ark.Module):
-    def __init__(self):
-        super(Softmax, self).__init__()
-
-    def forward(self, input):
-        max = ark.reduce_max(input, axis=-1)
-        output = ark.sub(input, max)
-        output = ark.exp(output)
-        sum = ark.reduce_sum(output, axis=-1)
-        output = ark.div(output, sum)
-        return output
-
-
-class Softmax(ark.Module):
-    def __init__(self):
-        super(Softmax, self).__init__()
-
-    def forward(self, input):
-        with ark.ContextManager(
-            processor_range=[0, 304],
-            warp_range=[0, 8],
-            sram_range=[0, 0],
-            task_id=0,
-        ):
-            max = ark.reduce_max(
-                input,
-                axis=-1,
-                config={
-                    "NumWarps": 1,
-                    "ImplType": "WarpWise",
-                    "SramBytes": 0,
-                    "NumTasks": 65536,
-                },
-            )
-            output = ark.sub(
-                input,
-                max,
-                config={
-                    "NumWarps": 1,
-                    "SramBytes": 0,
-                    "Tile": [1, 2048],
-                    "NumTasks": 65536,
-                },
-            )
-            output = ark.exp(
-                output,
-                config={
-                    "NumWarps": 1,
-                    "SramBytes": 0,
-                    "Tile": [1, 2048],
-                    "NumTasks": 65536,
-                },
-            )
-            sum = ark.reduce_sum(
-                output,
-                axis=-1,
-                config={
-                    "NumWarps": 1,
-                    "ImplType": "WarpWise",
-                    "SramBytes": 0,
-                    "NumTasks": 65536,
-                },
-            )
-            output = ark.div(
-                output,
-                sum,
-                config={
-                    "NumWarps": 1,
-                    "SramBytes": 0,
-                    "Tile": [1, 2048],
-                    "NumTasks": 65536,
-                },
-            )
-            return output
-
-
-def eval(tensor: ark.Tensor):
-    with ark.Runtime() as rt:
-        rt.launch()
-        rt.run()
-        return tensor.to_torch()
-
-
-def perf():
-    with ark.Runtime() as rt:
-        rt.launch()
-
-        start = time.time()
-        rt.run(iter=1000)
-        end = time.time()
-        return (end - start) / 1000
-
-
-if __name__ == "__main__":
-    ark.init()
-
-    shape = (32, 2048, 2048)
-
-    input = torch.randn(*shape).to("cuda:0")
-
-    output = Softmax()(ark.Tensor.from_torch(input))
-
-    if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5):
-        print("Correct result")
-    else:
-        print("Incorrect result")
-
-    print(f"Performance: {(perf() * 1e3):.3f} ms/iter")
diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py
new file mode 100644
index 000000000..25aca7af6
--- /dev/null
+++ b/examples/tutorial/plan_manager_tutorial.py
@@ -0,0 +1,82 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import ark
+import time
+import torch
+import torch.nn.functional as F
+
+
+class VanillaSoftmax(ark.Module):
+    def __init__(self):
+        super(Softmax, self).__init__()
+
+    def forward(self, input):
+        max = ark.reduce_max(input, axis=-1)
+        output = ark.sub(input, max)
+        output = ark.exp(output)
+        sum = ark.reduce_sum(output, axis=-1)
+        output = ark.div(output, sum)
+        return output
+
+
+class Softmax(ark.Module):
+    def __init__(self):
+        super(Softmax, self).__init__()
+
+    def forward(self, input):
+        with ark.PlanManager(
+            processor_range=[0, 304],
+            warp_range=[0, 8],
+            sram_range=[0, 0],
+            sync=False,
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "NumTasks": 65536,
+            }
+        ):
+            with ark.PlanManager(config={"ImplType": "WarpWise"}):
+                max = ark.reduce_max(input, axis=-1)
+            with ark.PlanManager(config={"Tile": [1, 2048]}):
+                output = ark.sub(input, max)
+                output = ark.exp(output)
+            with ark.PlanManager(config={"ImplType": "WarpWise"}):
+                sum = ark.reduce_sum(output, axis=-1)
+            with ark.PlanManager(config={"Tile": [1, 2048]}):
+                output = ark.div(output, sum)
+            return output
+
+
+def eval(tensor: ark.Tensor):
+    with ark.Runtime() as rt:
+        rt.launch()
+        rt.run()
+        return tensor.to_torch()
+
+
+def perf():
+    with ark.Runtime() as rt:
+        rt.launch()
+
+        start = time.time()
+        rt.run(iter=1000)
+        end = time.time()
+        return (end - start) / 1000
+
+
+if __name__ == "__main__":
+    ark.init()
+
+    shape = (32, 2048, 2048)
+
+    input = torch.randn(*shape).to("cuda:0")
+
+    output = Softmax()(ark.Tensor.from_torch(input))
+
+    if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5):
+        print("Correct result")
+    else:
+        print("Incorrect result")
+
+    print(f"Performance: {(perf() * 1e3):.3f} ms/iter")
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index 00370e683..db19b59d4 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -101,4 +101,4 @@ def set_world_size(world_size):
 )
 from .planner import DefaultPlanner, Plan
 from .profiler import Profiler
-from .context_manager import ContextManager
+from .plan_manager import PlanManager
diff --git a/python/ark/context_manager.py b/python/ark/context_manager.py
deleted file mode 100644
index 443e1ca5d..000000000
--- a/python/ark/context_manager.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import json
-from .model import Model
-from ._ark_core import _ContextManager
-
-
-class ContextManager(_ContextManager):
-    def __init__(self, **kwargs):
-        context_map = {key: json.dumps(value) for key, value in kwargs.items()}
-        super().__init__(Model.get_model(), context_map)
-
-    def __enter__(self) -> "ContextManager":
-        """
-        Enter the context manager.
-        """
-        return self
-
-    def __exit__(self, exc_type, exc_value, exc_tb):
-        """
-        Exit the context manager.
-        """
-        del self
diff --git a/python/ark/plan_manager.py b/python/ark/plan_manager.py
new file mode 100644
index 000000000..80e615ab8
--- /dev/null
+++ b/python/ark/plan_manager.py
@@ -0,0 +1,34 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import json
+from typing import List, Dict, Any
+from .model import Model
+from ._ark_core import _PlanManager
+
+
+class PlanManager(_PlanManager):
+    def __init__(self, **kwargs):
+        """
+        Plan manager for specifying the parallelization and tiling configuration of the operators in the context.
+
+        Args:
+            processor_range (List[int], optional): The range of processors to be used. Defaults to None.
+            warp_range (List[int], optional): The range of warps to be used. Defaults to None.
+            sram_range (List[int], optional): The range of SRAMs to be used. Defaults to None.
+            sync (bool, optional): Whether to synchronize the execution. Defaults to True.
+            config (Dict[str, Any], optional): The configuration for the operators. Defaults to None.
+        """
+        super().__init__(Model.get_model(), json.dumps(kwargs))
+
+    def __enter__(self) -> "PlanManager":
+        """
+        Enter the plan manager.
+        """
+        return self
+
+    def __exit__(self, exc_type, exc_value, exc_tb):
+        """
+        Exit the plan manager.
+        """
+        del self
diff --git a/python/ark_py.cpp b/python/ark_py.cpp
index 7acd4ad1a..75788ba55 100644
--- a/python/ark_py.cpp
+++ b/python/ark_py.cpp
@@ -7,7 +7,7 @@
 
 namespace py = pybind11;
 
-extern void register_context_manager(py::module &m);
+extern void register_plan_manager(py::module &m);
 extern void register_data_type(py::module &m);
 extern void register_dims(py::module &m);
 extern void register_error(py::module &m);
@@ -23,7 +23,7 @@ extern void register_version(py::module &m);
 PYBIND11_MODULE(_ark_core, m) {
     m.doc() = "Bind ARK C++ APIs to Python";
 
-    register_context_manager(m);
+    register_plan_manager(m);
     register_data_type(m);
     register_dims(m);
     register_error(m);
diff --git a/python/context_manager_py.cpp b/python/context_manager_py.cpp
deleted file mode 100644
index 3d703a4bc..000000000
--- a/python/context_manager_py.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include <pybind11/operators.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include <ark/context_manager.hpp>
-
-namespace py = pybind11;
-
-void register_context_manager(py::module &m) {
-    py::class_<ark::ContextManager>(m, "_ContextManager")
-        .def(py::init<ark::Model&, const std::map<std::string, std::string>&>());
-}
diff --git a/python/plan_manager_py.cpp b/python/plan_manager_py.cpp
new file mode 100644
index 000000000..34aa0b77c
--- /dev/null
+++ b/python/plan_manager_py.cpp
@@ -0,0 +1,15 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <pybind11/operators.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include <ark/plan_manager.hpp>
+
+namespace py = pybind11;
+
+void register_plan_manager(py::module &m) {
+    py::class_<ark::PlanManager>(m, "_PlanManager")
+        .def(py::init<ark::Model&, const std::string&>());
+}

From 7a7f70e43d3e6e327abf5fe835fad1902c803ca0 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 30 Jul 2024 04:45:27 +0000
Subject: [PATCH 42/79] fix

---
 ark/api/plan_manager.cpp                   |  8 ++++----
 ark/api/planner.cpp                        | 22 ++++++++--------------
 examples/tutorial/plan_manager_tutorial.py |  3 +--
 python/ark/tensor.py                       |  7 +++++--
 4 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp
index aee8d4f7b..8cb1940b1 100644
--- a/ark/api/plan_manager.cpp
+++ b/ark/api/plan_manager.cpp
@@ -17,7 +17,9 @@ class PlanManagerState {
 
 static std::map<size_t, PlanManagerState> gPlanManagerStates;
 
-PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_id_(model.id()), stop_sync_(false) {
+PlanManager::PlanManager(Model& model, const std::string& plan_context)
+    : model_id_(model.id()), stop_sync_(false) {
+    static int task_group_id = 0;
     auto ctx = Json::parse(plan_context);
     if (!ctx.is_object()) {
         ERR(ModelError, "plan context must be a JSON object");
@@ -36,9 +38,7 @@ PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_
             if (state.sync && !value.get<bool>()) {
                 stop_sync_ = true;
                 state.sync = false;
-                context_map["AppendTask"] = "true";
-            } else if (!state.sync) {
-                context_map["AppendTask"] = "true";
+                context_map["TaskGroupId"] = std::to_string(task_group_id++);
             }
         } else if (key == "processor_range") {
             if (!value.is_array()) {
diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index 1c40e5301..032be0d6f 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -61,7 +61,7 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
     size_t max_processor_id = 1;
     size_t max_warp_id = 1;
     size_t next_task_id = 0;
-    bool prev_append_task = false;
+    int prev_task_group_id = -1;
     bool first_op = true;
 
     auto get_context = [&](const ModelNodeRef &node,
@@ -73,13 +73,6 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
     };
 
     for (const auto &node : model_.nodes()) {
-        std::string context = "";
-        for (const auto &[key, value] : node->context) {
-            context += key + "=" + value + ",";
-        }
-        context += "prev_append_task=" + std::to_string(prev_append_task);
-        LOG(INFO, context);
-
         for (const auto &op : node->ops) {
             if (op->is_virtual()) continue;
 
@@ -106,10 +99,12 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
             size_t num_warps = config["NumWarps"];
             size_t num_tasks = config["NumTasks"];
             size_t sram_bytes = config["SramBytes"];
+            size_t granularity = config.value("Granularity", 1);
 
-            auto ctx_append_task = get_context(node, "AppendTask");
-            if (!ctx_append_task.empty() && ctx_append_task.get<bool>() &&
-                prev_append_task) {
+            auto ctx_task_group_id = get_context(node, "TaskGroupId");
+            int task_group_id =
+                ctx_task_group_id.empty() ? -1 : ctx_task_group_id.get<int>();
+            if (task_group_id != -1 && task_group_id == prev_task_group_id) {
                 auto &task_info = task_infos.back();
                 task_info["NumWarps"] =
                     std::max(task_info["NumWarps"].get<size_t>(), num_warps);
@@ -161,14 +156,13 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
                 }
                 resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]},
                                                  {"TaskRange", {0, num_tasks}},
-                                                 {"Granularity", 1}}};
+                                                 {"Granularity", granularity}}};
 
                 processor_group["ResourceGroups"] = Json::array();
                 processor_group["ResourceGroups"].push_back(resource_group);
                 processor_groups.push_back(processor_group);
             }
-            prev_append_task =
-                !ctx_append_task.empty() && ctx_append_task.get<bool>();
+            prev_task_group_id = task_group_id;
             first_op = false;
         }
     }
diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py
index 25aca7af6..c840ce0c0 100644
--- a/examples/tutorial/plan_manager_tutorial.py
+++ b/examples/tutorial/plan_manager_tutorial.py
@@ -26,7 +26,6 @@ def __init__(self):
 
     def forward(self, input):
         with ark.PlanManager(
-            processor_range=[0, 304],
             warp_range=[0, 8],
             sram_range=[0, 0],
             sync=False,
@@ -34,7 +33,7 @@ def forward(self, input):
                 "NumWarps": 1,
                 "SramBytes": 0,
                 "NumTasks": 65536,
-            }
+            },
         ):
             with ark.PlanManager(config={"ImplType": "WarpWise"}):
                 max = ark.reduce_max(input, axis=-1)
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 657da1065..eed7a4259 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -193,7 +193,9 @@ def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor":
         ark_tensor = _Tensor(dl_capsule, ark_dtype.ctype())
         return Tensor(ark_tensor, runtime_id=runtime_id)
 
-    def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
+    def copy(
+        self, data: Union[np.ndarray, torch.Tensor], stream: int = 0
+    ) -> "Tensor":
         """
         Copies data into this tensor. The data type may differ,
         but the size must match.
@@ -214,6 +216,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
                 self._tensor,
                 data.data_ptr(),
                 tensor_bytes,
+                stream,
                 data.device.type == "cuda",
             )
         elif isinstance(data, np.ndarray):
@@ -221,7 +224,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
                 data = np.ascontiguousarray(data)
             if data.nbytes != tensor_bytes:
                 raise ValueError("data size does not match the tensor")
-            rt.executor.tensor_write(self._tensor, data)
+            rt.executor.tensor_write(self._tensor, data, stream)
         else:
             raise ValueError("data must be a numpy array or a torch tensor")
         return self

From a77a2ea6b864562f4e916dbaaf30f82e080aad93 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 30 Jul 2024 05:48:00 +0000
Subject: [PATCH 43/79] llama example

---
 examples/llama/model_7b_b1_s2048.py | 704 ++++++++++++++++++++++++++++
 examples/llama/model_test.py        |   6 +-
 2 files changed, 708 insertions(+), 2 deletions(-)
 create mode 100644 examples/llama/model_7b_b1_s2048.py

diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py
new file mode 100644
index 000000000..f41304e85
--- /dev/null
+++ b/examples/llama/model_7b_b1_s2048.py
@@ -0,0 +1,704 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""LLaMA 2 Transformer model.
+   Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py
+"""
+
+import ark
+import math
+from dataclasses import dataclass
+from typing import Optional
+import os
+
+
+@dataclass
+class ModelArgs:
+    dim: int = 4096
+    n_layers: int = 32
+    n_heads: int = 32
+    n_kv_heads: Optional[int] = None
+    vocab_size: int = -1  # defined later by tokenizer
+    multiple_of: int = (
+        256  # make SwiGLU hidden layer size multiple of large power of 2
+    )
+    ffn_dim_multiplier: Optional[float] = None
+    norm_eps: float = 1e-5
+    max_batch_size: int = 32
+    max_seq_len: int = 2048
+
+
+@dataclass
+class ModelArgs7B(ModelArgs):
+    dim: int = 4096
+    n_layers: int = 32
+    n_heads: int = 32
+    n_kv_heads: Optional[int] = None
+    vocab_size: int = -1  # defined later by tokenizer
+    multiple_of: int = (
+        256  # make SwiGLU hidden layer size multiple of large power of 2
+    )
+    ffn_dim_multiplier: Optional[float] = None
+    norm_eps: float = 1e-5
+    max_batch_size: int = 32
+    max_seq_len: int = 2048
+
+
+@dataclass
+class ModelArgs13B(ModelArgs):
+    dim: int = 5120
+    n_layers: int = 40
+    n_heads: int = 40
+    n_kv_heads: Optional[int] = None
+    vocab_size: int = -1  # defined later by tokenizer
+    multiple_of: int = (
+        256  # make SwiGLU hidden layer size multiple of large power of 2
+    )
+    ffn_dim_multiplier: Optional[float] = None
+    norm_eps: float = 1e-5
+    max_batch_size: int = 32
+    max_seq_len: int = 2048
+
+
+@dataclass
+class ModelArgs70B(ModelArgs):
+    dim: int = 8192
+    n_layers: int = 80
+    n_heads: int = 64
+    n_kv_heads: Optional[int] = 8
+    vocab_size: int = -1
+    multiple_of: int = (
+        4096  # make SwiGLU hidden layer size multiple of large power of 2
+    )
+    ffn_dim_multiplier: Optional[float] = 1.3
+    norm_eps: float = 1e-5
+    max_batch_size: int = 32
+    max_seq_len: int = 4096
+
+
+class RMSNorm(ark.Module):
+    """
+    Root mean square layer normalization (RMSNorm).
+    """
+
+    def __init__(
+        self, dim: int, eps: float = 1e-6, dtype: ark.DataType = ark.fp16
+    ):
+        super().__init__()
+        self.eps = eps
+        self.dtype = dtype
+        self.weight = ark.parameter([1, 1, dim], ark.fp32)
+
+    def forward(self, x):
+        with ark.PlanManager(
+            warp_range=[0, 8],
+            sync=False,
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "NumTasks": 2048,
+                "Granularity": 7,
+            },
+        ):
+            with ark.PlanManager(config={"Tile": [1, 4096]}):
+                x = ark.cast(x, ark.fp32)
+                x2 = ark.mul(x, x)
+            with ark.PlanManager(config={"ImplType": "WarpWise"}):
+                mean = ark.reduce_mean(x2, axis=-1)
+        with ark.PlanManager(
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "Tile": [64, 1],
+                "NumTasks": 32,
+            }
+        ):
+            rrms = ark.rsqrt(mean)
+        with ark.PlanManager(
+            warp_range=[0, 8],
+            sync=False,
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "NumTasks": 2048,
+                "Tile": [1, 4096],
+                "Granularity": 7,
+            },
+        ):
+            x = ark.mul(x, rrms)
+            x = ark.mul(x, self.weight, x)
+            return ark.cast(x, self.dtype)
+
+
+class ColumnParallelLinear(ark.Module):
+    """Linear layer with column parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its second dimension as A = [A_1, ..., A_p].
+    Here the weight = A^T, so we need to partition the weight matrix along
+    its first dimension.
+
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        dtype: ark.DataType = ark.fp16,
+        gather_output: bool = True,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.dtype = dtype
+        self.local_rank = local_rank
+        self.world_size = world_size
+        self.gather_output = gather_output
+
+        self.weight = ark.parameter([out_dim // world_size, in_dim], dtype)
+
+    def forward(self, x):
+        if self.world_size == 1 or self.gather_output == False:
+            return ark.matmul(x, self.weight, transpose_other=True)
+        # We need to concat the output_tensor_shards along the last dimension
+        output_tensor = ark.tensor(
+            [x.shape()[0], x.shape()[1], self.out_dim], self.dtype
+        )
+        output_tensor_shards = ark.sharding(
+            output_tensor,
+            axis=2,
+            dim_per_shard=self.out_dim // self.world_size,
+        )
+        local_result = ark.identity(
+            output_tensor_shards[self.local_rank], deps=output_tensor_shards
+        )
+        # (batch_size, seq_len, out_dim // world_size)
+        local_result = ark.matmul(
+            x, self.weight, local_result, transpose_other=True
+        )
+        gather_input = ark.identity(output_tensor, deps=[local_result])
+        # return gather_input
+        gather_reshape = ark.reshape(
+            gather_input, [x.shape()[0] * x.shape()[1], self.out_dim]
+        )
+        gather_out = ark.local_all_gather(
+            gather_reshape, self.local_rank, self.world_size, 1
+        )
+        return ark.reshape(
+            gather_out, [x.shape()[0], x.shape()[1], self.out_dim]
+        )
+
+
+class RowParallelLinear(ark.Module):
+    """Linear layer with row parallelism.
+
+    The linear layer is defined as Y = XA + b. A is parallelized along
+    its first dimension and X along its second dimension as:
+               -   -
+              | A_1 |
+              | .   |
+          A = | .   |        X = [X_1, ..., X_p]
+              | .   |
+              | A_p |
+               -   -
+
+    Here the weight = A^T, so we need to partition the weight matrix along
+    its second dimension.
+    """
+
+    def __init__(
+        self,
+        in_dim: int,
+        out_dim: int,
+        dtype: ark.DataType = ark.fp16,
+        input_is_parallel: bool = False,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+        self.dtype = dtype
+        self.local_rank = local_rank
+        self.world_size = world_size
+        self.input_is_parallel = input_is_parallel
+
+        self.weight = ark.parameter([out_dim, in_dim // world_size], dtype)
+
+    def forward(self, x):
+        if self.world_size == 1:
+            return ark.matmul(x, self.weight, transpose_other=True)
+        x_ndims = len(x.shape())
+        if self.input_is_parallel:
+            input_parallel = x
+        else:
+            x_shards = ark.sharding(
+                x, x_ndims - 1, self.in_dim // self.world_size
+            )
+            input_parallel = x_shards[self.local_rank]
+        local_result = ark.matmul(
+            input_parallel, self.weight, transpose_other=True
+        )
+        reduced_result = ark.local_all_reduce(
+            local_result, self.local_rank, self.world_size
+        )
+        return reduced_result
+
+
+class ParallelEmbedding(ark.Module):
+    """Embedding layer."""
+
+    def __init__(
+        self,
+        vocab_size: int,
+        dim: int,
+        dtype: ark.DataType,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.vocab_size = vocab_size
+        self.dim = dim
+        self.weight = ark.parameter([vocab_size, dim // world_size], dtype)
+        self.out_dim = dim
+        self.dtype = dtype
+        self.world_size = world_size
+        self.local_rank = local_rank
+
+    def forward(self, x):
+        if self.world_size == 1:
+            return ark.embedding(x, self.weight)
+
+        output_tensor = ark.tensor(
+            [x.shape()[0], x.shape()[1], self.out_dim], self.dtype
+        )
+        output_tensor_shards = ark.sharding(
+            output_tensor, axis=2, dim_per_shard=self.out_dim // self.world_size
+        )
+        local_result = ark.identity(
+            output_tensor_shards[self.local_rank], deps=output_tensor_shards
+        )
+        local_result = ark.embedding(x, self.weight, local_result)
+        gather_input = ark.identity(output_tensor, deps=[local_result])
+        gather_reshape = ark.reshape(
+            gather_input, [x.shape()[0] * x.shape()[1], self.out_dim]
+        )
+        gather_out = ark.local_all_gather(
+            gather_reshape, self.local_rank, self.world_size, 1
+        )
+        return ark.reshape(
+            gather_out, [x.shape()[0], x.shape()[1], self.out_dim]
+        )
+
+
+class Linear(ark.Module):
+    """
+    Linear layer module with weights and no bias.
+    """
+
+    def __init__(
+        self, in_dim: int, out_dim: int, dtype: ark.DataType = ark.fp16
+    ):
+        super().__init__()
+        self.dtype = dtype
+        self.weight = ark.parameter([out_dim, in_dim], dtype)
+
+    def forward(self, x):
+        return ark.matmul(x, self.weight, transpose_other=True)
+
+
+class Silu(ark.Module):
+    """
+    Silu activation function, silu(x) = x * sigmoid(x)
+    """
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x: ark.Tensor):
+        # We need to specify output tensor so that the sigmoid op will not be an in-place operator
+        output = ark.tensor(x.shape(), x.dtype())
+        x1 = ark.sigmoid(x, output)
+        return ark.mul(x, x1)
+
+
+class FeedForward(ark.Module):
+    def __init__(
+        self,
+        dim: int,
+        hidden_dim: int,
+        multiple_of: int,
+        ffn_dim_multiplier: Optional[float],
+        dtype: ark.DataType = ark.fp16,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        # custom dim factor multiplier
+        if ffn_dim_multiplier is not None:
+            hidden_dim = int(ffn_dim_multiplier * hidden_dim)
+        hidden_dim = multiple_of * (
+            (hidden_dim + multiple_of - 1) // multiple_of
+        )
+
+        self.w1 = ColumnParallelLinear(
+            dim, hidden_dim, dtype, False, local_rank, world_size
+        )
+        self.w2 = RowParallelLinear(
+            hidden_dim, dim, dtype, True, local_rank, world_size
+        )
+        self.w3 = ColumnParallelLinear(
+            dim, hidden_dim, dtype, False, local_rank, world_size
+        )
+
+    def forward(self, x):
+        # self.w2(F.silu(self.w1(x)) * self.w3(x))
+        with ark.PlanManager(
+            warp_range=[0, 8],
+            sram_range=[0, 49344],
+            sync=False,
+            config={
+                "NumWarps": 4,
+                "NumTasks": 688,
+            },
+        ):
+            with ark.PlanManager(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                x1 = self.w1(x)
+            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+                x1 = Silu()(x1)
+        with ark.PlanManager(
+            warp_range=[0, 8],
+            sram_range=[0, 49344],
+            sync=False,
+            config={
+                "NumWarps": 4,
+                "NumTasks": 688,
+            },
+        ):
+            with ark.PlanManager(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                x2 = self.w3(x)
+            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+                x3 = ark.mul(x1, x2)
+        x4 = self.w2(x3)
+        return x4
+
+
+def apply_rotary_emb(xq, xk, freqs_cis):
+    """
+    Apply rotary embeddings to xq and xk.
+    """
+    xq_out = ark.rope(xq, freqs_cis)
+    xk_out = ark.rope(xk, freqs_cis)
+    return xq_out, xk_out
+
+
+class Softmax(ark.Module):
+    def __init__(self):
+        super(Softmax, self).__init__()
+
+    def forward(self, input):
+        with ark.PlanManager(
+            warp_range=[0, 8],
+            sram_range=[0, 0],
+            sync=False,
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "NumTasks": 65536,
+            },
+        ):
+            with ark.PlanManager(config={"ImplType": "WarpWise"}):
+                max = ark.reduce_max(input, axis=-1)
+            with ark.PlanManager(config={"Tile": [1, 2048]}):
+                output = ark.sub(input, max)
+                output = ark.exp(output)
+            with ark.PlanManager(config={"ImplType": "WarpWise"}):
+                sum = ark.reduce_sum(output, axis=-1)
+            with ark.PlanManager(config={"Tile": [1, 2048]}):
+                output = ark.div(output, sum)
+            return output
+
+
+class Attention(ark.Module):
+    def __init__(
+        self,
+        args: ModelArgs,
+        dtype: ark.DataType = ark.fp16,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.n_kv_heads = (
+            args.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        )
+        model_parallel_size = world_size
+        self.dtype = dtype
+        self.n_local_heads = args.n_heads // model_parallel_size
+        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
+        self.n_rep = self.n_local_heads // self.n_local_kv_heads
+        self.head_dim = args.dim // args.n_heads
+        self.wq = ColumnParallelLinear(
+            args.dim,
+            args.n_heads * self.head_dim,
+            dtype,
+            False,
+            local_rank,
+            world_size,
+        )
+        self.wk = ColumnParallelLinear(
+            args.dim,
+            self.n_kv_heads * self.head_dim,
+            dtype,
+            False,
+            local_rank,
+            world_size,
+        )
+        self.wv = ColumnParallelLinear(
+            args.dim,
+            self.n_kv_heads * self.head_dim,
+            dtype,
+            False,
+            local_rank,
+            world_size,
+        )
+        self.wo = RowParallelLinear(
+            args.n_heads * self.head_dim,
+            args.dim,
+            dtype,
+            True,
+            local_rank,
+            world_size,
+        )
+
+    def forward(
+        self,
+        x: ark.Tensor,
+        start_pos: int,
+        freqs_cis: ark.Tensor,
+        mask: Optional[ark.Tensor],
+    ):
+        bsz, seqlen, _ = x.shape()
+
+        with ark.PlanManager(
+            warp_range=[0, 4],
+            sram_range=[0, 24672],
+            sync=False,
+            config={"NumWarps": 4, "NumTasks": 256},
+        ):
+            with ark.PlanManager(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                xq = self.wq(x)
+            xq = ark.reshape(
+                xq, [bsz, seqlen, self.n_local_heads, self.head_dim]
+            )
+            with ark.PlanManager(
+                config={"SramBytes": 0, "Tile": [256, 1, 128]}
+            ):
+                if freqs_cis is not None:
+                    xq = ark.rope(xq, freqs_cis)
+            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+                xq = ark.transpose(xq, [0, 2, 1, 3])
+
+        with ark.PlanManager(
+            warp_range=[0, 4],
+            sram_range=[0, 24672],
+            sync=False,
+            config={"NumWarps": 4, "NumTasks": 256},
+        ):
+            with ark.PlanManager(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                xk = self.wk(x)
+            xk = ark.reshape(
+                xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim]
+            )
+            with ark.PlanManager(
+                config={"SramBytes": 0, "Tile": [256, 1, 128]}
+            ):
+                if freqs_cis is not None:
+                    xk = ark.rope(xk, freqs_cis)
+            keys = xk
+            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+                keys = ark.transpose(keys, [0, 2, 1, 3])
+
+        with ark.PlanManager(
+            warp_range=[0, 4],
+            sram_range=[0, 24672],
+            sync=False,
+            config={
+                "NumWarps": 4,
+                "NumTasks": 256,
+                "SramBytes": 24672,
+                "TileShapeMNK": [256, 128, 32],
+            },
+        ):
+            with ark.PlanManager(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                xv = self.wv(x)
+            xv = ark.reshape(
+                xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim]
+            )
+            values = xv
+            with ark.PlanManager(
+                config={"SramBytes": 0, "Tile": [256, 1, 128]}
+            ):
+                values = ark.transpose(values, [0, 2, 1, 3])
+
+        with ark.PlanManager(
+            warp_range=[0, 8],
+            sram_range=[0, 49344],
+            sync=False,
+            config={
+                "NumWarps": 4,
+                "NumTasks": 4096,
+                "Granularity": 2,
+            },
+        ):
+            with ark.PlanManager(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                scores = ark.matmul(xq, keys, transpose_other=True)
+            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+                scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim))
+
+        if mask is not None:
+            scores = ark.add(scores, mask)
+
+        scores = Softmax()(scores)
+
+        with ark.PlanManager(
+            warp_range=[0, 4],
+            sram_range=[0, 24672],
+            sync=False,
+            config={
+                "NumWarps": 4,
+                "NumTasks": 256,
+            },
+        ):
+            with ark.PlanManager(
+                config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
+            ):
+                output = ark.matmul(scores, values)
+            with ark.PlanManager(
+                config={"SramBytes": 0, "Tile": [256, 1, 128]}
+            ):
+                output = ark.transpose(output, [0, 2, 1, 3])
+        output = ark.reshape(
+            output, [bsz, seqlen, self.head_dim * self.n_local_heads]
+        )
+        return self.wo(output)
+
+
+class TransformerBlock(ark.Module):
+    def __init__(
+        self,
+        layer_id: int,
+        args: ModelArgs,
+        dtype: ark.DataType = ark.fp16,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.n_heads = args.n_heads
+        self.dim = args.dim
+        self.head_dim = args.dim // args.n_heads
+        self.attention = Attention(args, dtype, local_rank, world_size)
+        self.feed_forward = FeedForward(
+            dim=args.dim,
+            hidden_dim=4 * args.dim,
+            multiple_of=args.multiple_of,
+            ffn_dim_multiplier=args.ffn_dim_multiplier,
+            dtype=dtype,
+            local_rank=local_rank,
+            world_size=world_size,
+        )
+        self.layer_id = layer_id
+        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype)
+        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype)
+
+    def forward(
+        self,
+        x: ark.Tensor,
+        start_pos: int,
+        freqs_cis: ark.Tensor,
+        mask: Optional[ark.Tensor],
+    ):
+        attention_norm_x = self.attention_norm(x)
+        h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask)
+        with ark.PlanManager(
+            warp_range=[0, 4],
+            config={
+                "NumWarps": 4,
+                "Tile": [256, 128],
+                "NumTasks": 256,
+                "SramBytes": 0,
+            },
+        ):
+            h = ark.add(x, h)
+        ff = self.feed_forward(self.ffn_norm(h))
+        with ark.PlanManager(
+            warp_range=[0, 4],
+            config={
+                "NumWarps": 4,
+                "Tile": [256, 128],
+                "NumTasks": 256,
+                "SramBytes": 0,
+            },
+        ):
+            out = ark.add(h, ff)
+        return out
+
+
+class Transformer(ark.Module):
+    def __init__(
+        self,
+        params: ModelArgs,
+        dtype: ark.DataType = ark.fp16,
+        local_rank: int = 0,
+        world_size: int = 1,
+    ):
+        super().__init__()
+        self.params = params
+        self.vocab_size = params.vocab_size
+        self.n_layers = params.n_layers
+
+        self.tok_embeddings = ParallelEmbedding(
+            params.vocab_size, params.dim, dtype, local_rank, world_size
+        )
+
+        self.layers = []
+        for layer_id in range(self.n_layers):
+            self.layers.append(
+                TransformerBlock(
+                    layer_id, params, dtype, local_rank, world_size
+                )
+            )
+            self.register_module(f"layers.{layer_id}", self.layers[layer_id])
+        self.norm = RMSNorm(params.dim, eps=params.norm_eps, dtype=dtype)
+        self.output = ColumnParallelLinear(
+            params.dim, params.vocab_size, dtype, True, local_rank, world_size
+        )
+
+    def forward(
+        self,
+        tokens: ark.Tensor,
+        start_pos: int,
+        freqs_cis: ark.Tensor,
+        mask: Optional[ark.Tensor],
+    ):
+        h = self.tok_embeddings(tokens)
+
+        for layer in self.layers:
+            h = layer(h, start_pos, freqs_cis, mask)
+        h = self.norm(h)
+        output = self.output(h)
+        return output
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py
index 19c680854..f559a826b 100644
--- a/examples/llama/model_test.py
+++ b/examples/llama/model_test.py
@@ -59,8 +59,10 @@ def run_ark(
     output = module(*module_inputs)
 
     with ark.Runtime() as rt:
-        plan = ark.Plan.from_file("plan_llama2_7b_b1_s2048.json")
-        rt.launch(plan)
+        plan = ark.DefaultPlanner().plan()
+        with open("plan.json", "w") as f:
+            f.write(str(plan))
+        rt.launch(plan=plan)
 
         # Load model parameters
         if state_dict:

From 78ac0dacb70e26ef5dc8704c0bb69c7c47240cbd Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 08:06:32 +0000
Subject: [PATCH 44/79] fix merge

---
 ark/include/ark/executor.hpp | 2 +-
 ark/ops/ops_test_common.cpp  | 2 +-
 ark/ops/ops_test_common.hpp  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index f0a108a1f..3744c33db 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -95,7 +95,7 @@ class DefaultExecutor : public Executor {
    public:
     DefaultExecutor(
         const Model &model, int device_id = -1, Stream stream = nullptr,
-        const std::vector<DefaultPlanner::ConfigRule> &config_rules = {},
+        const std::vector<Planner::ConfigRule> &config_rules = {},
         const std::string &name = "DefaultExecutor", bool loop_mode = true);
 };
 
diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp
index 2bd9ce2e7..4e94d06a7 100644
--- a/ark/ops/ops_test_common.cpp
+++ b/ark/ops/ops_test_common.cpp
@@ -35,7 +35,7 @@ OpsTestResult op_test(
     const std::string &test_name_prefix, const Model &model,
     const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
     OpsTestBaseline baseline, const std::vector<void *> &inputs_data,
-    const std::vector<DefaultPlanner::ConfigRule> &config_rules,
+    const std::vector<Planner::ConfigRule> &config_rules,
     bool print_on_error) {
     DefaultExecutor exe(model, -1, nullptr, config_rules);
     exe.compile();
diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp
index c5d640f3b..3848773e6 100644
--- a/ark/ops/ops_test_common.hpp
+++ b/ark/ops/ops_test_common.hpp
@@ -171,7 +171,7 @@ OpsTestResult op_test(
     const std::string &test_name_prefix, const Model &model,
     const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
     OpsTestBaseline baseline, const std::vector<void *> &inputs_data = {},
-    const std::vector<DefaultPlanner::ConfigRule> &config_rules = {},
+    const std::vector<Planner::ConfigRule> &config_rules = {},
     bool print_on_error = false);
 
 OpsTestGpuMem to_gpu(void *host_ptr, size_t size);

From afb518a7622363b000e9fc1d21c4cf8178c3461d Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 08:09:48 +0000
Subject: [PATCH 45/79] fix merge

---
 ark/api/executor.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 58d058d25..42ed45128 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -233,8 +233,6 @@ void Executor::Impl::init(const PlanJson &plan_json) {
     }
 
     auto gpu_manager = GpuManager::get_instance(device_id_);
-
-    auto gpu_manager = GpuManager::get_instance(gpu_id_);
     if (!gpu_manager->info().arch->belongs_to(
             Arch::from_name(plan_json.at("Architecture")))) {
         LOG(WARN, "Architecture name of the plan `",
@@ -779,7 +777,7 @@ void Executor::Impl::barrier() {
 uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const {
     size_t buffer_id = tensor.ref()->buffer()->id();
     if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
-        ERR(NotFoundError, "Invalid buffer ID: ", buffer_id);
+        ERR(InternalError, "Invalid buffer ID: ", buffer_id);
     }
     size_t offset = buffer_id_to_offset_.at(buffer_id);
     return reinterpret_cast<uintptr_t>(buffer_->ref(offset));

From 762bf4aa439510dbc04e4f9ee83da84c7a32a03a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 16:30:57 +0000
Subject: [PATCH 46/79] fix merge

---
 ark/ops/ops_all_reduce_test.cpp    | 15 +++++++--------
 ark/ops/ops_communication_test.cpp |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp
index 90814d036..8cf68b085 100644
--- a/ark/ops/ops_all_reduce_test.cpp
+++ b/ark/ops/ops_all_reduce_test.cpp
@@ -125,10 +125,9 @@ void test_all_reduce_packet_internal(ark::DimType nelem) {
 
             std::vector<ark::half_t> ones_vec(ones.shape().nelems(),
                                               ark::half_t(1.0f));
-            auto result =
-                ark::op_test("all_reduce_packet", m, {ones}, {output},
-                             baseline_all_reduce<ark::half_t, NumGpus>,
-                             {ones_vec.data()}, false, gpu_id, NumGpus);
+            auto result = ark::op_test(
+                "all_reduce_packet", m, {ones}, {output},
+                baseline_all_reduce<ark::half_t, NumGpus>, {ones_vec.data()});
             UNITTEST_LOG(result);
             UNITTEST_EQ(result.max_diff[0], 0.0f);
             return ark::unittest::SUCCESS;
@@ -232,10 +231,10 @@ void test_all_reduce_sm_internal(ark::DimType nelem) {
 
             std::vector<ark::half_t> ones_vec(ones.shape().nelems(),
                                               ark::half_t(1.0f));
-            auto result = ark::op_test(
-                "all_reduce_sm", m, {ones}, {output},
-                baseline_all_reduce<ark::half_t, NumGpus>, {ones_vec.data()},
-                false, gpu_id, NumGpus, config_rule);
+            auto result =
+                ark::op_test("all_reduce_sm", m, {ones}, {output},
+                             baseline_all_reduce<ark::half_t, NumGpus>,
+                             {ones_vec.data()}, {config_rule});
             UNITTEST_LOG(result);
             UNITTEST_EQ(result.max_diff[0], 0.0f);
             return ark::unittest::SUCCESS;
diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp
index db384c1f4..8cdad41b2 100644
--- a/ark/ops/ops_communication_test.cpp
+++ b/ark/ops/ops_communication_test.cpp
@@ -433,7 +433,7 @@ ark::unittest::State test_communication_send_recv_reduce() {
 
             ark::Planner planner(model, gpu_id);
             planner.install_config_rule(config_rule);
-            ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan());
+            ark::Executor exe(gpu_id, nullptr, "Executor", planner.plan());
             exe.compile();
 
             std::vector<ark::half_t> data(1024);

From f654f0b08d48931acd5645c16300c1a6f3ebe88e Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 16:34:21 +0000
Subject: [PATCH 47/79] add a python method

---
 python/executor_py.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index e782a99fe..a3f2a078b 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -63,6 +63,7 @@ void register_executor(py::module &m) {
         .def("barrier", &ark::Executor::barrier)
         .def("destroy", &ark::Executor::destroy)
         .def("destroyed", &ark::Executor::destroyed)
+        .def("tensor_address", &ark::Executor::tensor_address)
         .def("tensor_read",
              py::overload_cast<ark::Executor *, const ark::Tensor &, py::buffer,
                                uintptr_t>(&tensor_read),

From 498926c6242a35a38ffd6a8c406b4f3cf1ff84c6 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 16:35:28 +0000
Subject: [PATCH 48/79] submodule update

---
 third_party/mscclpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/mscclpp b/third_party/mscclpp
index cddffbc8b..40cb19655 160000
--- a/third_party/mscclpp
+++ b/third_party/mscclpp
@@ -1 +1 @@
-Subproject commit cddffbc8b6dfa6facf7c64c1b7d73acf30e600b3
+Subproject commit 40cb1965538ab98fea3cc9fe004f730e23e84829

From 3e331a2e2f5487502daccc32890ef49c5d86eb12 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 17:12:15 +0000
Subject: [PATCH 49/79] fix

---
 ark/model/model_json.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp
index b82f9e484..c2099e2c9 100644
--- a/ark/model/model_json.cpp
+++ b/ark/model/model_json.cpp
@@ -287,6 +287,7 @@ PlanJson::PlanJson(const Json &json)
     : Json((json != nullptr) ? json
                              : Json{{"Rank", 0},
                                     {"WorldSize", 1},
+                                    {"Architecture", "ANY"},
                                     {"NumProcessors", 1},
                                     {"NumWarpsPerProcessor", 1},
                                     {"TaskInfos", Json::array()},

From 10bfa75dbd40a96ffca69fb22e89127e1839b940 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 17:14:47 +0000
Subject: [PATCH 50/79] Rename CMake environments

---
 .github/workflows/codeql.yml  |  4 ++--
 .github/workflows/ut-cuda.yml |  2 +-
 CMakeLists.txt                | 32 ++++++++++++++++----------------
 ark/CMakeLists.txt            | 10 +++++-----
 pyproject.toml                |  2 +-
 third_party/CMakeLists.txt    |  9 +++++++--
 third_party/mscclpp           |  2 +-
 7 files changed, 33 insertions(+), 28 deletions(-)

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 0d7094c36..272cb8ebe 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -49,7 +49,7 @@ jobs:
     - name: Build
       run: |
         mkdir build && cd build
-        cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON -DBUILD_TESTS=OFF ..
+        cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF ..
         make -j build ark_py
 
     - name: Perform CodeQL Analysis
@@ -95,7 +95,7 @@ jobs:
     - name: Build
       run: |
         mkdir build && cd build
-        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DBUILD_TESTS=OFF ..
+        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF ..
         make -j build ark_py
 
     - name: Perform CodeQL Analysis
diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml
index 4e573adfb..c2e8e7c50 100644
--- a/.github/workflows/ut-cuda.yml
+++ b/.github/workflows/ut-cuda.yml
@@ -44,7 +44,7 @@ jobs:
       - name: Build
         run: |
           mkdir build && cd build
-          cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON ..
+          cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON ..
           make -j ut ark_py
 
       - name: Run C++ UT
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ee1e3566e..2e80ea1e8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,16 +13,16 @@ enable_language(CXX)
 
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 
-option(USE_CUDA "Use NVIDIA/CUDA." OFF)
-option(USE_ROCM "Use AMD/ROCm." OFF)
-option(BYPASS_GPU_CHECK "Bypass GPU check." OFF)
-option(BUILD_TESTS "Build unit tests." ON)
+option(ARK_USE_CUDA "Use NVIDIA/CUDA." OFF)
+option(ARK_USE_ROCM "Use AMD/ROCm." OFF)
+option(ARK_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
+option(ARK_BUILD_TESTS "Build unit tests." ON)
 
-if(BYPASS_GPU_CHECK)
-    if(USE_CUDA)
+if(ARK_BYPASS_GPU_CHECK)
+    if(ARK_USE_CUDA)
         message("Bypassing GPU check: using NVIDIA/CUDA.")
         find_package(CUDAToolkit REQUIRED)
-    elseif(USE_ROCM)
+    elseif(ARK_USE_ROCM)
         message("Bypassing GPU check: using AMD/ROCm.")
         set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}")
         find_package(hip REQUIRED)
@@ -35,16 +35,16 @@ else()
     include(CheckAmdGpu)
     if(NVIDIA_FOUND AND AMD_FOUND)
         message("Detected NVIDIA/CUDA and AMD/ROCm: prioritizing NVIDIA/CUDA.")
-        set(USE_CUDA ON)
-        set(USE_ROCM OFF)
+        set(ARK_USE_CUDA ON)
+        set(ARK_USE_ROCM OFF)
     elseif(NVIDIA_FOUND)
         message("Detected NVIDIA/CUDA.")
-        set(USE_CUDA ON)
-        set(USE_ROCM OFF)
+        set(ARK_USE_CUDA ON)
+        set(ARK_USE_ROCM OFF)
     elseif(AMD_FOUND)
         message("Detected AMD/ROCm.")
-        set(USE_CUDA OFF)
-        set(USE_ROCM ON)
+        set(ARK_USE_CUDA OFF)
+        set(ARK_USE_ROCM ON)
     else()
         message(FATAL_ERROR "Neither NVIDIA/CUDA nor AMD/ROCm is found.")
     endif()
@@ -53,7 +53,7 @@ endif()
 # Declare project
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-deprecated-declarations")
-if(USE_CUDA)
+if(ARK_USE_CUDA)
     set(CMAKE_CUDA_STANDARD 17)
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")
     project(ark LANGUAGES CXX CUDA)
@@ -72,7 +72,7 @@ if(USE_CUDA)
     if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12)
         set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90)
     endif()
-else()  # USE_ROCM
+else()  # ARK_USE_ROCM
     set(CMAKE_HIP_STANDARD 17)
     set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
     project(ark LANGUAGES CXX HIP)
@@ -145,7 +145,7 @@ add_custom_target(ut)
 # Details
 add_subdirectory(ark)
 
-if(BUILD_PYTHON)
+if(ARK_BUILD_PYTHON)
     # Install Python module
     add_subdirectory(python)
     add_dependencies(ark_py build)
diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt
index 4457d3c0b..208d9f9cb 100644
--- a/ark/CMakeLists.txt
+++ b/ark/CMakeLists.txt
@@ -6,7 +6,7 @@ file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cpp)
 file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cpp)
 list(REMOVE_ITEM SOURCES ${UT_SOURCES} ${UT_COMMON_SOURCES})
 
-if(USE_ROCM)
+if(ARK_USE_ROCM)
     file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu)
     set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
 endif()
@@ -23,7 +23,7 @@ target_include_directories(ark_obj SYSTEM PRIVATE
     ${NUMA_INCLUDE_DIRS}
 )
 
-if(USE_CUDA)
+if(ARK_USE_CUDA)
     list(APPEND COMMON_LIBS CUDA::cuda_driver)
     target_include_directories(ark_obj SYSTEM PRIVATE
         ${PROJECT_SOURCE_DIR}/third_party/cutlass/include
@@ -32,7 +32,7 @@ if(USE_CUDA)
     target_compile_definitions(ark_obj PUBLIC ARK_CUDA)
 endif()
 
-if(USE_ROCM)
+if(ARK_USE_ROCM)
     list(APPEND COMMON_LIBS hip::host)
     target_include_directories(ark_obj SYSTEM PRIVATE
         ${PROJECT_SOURCE_DIR}/third_party/cutlass/include
@@ -45,7 +45,7 @@ target_sources(ark_obj PRIVATE ${SOURCES})
 target_link_libraries(ark_obj PUBLIC mscclpp_static PRIVATE ${COMMON_LIBS})
 
 # ARK unit tests
-if(BUILD_TESTS)
+if(ARK_BUILD_TESTS)
     foreach(ut_source IN ITEMS ${UT_SOURCES})
         get_filename_component(exe_name ${ut_source} NAME_WE)
         add_executable(${exe_name} ${ut_source} ${UT_COMMON_SOURCES})
@@ -58,7 +58,7 @@ if(BUILD_TESTS)
             ${NUMA_INCLUDE_DIRS}
         )
 
-        if(USE_CUDA)
+        if(ARK_USE_CUDA)
             target_link_libraries(${exe_name} PRIVATE ark_obj ${COMMON_LIBS} CUDA::cudart CUDA::cublas)
             target_include_directories(${exe_name} SYSTEM PRIVATE
                 ${CUDAToolkit_INCLUDE_DIRS}
diff --git a/pyproject.toml b/pyproject.toml
index 1f9386c73..d9fb4502e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ install.strip = true
 build-dir = "build/{wheel_tag}"
 
 [tool.scikit-build.cmake.define]
-BUILD_PYTHON = "ON"
+ARK_BUILD_PYTHON = "ON"
 
 [tool.black]
 line-length = 80
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 12ae74298..96e442289 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -14,7 +14,12 @@ FetchContent_Declare(
     GIT_TAG v0.5.2
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
 )
+set(BUILD_TESTS OFF CACHE BOOL "" FORCE)
 set(BUILD_PYTHON_BINDINGS OFF CACHE BOOL "" FORCE)
+set(BUILD_APPS_NCCL OFF CACHE BOOL "" FORCE)
+set(USE_CUDA ${ARK_USE_CUDA} CACHE BOOL "" FORCE)
+set(USE_ROCM ${ARK_USE_ROCM} CACHE BOOL "" FORCE)
+set(BYPASS_GPU_CHECK ON CACHE BOOL "" FORCE)
 set(INSTALL_PREFIX "ark")
 FetchContent_GetProperties(mscclpp)
 if (NOT mscclpp_POPULATED)
@@ -35,7 +40,7 @@ if (NOT json_POPULATED)
 endif()
 set(JSON_INCLUDE_DIRS ${json_SOURCE_DIR}/include PARENT_SCOPE)
 
-if(USE_CUDA)
+if(ARK_USE_CUDA)
     # Configure CUTLASS
     FetchContent_Declare(
         cutlass
@@ -58,7 +63,7 @@ if(USE_CUDA)
 endif()
 
 
-if(USE_ROCM)
+if(ARK_USE_ROCM)
     # Configure CK
     FetchContent_Declare(
         ck
diff --git a/third_party/mscclpp b/third_party/mscclpp
index cddffbc8b..40cb19655 160000
--- a/third_party/mscclpp
+++ b/third_party/mscclpp
@@ -1 +1 @@
-Subproject commit cddffbc8b6dfa6facf7c64c1b7d73acf30e600b3
+Subproject commit 40cb1965538ab98fea3cc9fe004f730e23e84829

From 3dda44a8dc310560333de0cf9090d7da0013e21f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 18:15:09 +0000
Subject: [PATCH 51/79] A few fixes & improved coverage

---
 ark/api/executor.cpp         |  21 +++--
 ark/api/executor_test.cpp    | 150 +++++++++++++++++++++++++++++++++++
 ark/include/ark/executor.hpp |   2 +-
 python/executor_py.cpp       |   2 +-
 4 files changed, 161 insertions(+), 14 deletions(-)
 create mode 100644 ark/api/executor_test.cpp

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 42ed45128..16d369bc8 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -141,7 +141,7 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 class Executor::Impl {
    public:
     Impl(int device_id, Stream stream, const std::string &name, bool loop_mode);
-    ~Impl() = default;
+    ~Impl();
 
     void init(const PlanJson& plan);
 
@@ -152,7 +152,7 @@ class Executor::Impl {
     std::string plan() const { return plan_json_.dump_pretty(); }
 
     void compile();
-    void launch(int64_t max_spin_count);
+    void launch();
     void run(int iter);
     void wait(int64_t max_spin_count);
     float stop(int64_t max_spin_count);
@@ -219,6 +219,10 @@ Executor::Impl::Impl(int device_id, Stream stream, const std::string &name,
     }
 }
 
+Executor::Impl::~Impl() {
+    if (is_launched_) stop(-1);
+}
+
 void Executor::Impl::init(const PlanJson &plan_json) {
     plan_json_ = plan_json;
     rank_ = plan_json_["Rank"].get<int>();
@@ -620,13 +624,12 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
 
 void Executor::Impl::compile() { kernel_->compile(); }
 
-void Executor::Impl::launch(int64_t max_spin_count) {
+void Executor::Impl::launch() {
     if (!kernel_->is_compiled()) {
         ERR(InvalidUsageError, "Need to compile first before initialization.");
     }
     if (is_launched_) {
-        // Wait until previous works finish.
-        this->wait(max_spin_count);
+        LOG(WARN, "Ignore launching twice.");
         return;
     }
     auto get_global_rt = [&](const std::string &symbol) {
@@ -674,12 +677,6 @@ void Executor::Impl::launch(int64_t max_spin_count) {
     }
 
     elapsed_msec_ = -1;
-    if (!kernel_->is_compiled()) {
-        ERR(InvalidUsageError, "Need to compile first before initialization.");
-    } else if (is_launched_) {
-        LOG(WARN, "Ignore launching twice.");
-        return;
-    }
     timer_begin_->record(stream_raw_);
 
     if (world_size_ > 1) {
@@ -911,7 +908,7 @@ std::string Executor::plan() const { return impl_->plan(); }
 
 void Executor::compile() { impl_->compile(); }
 
-void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); }
+void Executor::launch() { impl_->launch(); }
 
 void Executor::run(int iter) { impl_->run(iter); }
 
diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
new file mode 100644
index 000000000..b0b398ac9
--- /dev/null
+++ b/ark/api/executor_test.cpp
@@ -0,0 +1,150 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/executor.hpp"
+
+#include "gpu/gpu.hpp"
+#include "model/model_json.hpp"
+#include "unittest/unittest_utils.h"
+
+template <bool LoopMode>
+ark::unittest::State test_executor() {
+    ark::gpuStream stream;
+    UNITTEST_EQ(
+        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
+        ark::gpuSuccess);
+
+    ark::Model empty;
+    {
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        UNITTEST_EQ(executor.device_id(), 0);
+        UNITTEST_EQ(executor.stream(), stream);
+
+        executor.compile();
+        executor.launch();
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+        executor.destroy();
+    }
+    {
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        executor.compile();
+        executor.launch();
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+
+        executor.launch();
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+
+        executor.destroy();
+    }
+    {
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        UNITTEST_THROW(executor.launch(), ark::InvalidUsageError);
+
+        executor.compile();
+        executor.launch();
+        executor.launch();  // Will be ignored with a warning.
+        executor.run(1);
+        executor.wait();
+        executor.wait();  // nothing to do
+
+        // Stop & destroy automatically.
+    }
+
+    UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);
+    return ark::unittest::SUCCESS;
+}
+
+ark::unittest::State test_executor_loop() { return test_executor<true>(); }
+
+ark::unittest::State test_executor_no_loop() { return test_executor<false>(); }
+
+ark::unittest::State test_executor_tensor_read_write() {
+    // Alloc CPU array
+    std::vector<float> host_data(1024);
+    void *host_ptr = host_data.data();
+    for (size_t i = 0; i < host_data.size(); ++i) {
+        host_data[i] = static_cast<float>(i);
+    }
+
+    // Alloc GPU array
+    void *dev_ptr;
+    UNITTEST_EQ(ark::gpuMalloc(&dev_ptr, 1024 * sizeof(float)),
+                ark::gpuSuccess);
+
+    // Create an ARK tensor
+    ark::Model m;
+    auto tensor = m.tensor({1024}, ark::FP32);
+    m.noop(tensor);
+
+    ark::DefaultExecutor executor(m, 0);
+    executor.compile();
+    executor.launch();
+
+    // Copy data from CPU array to ARK tensor
+    executor.tensor_write(tensor, host_ptr, 1024 * sizeof(float));
+
+    // Copy data from ARK tensor to GPU array
+    executor.tensor_read(tensor, dev_ptr, 1024 * sizeof(float), nullptr, true);
+
+    // Check the data
+    std::vector<float> dev_data(1024);
+    executor.tensor_read(tensor, dev_data.data(), 1024 * sizeof(float));
+    for (size_t i = 0; i < dev_data.size(); ++i) {
+        UNITTEST_EQ(dev_data[i], static_cast<float>(i));
+        dev_data[i] = -1;
+    }
+
+    UNITTEST_EQ(ark::gpuMemcpy(dev_data.data(), dev_ptr, 1024 * sizeof(float),
+                               ark::gpuMemcpyDeviceToHost),
+                ark::gpuSuccess);
+    for (size_t i = 0; i < dev_data.size(); ++i) {
+        UNITTEST_EQ(dev_data[i], static_cast<float>(i));
+        dev_data[i] = -1;
+    }
+
+    // Copy -1s back to GPU array
+    UNITTEST_EQ(ark::gpuMemcpy(dev_ptr, dev_data.data(), 1024 * sizeof(float),
+                               ark::gpuMemcpyHostToDevice),
+                ark::gpuSuccess);
+
+    // Copy data from GPU array to ARK tensor
+    executor.tensor_write(tensor, dev_ptr, 1024 * sizeof(float), nullptr, true);
+
+    // Copy data from ARK tensor to CPU array
+    executor.tensor_read(tensor, host_ptr, 1024 * sizeof(float));
+
+    // Check the data
+    for (size_t i = 0; i < host_data.size(); ++i) {
+        UNITTEST_EQ(host_data[i], -1);
+    }
+
+    return ark::unittest::SUCCESS;
+}
+
+ark::unittest::State test_executor_invalid() {
+    // Invalid device ID.
+    UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""),
+                   ark::InvalidUsageError);
+
+    // Invalid rank.
+    ark::PlanJson plan;
+    plan["Rank"] = 1;
+    UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan.dump(), true),
+                   ark::InvalidUsageError);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_executor_loop);
+    UNITTEST(test_executor_no_loop);
+    UNITTEST(test_executor_tensor_read_write);
+    UNITTEST(test_executor_invalid);
+    return 0;
+}
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 3744c33db..7f30f39ed 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -39,7 +39,7 @@ class Executor {
 
     /// Launch the model (not running yet). This must be called after
     /// `compile()`.
-    void launch(int64_t max_spin_count = -1);
+    void launch();
 
     /// Run the model for `iter` iterations.
     void run(int iter);
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index a3f2a078b..36e1c435e 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -56,7 +56,7 @@ void register_executor(py::module &m) {
              })
         .def("plan", &ark::Executor::plan)
         .def("compile", &ark::Executor::compile)
-        .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1)
+        .def("launch", &ark::Executor::launch)
         .def("run", &ark::Executor::run, py::arg("iter"))
         .def("wait", &ark::Executor::wait, py::arg("max_spin_count") = -1)
         .def("stop", &ark::Executor::stop, py::arg("max_spin_count") = -1)

From 4971601b09880e29adc85ab305a739edf55ccbb0 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 19:03:08 +0000
Subject: [PATCH 52/79] fix merge

---
 ark/api/context_manager.cpp                | 42 ----------
 ark/api/context_manager_test.cpp           | 53 ------------
 ark/api/executor.cpp                       |  8 --
 ark/api/model.cpp                          |  2 +-
 ark/api/model_graph.cpp                    |  4 +-
 ark/api/plan_manager.cpp                   | 97 ----------------------
 ark/api/plan_manager_test.cpp              | 58 -------------
 ark/codegen.cpp                            |  1 -
 ark/include/ark/context_manager.hpp        | 24 ------
 ark/include/ark/error.hpp                  | 15 +++-
 ark/include/ark/model.hpp                  | 57 +++++--------
 ark/include/ark/model_graph.hpp            |  2 +-
 ark/include/ark/plan_manager.hpp           | 25 ------
 ark/model/model_graph_impl.cpp             |  6 +-
 ark/model/model_graph_impl.hpp             |  8 +-
 ark/model/model_op.cpp                     | 11 ---
 ark/model/model_op.hpp                     |  9 +-
 ark/ops/ops_arithmetic.cpp                 | 20 ++---
 ark/ops/ops_arithmetic_test.cpp            | 48 ++++-------
 ark/ops/ops_cast.cpp                       | 10 +--
 ark/ops/ops_communication.cpp              | 14 ++--
 ark/ops/ops_copy.cpp                       |  5 +-
 ark/ops/ops_embedding.cpp                  |  4 +-
 ark/ops/ops_identity.cpp                   |  2 +-
 ark/ops/ops_math.cpp                       | 31 +++----
 ark/ops/ops_matmul.cpp                     |  6 +-
 ark/ops/ops_noop.cpp                       |  2 +-
 ark/ops/ops_reduce.cpp                     | 12 +--
 ark/ops/ops_refer.cpp                      |  2 +-
 ark/ops/ops_reshape.cpp                    |  4 +-
 ark/ops/ops_rope.cpp                       |  5 +-
 ark/ops/ops_scalar.cpp                     | 31 +++----
 ark/ops/ops_transpose.cpp                  |  5 +-
 examples/llama/model_7b_b1_s2048.py        | 70 ++++++++--------
 examples/tutorial/plan_manager_tutorial.py | 81 ------------------
 python/ark/plan_manager.py                 | 34 --------
 python/ark/runtime.py                      |  1 +
 python/model_py.cpp                        | 79 ++++++++----------
 python/plan_manager_py.cpp                 | 15 ----
 39 files changed, 195 insertions(+), 708 deletions(-)
 delete mode 100644 ark/api/context_manager.cpp
 delete mode 100644 ark/api/context_manager_test.cpp
 delete mode 100644 ark/api/plan_manager.cpp
 delete mode 100644 ark/api/plan_manager_test.cpp
 delete mode 100644 ark/include/ark/context_manager.hpp
 delete mode 100644 ark/include/ark/plan_manager.hpp
 delete mode 100644 examples/tutorial/plan_manager_tutorial.py
 delete mode 100644 python/ark/plan_manager.py
 delete mode 100644 python/plan_manager_py.cpp

diff --git a/ark/api/context_manager.cpp b/ark/api/context_manager.cpp
deleted file mode 100644
index 6d16d9e79..000000000
--- a/ark/api/context_manager.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include "ark/context_manager.hpp"
-
-#include "model/model_graph_impl.hpp"
-
-namespace ark {
-
-class ContextManager::Impl {
-   public:
-    Impl(std::shared_ptr<ModelGraphContextStack> context_stack,
-         const std::map<std::string, std::string>& context_map);
-
-    ~Impl();
-
-   private:
-    std::shared_ptr<ModelGraphContextStack> context_stack_;
-    std::vector<std::string> keys_;
-};
-
-ContextManager::Impl::Impl(
-    std::shared_ptr<ModelGraphContextStack> context_stack,
-    const std::map<std::string, std::string>& context_map)
-    : context_stack_(context_stack) {
-    for (const auto& [key, value] : context_map) {
-        context_stack_->push(key, value);
-        keys_.push_back(key);
-    }
-}
-
-ContextManager::Impl::~Impl() {
-    for (auto it = keys_.rbegin(); it != keys_.rend(); ++it) {
-        context_stack_->pop(*it);
-    }
-}
-
-ContextManager::ContextManager(
-    Model& model, const std::map<std::string, std::string>& context_map)
-    : impl_(std::make_shared<Impl>(model.impl_->context_stack_, context_map)) {}
-
-}  // namespace ark
diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp
deleted file mode 100644
index 5fff94f34..000000000
--- a/ark/api/context_manager_test.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include "ark/context_manager.hpp"
-
-#include "model/model_node.hpp"
-#include "unittest/unittest_utils.h"
-
-ark::unittest::State test_context_manager() {
-    ark::Model model;
-    ark::Tensor t0 = model.tensor({1}, ark::FP32);
-    ark::Tensor t1 = model.tensor({1}, ark::FP32);
-    ark::Tensor t2 = model.add(t0, t1);
-
-    ark::Tensor t3;
-    ark::Tensor t4;
-    ark::Tensor t5;
-    {
-        ark::ContextManager cm0_1(model, {{"key0", "val1"}});
-        t3 = model.relu(t2);
-
-        ark::ContextManager cm1_1(model, {{"key1", "val2"}});
-        t4 = model.sqrt(t3);
-    }
-    {
-        ark::ContextManager cm0_2(model, {{"key0", "val3"}});
-        t5 = model.exp(t2);
-    }
-
-    UNITTEST_TRUE(model.verify());
-
-    auto compressed = model.compress(false);
-    UNITTEST_TRUE(compressed.verify());
-
-    auto nodes = compressed.nodes();
-    UNITTEST_EQ(nodes.size(), 4);
-
-    UNITTEST_EQ(nodes[0]->context.size(), 0);
-    UNITTEST_EQ(nodes[1]->context.size(), 1);
-    UNITTEST_EQ(nodes[1]->context.at("key0"), "val1");
-    UNITTEST_EQ(nodes[2]->context.size(), 2);
-    UNITTEST_EQ(nodes[2]->context.at("key0"), "val1");
-    UNITTEST_EQ(nodes[2]->context.at("key1"), "val2");
-    UNITTEST_EQ(nodes[3]->context.size(), 1);
-    UNITTEST_EQ(nodes[3]->context.at("key0"), "val3");
-
-    return ark::unittest::SUCCESS;
-}
-
-int main() {
-    UNITTEST(test_context_manager);
-    return 0;
-}
diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 6fb2b5f2e..17d579763 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -250,14 +250,6 @@ void Executor::Impl::init(const PlanJson &plan_json) {
             gpu_manager->info().arch->name(), "`.");
     }
 
-    if (!gpu_manager->info().arch->belongs_to(
-            Arch::from_name(plan_json_.at("Architecture")))) {
-        LOG(WARN, "Architecture name of the plan `",
-            plan_json_.at("Architecture").get<std::string>(),
-            "` is not compatible with the GPU architecture `",
-            gpu_manager->info().arch->name(), "`.");
-    }
-
     buffer_id_to_offset_ = init_buffers(plan_json_);
 
     std::string buffer_id_to_offset_str;
diff --git a/ark/api/model.cpp b/ark/api/model.cpp
index 8227ea848..dcbd4940e 100644
--- a/ark/api/model.cpp
+++ b/ark/api/model.cpp
@@ -20,7 +20,7 @@ size_t Model::id() const { return id_; }
 
 Model Model::compress() const {
     Model model(*this);
-    model.compress_nodes(merge_nodes);
+    model.compress_nodes();
     return model;
 }
 
diff --git a/ark/api/model_graph.cpp b/ark/api/model_graph.cpp
index a4477b8e6..e07565141 100644
--- a/ark/api/model_graph.cpp
+++ b/ark/api/model_graph.cpp
@@ -33,9 +33,7 @@ int ModelGraph::rank() const { return impl_->rank(); }
 
 int ModelGraph::world_size() const { return impl_->world_size(); }
 
-void ModelGraph::compress_nodes(bool merge_nodes) {
-    impl_->compress_nodes(merge_nodes);
-}
+void ModelGraph::compress_nodes() { impl_->compress_nodes(); }
 
 bool ModelGraph::compressed() const { return impl_->compressed(); }
 
diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp
deleted file mode 100644
index 8cb1940b1..000000000
--- a/ark/api/plan_manager.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include "ark/plan_manager.hpp"
-
-#include "logging.h"
-#include "model/model_json.hpp"
-#include "model/model_graph_impl.hpp"
-
-namespace ark {
-
-class PlanManagerState {
-   public:
-    PlanManagerState() : sync(true) {}
-    bool sync;
-};
-
-static std::map<size_t, PlanManagerState> gPlanManagerStates;
-
-PlanManager::PlanManager(Model& model, const std::string& plan_context)
-    : model_id_(model.id()), stop_sync_(false) {
-    static int task_group_id = 0;
-    auto ctx = Json::parse(plan_context);
-    if (!ctx.is_object()) {
-        ERR(ModelError, "plan context must be a JSON object");
-    }
-    if (gPlanManagerStates.find(model_id_) == gPlanManagerStates.end()) {
-        gPlanManagerStates.emplace(model_id_, PlanManagerState());
-    }
-    auto& state = gPlanManagerStates[model_id_];
-    bool async = !state.sync;
-    std::map<std::string, std::string> context_map;
-    for (const auto& [key, value] : ctx.items()) {
-        if (key == "sync") {
-            if (!value.is_boolean()) {
-                ERR(ModelError, "sync must be a boolean");
-            }
-            if (state.sync && !value.get<bool>()) {
-                stop_sync_ = true;
-                state.sync = false;
-                context_map["TaskGroupId"] = std::to_string(task_group_id++);
-            }
-        } else if (key == "processor_range") {
-            if (!value.is_array()) {
-                ERR(ModelError, "processor_range must be an array");
-            }
-            if (async) {
-                LOG(WARN, "Ignoring processor_range under sync=false context");
-                continue;
-            }
-            context_map["ProcessorRange"] = value.dump();
-        } else if (key == "warp_range") {
-            if (!value.is_array()) {
-                ERR(ModelError, "warp_range must be an array");
-            }
-            if (async) {
-                LOG(WARN, "Ignoring warp_range under sync=false context");
-                continue;
-            }
-            context_map["WarpRange"] = value.dump();
-        } else if (key == "sram_range") {
-            if (!value.is_array()) {
-                ERR(ModelError, "sram_range must be an array");
-            }
-            if (async) {
-                LOG(WARN, "Ignoring sram_range under sync=false context");
-                continue;
-            }
-            context_map["SramRange"] = value.dump();
-        } else if (key == "config") {
-            if (!value.is_object()) {
-                ERR(ModelError, "config must be an object");
-            }
-            auto cfg = model.impl_->get_context("Config");
-            if (cfg.empty()) {
-                context_map["Config"] = value.dump();
-            } else {
-                auto cfg_obj = Json::parse(cfg);
-                for (const auto& [k, v] : value.items()) {
-                    cfg_obj[k] = v;
-                }
-                context_map["Config"] = cfg_obj.dump();
-            }
-        } else {
-            LOG(WARN, "Ignoring unknown plan context key: ", key);
-        }
-    }
-    context_manager_ = std::make_shared<ContextManager>(model, context_map);
-}
-
-PlanManager::~PlanManager() {
-    if (stop_sync_) {
-        gPlanManagerStates[model_id_].sync = true;
-    }
-}
-
-}  // namespace ark
diff --git a/ark/api/plan_manager_test.cpp b/ark/api/plan_manager_test.cpp
deleted file mode 100644
index 78f5d4cb8..000000000
--- a/ark/api/plan_manager_test.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include "ark/plan_manager.hpp"
-#include "ark/planner.hpp"
-
-#include "model/model_json.hpp"
-#include "unittest/unittest_utils.h"
-
-ark::unittest::State test_plan_manager() {
-    ark::Model model;
-    ark::Tensor t0 = model.tensor({1}, ark::FP32);
-    ark::Tensor t1 = model.tensor({1}, ark::FP32);
-    ark::Tensor t2 = model.add(t0, t1);
-
-    ark::Tensor t3;
-    ark::Tensor t4;
-    ark::Tensor t5;
-    ark::Tensor t6;
-    {
-        ark::PlanManager pm_0(model, ark::Json({
-            {"processor_range", {0, 2}},
-            {"warp_range", {0, 4}},
-            {"sram_range", {0, 0}},
-            {"sync", false}
-        }).dump());
-        t3 = model.relu(t2);
-        t4 = model.sqrt(t3);
-    }
-    {
-        ark::PlanManager pm_0(model, ark::Json({
-            {"processor_range", {2, 4}},
-            {"warp_range", {0, 4}},
-            {"sram_range", {0, 0}}
-        }).dump());
-        t5 = model.exp(t2);
-
-        ark::PlanManager pm_1(model, ark::Json({
-            {"processor_range", {2, 3}}
-        }).dump());
-        t6 = model.rsqrt(t5);
-    }
-
-    UNITTEST_TRUE(model.verify());
-
-    ark::DefaultPlanner planner(model, 0);
-    auto plan_str = planner.plan();
-    ark::Json plan = ark::Json::parse(plan_str);
-
-    UNITTEST_LOG(plan_str);
-
-    return ark::unittest::SUCCESS;
-}
-
-int main() {
-    UNITTEST(test_plan_manager);
-    return 0;
-}
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index bc43584cb..1619b863f 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -87,7 +87,6 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
     num_warps_per_proc_ = plan.at("NumWarpsPerProcessor");
 
     std::stringstream definitions_ss;
-
     for (auto &task_json : plan.at("TaskInfos")) {
         definitions_ss << this->def_task(task_json);
     }
diff --git a/ark/include/ark/context_manager.hpp b/ark/include/ark/context_manager.hpp
deleted file mode 100644
index 58271ea8c..000000000
--- a/ark/include/ark/context_manager.hpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef ARK_CONTEXT_MANAGER_HPP
-#define ARK_CONTEXT_MANAGER_HPP
-
-#include <ark/model.hpp>
-#include <map>
-
-namespace ark {
-
-class ContextManager {
-   public:
-    ContextManager(Model& model,
-                   const std::map<std::string, std::string>& context_map);
-
-   private:
-    class Impl;
-    std::shared_ptr<Impl> impl_;
-};
-
-}  // namespace ark
-
-#endif  // ARK_CONTEXT_MANAGER_HPP
diff --git a/ark/include/ark/error.hpp b/ark/include/ark/error.hpp
index 78d02cab3..965b1c0bc 100644
--- a/ark/include/ark/error.hpp
+++ b/ark/include/ark/error.hpp
@@ -9,6 +9,7 @@
 
 namespace ark {
 
+/// Base class for all ARK errors.
 class BaseError : public std::exception {
    private:
     std::string msg_;
@@ -24,15 +25,21 @@ class BaseError : public std::exception {
         _name(const std::string &msg) : BaseError(msg) {} \
     };
 
+/// Internal error in ARK, likely a bug.
 REGISTER_ERROR_TYPE(InternalError)
+/// Invalid usage of ARK API.
 REGISTER_ERROR_TYPE(InvalidUsageError)
-REGISTER_ERROR_TYPE(NotFoundError)
+/// Invalid ARK model definition or usage.
 REGISTER_ERROR_TYPE(ModelError)
-REGISTER_ERROR_TYPE(SchedulerError)
-REGISTER_ERROR_TYPE(ExecutorError)
+/// Invalid ARK plan definition or usage.
+REGISTER_ERROR_TYPE(PlanError)
+/// Unsupported feature triggered.
+REGISTER_ERROR_TYPE(UnsupportedError)
+/// Error from invalid system state such as a system call failure.
 REGISTER_ERROR_TYPE(SystemError)
+/// Error from a CUDA/HIP API call.
 REGISTER_ERROR_TYPE(GpuError)
-REGISTER_ERROR_TYPE(RuntimeError)
+/// Error from a unit test.
 REGISTER_ERROR_TYPE(UnitTestError)
 
 }  // namespace ark
diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp
index cbbff7f95..3c4f22e22 100644
--- a/ark/include/ark/model.hpp
+++ b/ark/include/ark/model.hpp
@@ -103,29 +103,23 @@ class Model : public ModelGraph {
     // result in `output`.
     // Currently, only reduction along the last dimension is supported.
     Tensor reduce_sum(Tensor input, int axis, bool keepdims = true,
-                      Tensor output = NullTensor,
-                      const std::string &config = "",
-                      const std::string &name = "");
+                      Tensor output = NullTensor, const std::string &name = "");
     Tensor reduce_mean(Tensor input, int axis, bool keepdims = true,
                        Tensor output = NullTensor,
-                       const std::string &config = "",
                        const std::string &name = "");
     Tensor reduce_max(Tensor input, int axis, bool keepdims = true,
-                      Tensor output = NullTensor,
-                      const std::string &config = "",
-                      const std::string &name = "");
+                      Tensor output = NullTensor, const std::string &name = "");
 
     // Transposes the `input` tensor according to the given `permutation`.
     // For example, transpose(input, {0, 1 ,3, 2}) will swap the last two
     // dimensions of the input tensor. Currently, only 4D tensors are supported.
     Tensor transpose(Tensor input, const std::vector<int64_t> &permutation,
-                     Tensor output = NullTensor, const std::string &config = "",
-                     const std::string &name = "");
+                     Tensor output = NullTensor, const std::string &name = "");
     // Performs matrix multiplication between the `input` tensor and another
     // `other` tensor, storing the result in `output`.
     Tensor matmul(Tensor input, Tensor other, Tensor output = NullTensor,
                   bool trans_input = false, bool trans_other = false,
-                  const std::string &config = "", const std::string &name = "");
+                  const std::string &name = "");
     // Implements the 'im2col' method for 2D convolution layers, which takes an
     // `input` tensor and reshapes it to a 2D matrix by extracting image patches
     // from the input tensor based on the provided parameters.
@@ -142,66 +136,63 @@ class Model : public ModelGraph {
                     Tensor output = NullTensor, const std::string &name = "");
     // Calculates the exponential of the `input` tensor, element-wise.
     Tensor exp(Tensor input, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
     // Calculates the square root of the `input` tensor, element-wise.
     Tensor sqrt(Tensor input, Tensor output = NullTensor,
-                const std::string &config = "", const std::string &name = "");
+                const std::string &name = "");
     // Calculates the reverse square root of the `input` tensor, element-wise.
     Tensor rsqrt(Tensor input, Tensor output = NullTensor,
-                 const std::string &config = "", const std::string &name = "");
+                 const std::string &name = "");
     // ReLU activation
     Tensor relu(Tensor input, Tensor output = NullTensor,
-                const std::string &config = "", const std::string &name = "");
+                const std::string &name = "");
     // Copy the `input` tensor to `output` tensor
     Tensor copy(Tensor input, Tensor output = NullTensor,
-                const std::string &config = "", const std::string &name = "");
+                const std::string &name = "");
     Tensor copy(float val, Tensor output = NullTensor,
-                const std::string &config = "", const std::string &name = "");
+                const std::string &name = "");
     // Applies the Gaussian Error Linear Unit (GELU) activation function to the
     // `input` tensor, element-wise. GELU is a smooth approximation of the
     // rectifier function and is widely used in deep learning models.
     Tensor gelu(Tensor input, Tensor output = NullTensor,
-                const std::string &config = "", const std::string &name = "");
+                const std::string &name = "");
     // Sigmoid activation
     Tensor sigmoid(Tensor input, Tensor output = NullTensor,
-                   const std::string &config = "",
                    const std::string &name = "");
     // Performs rotary position embedding (RoPE) on the `input` tensor
     Tensor rope(Tensor input, Tensor other, Tensor output = NullTensor,
-                const std::string &config = "", const std::string &name = "");
+                const std::string &name = "");
 
     // Performs an element-wise addition operator between the `input` tensor
     // and the `other` tensor
     Tensor add(Tensor input, Tensor other, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
     Tensor add(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
     // Performs an element-wise subtraction operator between the `input` tensor
     // and the `other` tensor
     Tensor sub(Tensor input, Tensor other, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
     Tensor sub(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
     // Performs an element-wise multiplication operator between the `input`
     // tensor and the `other` tensor,
     Tensor mul(Tensor input, Tensor other, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
     Tensor mul(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
     // Performs an element-wise division operator between the `input`
     // tensor and the `other` tensor,
     Tensor div(Tensor input, Tensor other, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
     Tensor div(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &config = "", const std::string &name = "");
+               const std::string &name = "");
 
     Tensor send(Tensor input, int remote_rank, int tag,
-                Tensor output = NullTensor, const std::string &config = "",
-                const std::string &name = "");
+                Tensor output = NullTensor, const std::string &name = "");
     // Blocks the execution until the corresponding 'send' operator with the
     // specified `id` is completed.
-    Tensor send_done(Tensor input, const std::string &config = "",
-                     const std::string &name = "");
+    Tensor send_done(Tensor input, const std::string &name = "");
     // Receives a tensor from a source rank (@p src_rank), identified by the
     // `id` parameter. Blocks the execution until the corresponding 'recv'
     // operator is completed.
@@ -238,12 +229,10 @@ class Model : public ModelGraph {
                                    const std::string &name = "");
     /// Embedding layer.
     Tensor embedding(Tensor input, Tensor weight, Tensor output = NullTensor,
-                     const std::string &config = "",
                      const std::string &name = "");
     /// Tensor type casting.
     Tensor cast(Tensor input, const DataType &data_type,
-                Tensor output = NullTensor, const std::string &config = "",
-                const std::string &name = "");
+                Tensor output = NullTensor, const std::string &name = "");
 
     // sync across multi devices
     Tensor device_sync(Tensor input, int rank, int rank_num,
diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp
index 598bf343a..29074630c 100644
--- a/ark/include/ark/model_graph.hpp
+++ b/ark/include/ark/model_graph.hpp
@@ -25,7 +25,7 @@ class ModelGraph {
 
     int world_size() const;
 
-    void compress_nodes(bool merge_nodes = false);
+    void compress_nodes();
 
     bool compressed() const;
 
diff --git a/ark/include/ark/plan_manager.hpp b/ark/include/ark/plan_manager.hpp
deleted file mode 100644
index 3952a1c06..000000000
--- a/ark/include/ark/plan_manager.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef ARK_PLAN_MANAGER_HPP
-#define ARK_PLAN_MANAGER_HPP
-
-#include <ark/context_manager.hpp>
-
-namespace ark {
-
-class PlanManager {
-   public:
-    PlanManager(Model& model, const std::string& plan_context);
-
-    ~PlanManager();
-
-   private:
-    size_t model_id_;
-    bool stop_sync_;
-    std::shared_ptr<ContextManager> context_manager_;
-};
-
-}  // namespace ark
-
-#endif  // ARK_PLAN_MANAGER_HPP
diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp
index 81359439a..7c1ea3fb5 100644
--- a/ark/model/model_graph_impl.cpp
+++ b/ark/model/model_graph_impl.cpp
@@ -112,7 +112,7 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) {
     return *this;
 }
 
-void ModelGraph::Impl::compress_nodes(bool merge_nodes) {
+void ModelGraph::Impl::compress_nodes() {
     if (!compressed_) {
         this->recursive_remove_virtual_nodes();
         compressed_ = true;
@@ -178,10 +178,6 @@ bool ModelGraph::Impl::verify() const {
     return true;
 }
 
-std::string ModelGraph::Impl::get_context(const std::string &key) const {
-    return context_stack_->get_context(key);
-}
-
 ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) {
     for (auto &tns : op->input_tensors()) {
         if (tensor_to_producer_op_.find(tns) == tensor_to_producer_op_.end()) {
diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp
index c7080ab73..62944f999 100644
--- a/ark/model/model_graph_impl.hpp
+++ b/ark/model/model_graph_impl.hpp
@@ -54,8 +54,7 @@ class ModelGraph::Impl {
     Impl &operator=(const Impl &other);
 
     template <typename T, typename... Args>
-    ModelOpRef create_op(const std::string &config, const std::string &name,
-                         Args &&...args) {
+    ModelOpRef create_op(const std::string &name, Args &&... args) {
         ModelOpRef op = std::make_shared<T>(std::forward<Args>(args)...);
         std::string name_copy;
         if (name.empty()) {
@@ -68,7 +67,6 @@ class ModelGraph::Impl {
         if (count > 0) {
             name_copy += "_" + std::to_string(count);
         }
-        op->set_config(config);
         op->set_name(name_copy);
         add_op(op);
         return op;
@@ -78,14 +76,12 @@ class ModelGraph::Impl {
 
     int world_size() const { return world_size_; }
 
-    void compress_nodes(bool merge_nodes = false);
+    void compress_nodes();
 
     bool compressed() const { return compressed_; }
 
     bool verify() const;
 
-    std::string get_context(const std::string &key) const;
-
     std::string serialize(bool pretty = true) const;
 
     std::vector<ModelNodeRef> nodes() const;
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index dc4906235..5db8576e8 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -92,14 +92,6 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) {
     return it->second;
 }
 
-void ModelOp::set_config(const std::string &config) {
-    if (!config.empty()) {
-        config_ = Json::parse(config);
-    } else {
-        config_.clear();
-    }
-}
-
 std::vector<ModelTensorRef> ModelOp::input_tensors() const {
     // input_tensors = read_tensors || write_tensors
     std::set<ModelTensorRef> input_tensors;
@@ -192,9 +184,6 @@ Json ModelOp::serialize() const {
     for (auto &arg : args_) {
         j["Args"][arg.first] = arg.second.serialize();
     }
-    if (!config_.empty()) {
-        j["Config"] = config_;
-    }
     return j;
 }
 
diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp
index d048375c2..f7323d6c0 100644
--- a/ark/model/model_op.hpp
+++ b/ark/model/model_op.hpp
@@ -50,8 +50,8 @@ class ModelOp {
         return "";
     }
 
-    virtual std::vector<ModelOpArg> impl_args(
-        [[maybe_unused]] const Json &config) const {
+    virtual std::vector<ModelOpArg> impl_args([
+        [maybe_unused]] const Json &config) const {
         return {};
     }
 
@@ -60,14 +60,10 @@ class ModelOp {
         return {{"NumTasks", 0}, {"NumWarps", 0}, {"SramBytes", 0}};
     }
 
-    void set_config(const std::string &config);
-
     void set_name(const std::string &name) { name_ = name; }
 
     ModelOpType type() const { return type_; }
 
-    const Json &config() const { return config_; }
-
     const std::string &name() const { return name_; }
 
     bool is_virtual() const { return is_virtual_; }
@@ -104,7 +100,6 @@ class ModelOp {
         const std::vector<std::string> &template_args = {});
 
     ModelOpType type_;
-    Json config_;
     std::string name_;
     bool is_virtual_;
     std::vector<ModelTensorRef> read_tensors_;
diff --git a/ark/ops/ops_arithmetic.cpp b/ark/ops/ops_arithmetic.cpp
index ef85b5d22..aeece0d77 100644
--- a/ark/ops/ops_arithmetic.cpp
+++ b/ark/ops/ops_arithmetic.cpp
@@ -12,10 +12,9 @@ ModelOpAdd::ModelOpAdd(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Add", input, other, output) {}
 
 Tensor Model::add(Tensor input, Tensor other, Tensor output,
-                  const std::string &config, const std::string &name) {
+                  const std::string &name) {
     return impl_
-        ->create_op<ModelOpAdd>(config, name, input.ref_, other.ref_,
-                                output.ref_)
+        ->create_op<ModelOpAdd>(name, input.ref_, other.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
@@ -24,10 +23,9 @@ ModelOpMul::ModelOpMul(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Mul", input, other, output) {}
 
 Tensor Model::mul(Tensor input, Tensor other, Tensor output,
-                  const std::string &config, const std::string &name) {
+                  const std::string &name) {
     return impl_
-        ->create_op<ModelOpMul>(config, name, input.ref_, other.ref_,
-                                output.ref_)
+        ->create_op<ModelOpMul>(name, input.ref_, other.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
@@ -36,10 +34,9 @@ ModelOpSub::ModelOpSub(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Sub", input, other, output) {}
 
 Tensor Model::sub(Tensor input, Tensor other, Tensor output,
-                  const std::string &config, const std::string &name) {
+                  const std::string &name) {
     return impl_
-        ->create_op<ModelOpSub>(config, name, input.ref_, other.ref_,
-                                output.ref_)
+        ->create_op<ModelOpSub>(name, input.ref_, other.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
@@ -48,10 +45,9 @@ ModelOpDiv::ModelOpDiv(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Div", input, other, output) {}
 
 Tensor Model::div(Tensor input, Tensor other, Tensor output,
-                  const std::string &config, const std::string &name) {
+                  const std::string &name) {
     return impl_
-        ->create_op<ModelOpDiv>(config, name, input.ref_, other.ref_,
-                                output.ref_)
+        ->create_op<ModelOpDiv>(name, input.ref_, other.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp
index fd6a05b1a..772da3276 100644
--- a/ark/ops/ops_arithmetic_test.cpp
+++ b/ark/ops/ops_arithmetic_test.cpp
@@ -2,7 +2,6 @@
 // Licensed under the MIT license.
 
 #include "ops_test_common.hpp"
-#include "model/model_json.hpp"
 
 template <typename T>
 void baseline_add(std::vector<void *> &outputs,
@@ -143,25 +142,12 @@ ark::unittest::State test_add_fp32() {
 
 ark::unittest::State test_add_fp16() {
     ark::Model m;
-    ark::Tensor t0 = m.tensor({32, 2048, 2048}, ark::FP16);
-    ark::Tensor t1 = m.tensor({32, 2048, 2048}, ark::FP16);
+    ark::Tensor t0 = m.tensor({8192}, ark::FP16);
+    ark::Tensor t1 = m.tensor({8192}, ark::FP16);
     ark::Tensor out = m.add(t0, t1);
 
     auto result =
-        ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add<ark::half_t>, {},
-        {
-            ark::DefaultPlanner::ConfigRule([](const std::string op_str, const std::string) {
-                auto op = ark::Json::parse(op_str);
-                ark::Json config;
-                if (op.at("Type") == "Add") {
-                    config["NumWarps"] = 4;
-                    config["SramBytes"] = 0;
-                    config["Tile"] = {128, 256};
-                    config["NumTasks"] = 4096;
-                }
-                return config.dump();
-            })
-        });
+        ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add<ark::half_t>);
     UNITTEST_LOG(result);
     UNITTEST_EQ(result.max_diff[0], 0.0f);
     return ark::unittest::SUCCESS;
@@ -430,20 +416,20 @@ ark::unittest::State test_div_invalid() {
 
 int main() {
     ark::init();
-    // UNITTEST(test_add_fp32);
+    UNITTEST(test_add_fp32);
     UNITTEST(test_add_fp16);
-    // UNITTEST(test_add_bf16);
-    // UNITTEST(test_add_overwrite);
-    // UNITTEST(test_add_broadcast);
-    // UNITTEST(test_add_invalid);
-    // UNITTEST(test_sub_fp32);
-    // UNITTEST(test_sub_invalid);
-    // UNITTEST(test_mul_fp32);
-    // UNITTEST(test_mul_fp16);
-    // UNITTEST(test_mul_overwrite);
-    // UNITTEST(test_mul_broadcast);
-    // UNITTEST(test_mul_invalid);
-    // UNITTEST(test_div_fp32);
-    // UNITTEST(test_div_invalid);
+    UNITTEST(test_add_bf16);
+    UNITTEST(test_add_overwrite);
+    UNITTEST(test_add_broadcast);
+    UNITTEST(test_add_invalid);
+    UNITTEST(test_sub_fp32);
+    UNITTEST(test_sub_invalid);
+    UNITTEST(test_mul_fp32);
+    UNITTEST(test_mul_fp16);
+    UNITTEST(test_mul_overwrite);
+    UNITTEST(test_mul_broadcast);
+    UNITTEST(test_mul_invalid);
+    UNITTEST(test_div_fp32);
+    UNITTEST(test_div_invalid);
     return ark::unittest::SUCCESS;
 }
diff --git a/ark/ops/ops_cast.cpp b/ark/ops/ops_cast.cpp
index 96146217e..e94fec989 100644
--- a/ark/ops/ops_cast.cpp
+++ b/ark/ops/ops_cast.cpp
@@ -105,7 +105,7 @@ ModelOpByteCast::ModelOpByteCast(ModelTensorRef input, ModelDataType data_type,
 }
 
 Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output,
-                   const std::string &config, const std::string &name) {
+                   const std::string &name) {
     check_null(input.ref());
     if (output.is_null()) {
         if (input.data_type() == data_type) {
@@ -119,14 +119,14 @@ Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output,
             byte_cast_helper(input.ref(), data_type.ref(), new_shape,
                              new_strides, new_offsets, new_padded_shape);
             return impl_
-                ->create_op<ModelOpByteCast>(
-                    config, name, input.ref(), data_type.ref(), new_shape,
-                    new_strides, new_offsets, new_padded_shape)
+                ->create_op<ModelOpByteCast>(name, input.ref(), data_type.ref(),
+                                             new_shape, new_strides,
+                                             new_offsets, new_padded_shape)
                 ->result_tensors()[0];
         }
     }
     return impl_
-        ->create_op<ModelOpCast>(config, name, input.ref(), data_type.ref(),
+        ->create_op<ModelOpCast>(name, input.ref(), data_type.ref(),
                                  output.ref())
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp
index e42c96d9c..baf7aafa2 100644
--- a/ark/ops/ops_communication.cpp
+++ b/ark/ops/ops_communication.cpp
@@ -589,25 +589,23 @@ Json ModelOpDeviceSync::default_config([[maybe_unused]] const ArchRef arch) cons
 }
 
 Tensor Model::send(Tensor input, int remote_rank, int tag, Tensor output,
-                   const std::string &config, const std::string &name) {
+                   const std::string &name) {
     tags_.insert(tag);
     return impl_
-        ->create_op<ModelOpSend>(config, name, input.ref(), remote_rank, tag,
+        ->create_op<ModelOpSend>(name, input.ref(), remote_rank, tag,
                                  output.ref())
         ->result_tensors()[0];
 }
 
-Tensor Model::send_done(Tensor input, const std::string &config,
-                        const std::string &name) {
-    return impl_->create_op<ModelOpSendDone>(config, name, input.ref())
+Tensor Model::send_done(Tensor input, const std::string &name) {
+    return impl_->create_op<ModelOpSendDone>(name, input.ref())
         ->result_tensors()[0];
 }
 
 Tensor Model::recv(Tensor output, int remote_rank, int tag,
-                   const std::string &config, const std::string &name) {
+                   const std::string &name) {
     tags_.insert(tag);
-    return impl_
-        ->create_op<ModelOpRecv>(config, name, output.ref(), remote_rank, tag)
+    return impl_->create_op<ModelOpRecv>(name, output.ref(), remote_rank, tag)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_copy.cpp b/ark/ops/ops_copy.cpp
index 4914c34a4..4f32966b8 100644
--- a/ark/ops/ops_copy.cpp
+++ b/ark/ops/ops_copy.cpp
@@ -20,9 +20,8 @@ ModelOpCopy::ModelOpCopy(ModelTensorRef input, ModelTensorRef output)
     verify();
 }
 
-Tensor Model::copy(Tensor input, Tensor output, const std::string &config,
-                   const std::string &name) {
-    return impl_->create_op<ModelOpCopy>(config, name, input.ref_, output.ref_)
+Tensor Model::copy(Tensor input, Tensor output, const std::string &name) {
+    return impl_->create_op<ModelOpCopy>(name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp
index 1169c47c3..2e2626d4c 100644
--- a/ark/ops/ops_embedding.cpp
+++ b/ark/ops/ops_embedding.cpp
@@ -70,9 +70,9 @@ Json ModelOpEmbedding::default_config([
 }
 
 Tensor Model::embedding(Tensor input, Tensor weight, Tensor output,
-                        const std::string &config, const std::string &name) {
+                        const std::string &name) {
     return impl_
-        ->create_op<ModelOpEmbedding>(config, name, input.ref_, weight.ref_,
+        ->create_op<ModelOpEmbedding>(name, input.ref_, weight.ref_,
                                       output.ref_)
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_identity.cpp b/ark/ops/ops_identity.cpp
index dd398d8a5..065cd9a52 100644
--- a/ark/ops/ops_identity.cpp
+++ b/ark/ops/ops_identity.cpp
@@ -31,7 +31,7 @@ Tensor Model::identity(Tensor input, const std::vector<Tensor> &deps,
     for (auto &dep : deps) {
         deps_ref.emplace_back(dep.ref_);
     }
-    return impl_->create_op<ModelOpIdentity>("", name, input.ref_, deps_ref)
+    return impl_->create_op<ModelOpIdentity>(name, input.ref_, deps_ref)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_math.cpp b/ark/ops/ops_math.cpp
index b2833dcca..1067c561a 100644
--- a/ark/ops/ops_math.cpp
+++ b/ark/ops/ops_math.cpp
@@ -24,55 +24,48 @@ ModelOpMath::ModelOpMath(const std::string &type_name, ModelTensorRef input,
 ModelOpExp::ModelOpExp(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Exp", input, output) {}
 
-Tensor Model::exp(Tensor input, Tensor output, const std::string &config,
-                  const std::string &name) {
-    return impl_->create_op<ModelOpExp>(config, name, input.ref_, output.ref_)
+Tensor Model::exp(Tensor input, Tensor output, const std::string &name) {
+    return impl_->create_op<ModelOpExp>(name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpGelu::ModelOpGelu(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Gelu", input, output) {}
 
-Tensor Model::gelu(Tensor input, Tensor output, const std::string &config,
-                   const std::string &name) {
-    return impl_->create_op<ModelOpGelu>(config, name, input.ref_, output.ref_)
+Tensor Model::gelu(Tensor input, Tensor output, const std::string &name) {
+    return impl_->create_op<ModelOpGelu>(name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpRelu::ModelOpRelu(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Relu", input, output) {}
 
-Tensor Model::relu(Tensor input, Tensor output, const std::string &config,
-                   const std::string &name) {
-    return impl_->create_op<ModelOpRelu>(config, name, input.ref_, output.ref_)
+Tensor Model::relu(Tensor input, Tensor output, const std::string &name) {
+    return impl_->create_op<ModelOpRelu>(name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpRsqrt::ModelOpRsqrt(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Rsqrt", input, output) {}
 
-Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &config,
-                    const std::string &name) {
-    return impl_->create_op<ModelOpRsqrt>(config, name, input.ref_, output.ref_)
+Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &name) {
+    return impl_->create_op<ModelOpRsqrt>(name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpSigmoid::ModelOpSigmoid(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Sigmoid", input, output) {}
 
-Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &config,
-                      const std::string &name) {
-    return impl_
-        ->create_op<ModelOpSigmoid>(config, name, input.ref_, output.ref_)
+Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &name) {
+    return impl_->create_op<ModelOpSigmoid>(name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
 ModelOpSqrt::ModelOpSqrt(ModelTensorRef input, ModelTensorRef output)
     : ModelOpMath("Sqrt", input, output) {}
 
-Tensor Model::sqrt(Tensor input, Tensor output, const std::string &config,
-                   const std::string &name) {
-    return impl_->create_op<ModelOpSqrt>(config, name, input.ref_, output.ref_)
+Tensor Model::sqrt(Tensor input, Tensor output, const std::string &name) {
+    return impl_->create_op<ModelOpSqrt>(name, input.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp
index bc94922fc..dca349f44 100644
--- a/ark/ops/ops_matmul.cpp
+++ b/ark/ops/ops_matmul.cpp
@@ -244,10 +244,10 @@ Json ModelOpMatmul::default_config(const ArchRef arch) const {
 
 Tensor Model::matmul(Tensor input, Tensor other, Tensor output,
                      bool trans_input, bool trans_other,
-                     const std::string &config, const std::string &name) {
+                     const std::string &name) {
     return impl_
-        ->create_op<ModelOpMatmul>(config, name, input.ref(), other.ref(),
-                                   output.ref(), trans_input, trans_other)
+        ->create_op<ModelOpMatmul>(name, input.ref(), other.ref(), output.ref(),
+                                   trans_input, trans_other)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_noop.cpp b/ark/ops/ops_noop.cpp
index 42fe5fdf5..894ab29be 100644
--- a/ark/ops/ops_noop.cpp
+++ b/ark/ops/ops_noop.cpp
@@ -30,7 +30,7 @@ Json ModelOpNoop::default_config([[maybe_unused]] const ArchRef arch) const {
 }
 
 void Model::noop(Tensor input, const std::string &name) {
-    impl_->create_op<ModelOpNoop>("", name, input.ref_);
+    impl_->create_op<ModelOpNoop>(name, input.ref_);
 }
 
 }  // namespace ark
diff --git a/ark/ops/ops_reduce.cpp b/ark/ops/ops_reduce.cpp
index 19f70385b..78dd9d7e6 100644
--- a/ark/ops/ops_reduce.cpp
+++ b/ark/ops/ops_reduce.cpp
@@ -127,25 +127,25 @@ Json ModelOpReduce::default_config([[maybe_unused]] const ArchRef arch) const {
 }
 
 Tensor Model::reduce_max(Tensor input, int axis, bool keepdims, Tensor output,
-                         const std::string &config, const std::string &name) {
+                         const std::string &name) {
     return impl_
-        ->create_op<ModelOpReduceMax>(config, name, input.ref_, axis, keepdims,
+        ->create_op<ModelOpReduceMax>(name, input.ref_, axis, keepdims,
                                       output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::reduce_mean(Tensor input, int axis, bool keepdims, Tensor output,
-                          const std::string &config, const std::string &name) {
+                          const std::string &name) {
     return impl_
-        ->create_op<ModelOpReduceMean>(config, name, input.ref_, axis, keepdims,
+        ->create_op<ModelOpReduceMean>(name, input.ref_, axis, keepdims,
                                        output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::reduce_sum(Tensor input, int axis, bool keepdims, Tensor output,
-                         const std::string &config, const std::string &name) {
+                         const std::string &name) {
     return impl_
-        ->create_op<ModelOpReduceSum>(config, name, input.ref_, axis, keepdims,
+        ->create_op<ModelOpReduceSum>(name, input.ref_, axis, keepdims,
                                       output.ref_)
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_refer.cpp b/ark/ops/ops_refer.cpp
index 68c61b30f..782d6708c 100644
--- a/ark/ops/ops_refer.cpp
+++ b/ark/ops/ops_refer.cpp
@@ -20,7 +20,7 @@ Tensor Model::refer(Tensor input, const Dims &shape, const Dims &strides,
                     const Dims &offsets, const Dims &padded_shape,
                     const std::string &name) {
     return impl_
-        ->create_op<ModelOpRefer>("", name, input.ref_, shape, strides, offsets,
+        ->create_op<ModelOpRefer>(name, input.ref_, shape, strides, offsets,
                                   padded_shape)
         ->result_tensors()[0];
 }
diff --git a/ark/ops/ops_reshape.cpp b/ark/ops/ops_reshape.cpp
index 8ed3ac247..aac22b71a 100644
--- a/ark/ops/ops_reshape.cpp
+++ b/ark/ops/ops_reshape.cpp
@@ -199,8 +199,8 @@ Tensor Model::reshape(Tensor input, const Dims &shape, bool allowzero,
     reshape_helper(input.ref_, Dims{inferred_shape}, allowzero, new_shape,
                    new_strides, new_offs);
     return impl_
-        ->create_op<ModelOpReshape>("", name, input.ref_, new_shape,
-                                    new_strides, new_offs)
+        ->create_op<ModelOpReshape>(name, input.ref_, new_shape, new_strides,
+                                    new_offs)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_rope.cpp b/ark/ops/ops_rope.cpp
index 36015aae5..06c1c915e 100644
--- a/ark/ops/ops_rope.cpp
+++ b/ark/ops/ops_rope.cpp
@@ -12,10 +12,9 @@ ModelOpRope::ModelOpRope(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Rope", input, other, output) {}
 
 Tensor Model::rope(Tensor input, Tensor other, Tensor output,
-                   const std::string &config, const std::string &name) {
+                   const std::string &name) {
     return impl_
-        ->create_op<ModelOpRope>(config, name, input.ref_, other.ref_,
-                                 output.ref_)
+        ->create_op<ModelOpRope>(name, input.ref_, other.ref_, output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_scalar.cpp b/ark/ops/ops_scalar.cpp
index b5c10f1c3..944a7247c 100644
--- a/ark/ops/ops_scalar.cpp
+++ b/ark/ops/ops_scalar.cpp
@@ -115,21 +115,20 @@ std::vector<ModelOpArg> ModelOpScalarMul::impl_args([
 Tensor Model::constant(float val, const Dims &shape, DataType data_type,
                        const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarAssign>("", name, val, shape, data_type.ref(),
+        ->create_op<ModelOpScalarAssign>(name, val, shape, data_type.ref(),
                                          nullptr)
         ->result_tensors()[0];
 }
 
-Tensor Model::copy(float val, Tensor output, const std::string &config,
-                   const std::string &name) {
+Tensor Model::copy(float val, Tensor output, const std::string &name) {
     if (output == NullTensor) {
         return impl_
-            ->create_op<ModelOpScalarAssign>(config, name, val, Dims{1},
-                                             FP32.ref(), output.ref())
+            ->create_op<ModelOpScalarAssign>(name, val, Dims{1}, FP32.ref(),
+                                             output.ref())
             ->result_tensors()[0];
     } else {
         return impl_
-            ->create_op<ModelOpScalarAssign>(config, name, val, output.shape(),
+            ->create_op<ModelOpScalarAssign>(name, val, output.shape(),
                                              output.data_type().ref(),
                                              output.ref())
             ->result_tensors()[0];
@@ -137,34 +136,30 @@ Tensor Model::copy(float val, Tensor output, const std::string &config,
 }
 
 Tensor Model::add(Tensor input, float value, Tensor output,
-                  const std::string &config, const std::string &name) {
+                  const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarAdd>(config, name, input.ref_, value,
-                                      output.ref_)
+        ->create_op<ModelOpScalarAdd>(name, input.ref_, value, output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::sub(Tensor input, float value, Tensor output,
-                  const std::string &config, const std::string &name) {
+                  const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarAdd>(config, name, input.ref_, -value,
-                                      output.ref_)
+        ->create_op<ModelOpScalarAdd>(name, input.ref_, -value, output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::mul(Tensor input, float value, Tensor output,
-                  const std::string &config, const std::string &name) {
+                  const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarMul>(config, name, input.ref_, value,
-                                      output.ref_)
+        ->create_op<ModelOpScalarMul>(name, input.ref_, value, output.ref_)
         ->result_tensors()[0];
 }
 
 Tensor Model::div(Tensor input, float value, Tensor output,
-                  const std::string &config, const std::string &name) {
+                  const std::string &name) {
     return impl_
-        ->create_op<ModelOpScalarMul>(config, name, input.ref_, 1 / value,
-                                      output.ref_)
+        ->create_op<ModelOpScalarMul>(name, input.ref_, 1 / value, output.ref_)
         ->result_tensors()[0];
 }
 
diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp
index c659761d9..d0f7581cc 100644
--- a/ark/ops/ops_transpose.cpp
+++ b/ark/ops/ops_transpose.cpp
@@ -124,10 +124,9 @@ Json ModelOpTranspose::default_config([
 }
 
 Tensor Model::transpose(Tensor input, const std::vector<int64_t> &permutation,
-                        Tensor output, const std::string &config,
-                        const std::string &name) {
+                        Tensor output, const std::string &name) {
     return impl_
-        ->create_op<ModelOpTranspose>(config, name, input.ref_, permutation,
+        ->create_op<ModelOpTranspose>(name, input.ref_, permutation,
                                       output.ref_)
         ->result_tensors()[0];
 }
diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py
index f41304e85..d4a080c84 100644
--- a/examples/llama/model_7b_b1_s2048.py
+++ b/examples/llama/model_7b_b1_s2048.py
@@ -90,7 +90,7 @@ def __init__(
         self.weight = ark.parameter([1, 1, dim], ark.fp32)
 
     def forward(self, x):
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 8],
             sync=False,
             config={
@@ -100,12 +100,12 @@ def forward(self, x):
                 "Granularity": 7,
             },
         ):
-            with ark.PlanManager(config={"Tile": [1, 4096]}):
+            with ark.PlannerContext(config={"Tile": [1, 4096]}):
                 x = ark.cast(x, ark.fp32)
                 x2 = ark.mul(x, x)
-            with ark.PlanManager(config={"ImplType": "WarpWise"}):
+            with ark.PlannerContext(config={"ImplType": "WarpWise"}):
                 mean = ark.reduce_mean(x2, axis=-1)
-        with ark.PlanManager(
+        with ark.PlannerContext(
             config={
                 "NumWarps": 1,
                 "SramBytes": 0,
@@ -114,7 +114,7 @@ def forward(self, x):
             }
         ):
             rrms = ark.rsqrt(mean)
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 8],
             sync=False,
             config={
@@ -356,7 +356,7 @@ def __init__(
 
     def forward(self, x):
         # self.w2(F.silu(self.w1(x)) * self.w3(x))
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 8],
             sram_range=[0, 49344],
             sync=False,
@@ -365,13 +365,13 @@ def forward(self, x):
                 "NumTasks": 688,
             },
         ):
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 x1 = self.w1(x)
-            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
                 x1 = Silu()(x1)
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 8],
             sram_range=[0, 49344],
             sync=False,
@@ -380,11 +380,11 @@ def forward(self, x):
                 "NumTasks": 688,
             },
         ):
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 x2 = self.w3(x)
-            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
                 x3 = ark.mul(x1, x2)
         x4 = self.w2(x3)
         return x4
@@ -404,7 +404,7 @@ def __init__(self):
         super(Softmax, self).__init__()
 
     def forward(self, input):
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 8],
             sram_range=[0, 0],
             sync=False,
@@ -414,14 +414,14 @@ def forward(self, input):
                 "NumTasks": 65536,
             },
         ):
-            with ark.PlanManager(config={"ImplType": "WarpWise"}):
+            with ark.PlannerContext(config={"ImplType": "WarpWise"}):
                 max = ark.reduce_max(input, axis=-1)
-            with ark.PlanManager(config={"Tile": [1, 2048]}):
+            with ark.PlannerContext(config={"Tile": [1, 2048]}):
                 output = ark.sub(input, max)
                 output = ark.exp(output)
-            with ark.PlanManager(config={"ImplType": "WarpWise"}):
+            with ark.PlannerContext(config={"ImplType": "WarpWise"}):
                 sum = ark.reduce_sum(output, axis=-1)
-            with ark.PlanManager(config={"Tile": [1, 2048]}):
+            with ark.PlannerContext(config={"Tile": [1, 2048]}):
                 output = ark.div(output, sum)
             return output
 
@@ -486,50 +486,50 @@ def forward(
     ):
         bsz, seqlen, _ = x.shape()
 
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 4],
             sram_range=[0, 24672],
             sync=False,
             config={"NumWarps": 4, "NumTasks": 256},
         ):
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 xq = self.wq(x)
             xq = ark.reshape(
                 xq, [bsz, seqlen, self.n_local_heads, self.head_dim]
             )
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 0, "Tile": [256, 1, 128]}
             ):
                 if freqs_cis is not None:
                     xq = ark.rope(xq, freqs_cis)
-            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
                 xq = ark.transpose(xq, [0, 2, 1, 3])
 
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 4],
             sram_range=[0, 24672],
             sync=False,
             config={"NumWarps": 4, "NumTasks": 256},
         ):
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 xk = self.wk(x)
             xk = ark.reshape(
                 xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim]
             )
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 0, "Tile": [256, 1, 128]}
             ):
                 if freqs_cis is not None:
                     xk = ark.rope(xk, freqs_cis)
             keys = xk
-            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
                 keys = ark.transpose(keys, [0, 2, 1, 3])
 
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 4],
             sram_range=[0, 24672],
             sync=False,
@@ -540,7 +540,7 @@ def forward(
                 "TileShapeMNK": [256, 128, 32],
             },
         ):
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 xv = self.wv(x)
@@ -548,12 +548,12 @@ def forward(
                 xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim]
             )
             values = xv
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 0, "Tile": [256, 1, 128]}
             ):
                 values = ark.transpose(values, [0, 2, 1, 3])
 
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 8],
             sram_range=[0, 49344],
             sync=False,
@@ -563,11 +563,11 @@ def forward(
                 "Granularity": 2,
             },
         ):
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 scores = ark.matmul(xq, keys, transpose_other=True)
-            with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
                 scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim))
 
         if mask is not None:
@@ -575,7 +575,7 @@ def forward(
 
         scores = Softmax()(scores)
 
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 4],
             sram_range=[0, 24672],
             sync=False,
@@ -584,11 +584,11 @@ def forward(
                 "NumTasks": 256,
             },
         ):
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 output = ark.matmul(scores, values)
-            with ark.PlanManager(
+            with ark.PlannerContext(
                 config={"SramBytes": 0, "Tile": [256, 1, 128]}
             ):
                 output = ark.transpose(output, [0, 2, 1, 3])
@@ -634,7 +634,7 @@ def forward(
     ):
         attention_norm_x = self.attention_norm(x)
         h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask)
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 4],
             config={
                 "NumWarps": 4,
@@ -645,7 +645,7 @@ def forward(
         ):
             h = ark.add(x, h)
         ff = self.feed_forward(self.ffn_norm(h))
-        with ark.PlanManager(
+        with ark.PlannerContext(
             warp_range=[0, 4],
             config={
                 "NumWarps": 4,
diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py
deleted file mode 100644
index c840ce0c0..000000000
--- a/examples/tutorial/plan_manager_tutorial.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import ark
-import time
-import torch
-import torch.nn.functional as F
-
-
-class VanillaSoftmax(ark.Module):
-    def __init__(self):
-        super(Softmax, self).__init__()
-
-    def forward(self, input):
-        max = ark.reduce_max(input, axis=-1)
-        output = ark.sub(input, max)
-        output = ark.exp(output)
-        sum = ark.reduce_sum(output, axis=-1)
-        output = ark.div(output, sum)
-        return output
-
-
-class Softmax(ark.Module):
-    def __init__(self):
-        super(Softmax, self).__init__()
-
-    def forward(self, input):
-        with ark.PlanManager(
-            warp_range=[0, 8],
-            sram_range=[0, 0],
-            sync=False,
-            config={
-                "NumWarps": 1,
-                "SramBytes": 0,
-                "NumTasks": 65536,
-            },
-        ):
-            with ark.PlanManager(config={"ImplType": "WarpWise"}):
-                max = ark.reduce_max(input, axis=-1)
-            with ark.PlanManager(config={"Tile": [1, 2048]}):
-                output = ark.sub(input, max)
-                output = ark.exp(output)
-            with ark.PlanManager(config={"ImplType": "WarpWise"}):
-                sum = ark.reduce_sum(output, axis=-1)
-            with ark.PlanManager(config={"Tile": [1, 2048]}):
-                output = ark.div(output, sum)
-            return output
-
-
-def eval(tensor: ark.Tensor):
-    with ark.Runtime() as rt:
-        rt.launch()
-        rt.run()
-        return tensor.to_torch()
-
-
-def perf():
-    with ark.Runtime() as rt:
-        rt.launch()
-
-        start = time.time()
-        rt.run(iter=1000)
-        end = time.time()
-        return (end - start) / 1000
-
-
-if __name__ == "__main__":
-    ark.init()
-
-    shape = (32, 2048, 2048)
-
-    input = torch.randn(*shape).to("cuda:0")
-
-    output = Softmax()(ark.Tensor.from_torch(input))
-
-    if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5):
-        print("Correct result")
-    else:
-        print("Incorrect result")
-
-    print(f"Performance: {(perf() * 1e3):.3f} ms/iter")
diff --git a/python/ark/plan_manager.py b/python/ark/plan_manager.py
deleted file mode 100644
index 80e615ab8..000000000
--- a/python/ark/plan_manager.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import json
-from typing import List, Dict, Any
-from .model import Model
-from ._ark_core import _PlanManager
-
-
-class PlanManager(_PlanManager):
-    def __init__(self, **kwargs):
-        """
-        Plan manager for specifying the parallelization and tiling configuration of the operators in the context.
-
-        Args:
-            processor_range (List[int], optional): The range of processors to be used. Defaults to None.
-            warp_range (List[int], optional): The range of warps to be used. Defaults to None.
-            sram_range (List[int], optional): The range of SRAMs to be used. Defaults to None.
-            sync (bool, optional): Whether to synchronize the execution. Defaults to True.
-            config (Dict[str, Any], optional): The configuration for the operators. Defaults to None.
-        """
-        super().__init__(Model.get_model(), json.dumps(kwargs))
-
-    def __enter__(self) -> "PlanManager":
-        """
-        Enter the plan manager.
-        """
-        return self
-
-    def __exit__(self, exc_type, exc_value, exc_tb):
-        """
-        Exit the plan manager.
-        """
-        del self
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index f064a5988..960223c64 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -3,6 +3,7 @@
 
 import logging
 from enum import Enum
+from typing import Dict, List
 
 from _ark_core import _Executor
 from .planner import Planner, Plan
diff --git a/python/model_py.cpp b/python/model_py.cpp
index 5a22d6a18..c224a3d5b 100644
--- a/python/model_py.cpp
+++ b/python/model_py.cpp
@@ -19,100 +19,89 @@ void register_model(py::module &m) {
         .def("compress", &ark::Model::compress)
         .def("add",
              py::overload_cast<ark::Tensor, ark::Tensor, ark::Tensor,
-                               const std::string &, const std::string &>(
-                 &ark::Model::add),
+                               const std::string &>(&ark::Model::add),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("add",
              py::overload_cast<ark::Tensor, float, ark::Tensor,
-                               const std::string &, const std::string &>(
-                 &ark::Model::add),
+                               const std::string &>(&ark::Model::add),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("cast", &ark::Model::cast, py::arg("input"), py::arg("data_type"),
-             py::arg("output"), py::arg("config"), py::arg("name"))
+             py::arg("output"), py::arg("name"))
         .def("constant", &ark::Model::constant, py::arg("value"),
              py::arg("shape"), py::arg("data_type"), py::arg("name"))
         .def("copy",
-             py::overload_cast<ark::Tensor, ark::Tensor, const std::string &,
-                               const std::string &>(&ark::Model::copy),
-             py::arg("input"), py::arg("output"), py::arg("config"),
-             py::arg("name"))
+             py::overload_cast<ark::Tensor, ark::Tensor, const std::string &>(
+                 &ark::Model::copy),
+             py::arg("input"), py::arg("output"), py::arg("name"))
         .def("copy",
-             py::overload_cast<float, ark::Tensor, const std::string &,
-                               const std::string &>(&ark::Model::copy),
-             py::arg("input"), py::arg("output"), py::arg("config"),
-             py::arg("name"))
+             py::overload_cast<float, ark::Tensor, const std::string &>(
+                 &ark::Model::copy),
+             py::arg("input"), py::arg("output"), py::arg("name"))
         .def("div",
              py::overload_cast<ark::Tensor, ark::Tensor, ark::Tensor,
-                               const std::string &, const std::string &>(
-                 &ark::Model::div),
+                               const std::string &>(&ark::Model::div),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("div",
              py::overload_cast<ark::Tensor, float, ark::Tensor,
-                               const std::string &, const std::string &>(
-                 &ark::Model::div),
+                               const std::string &>(&ark::Model::div),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
-        .def("embedding", &ark::Model::embedding, py::arg("input"),
-             py::arg("weight"), py::arg("output"), py::arg("config"),
              py::arg("name"))
+        .def("embedding", &ark::Model::embedding, py::arg("input"),
+             py::arg("weight"), py::arg("output"), py::arg("name"))
         .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("gelu", &ark::Model::gelu, py::arg("input"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("identity", &ark::Model::identity, py::arg("input"),
              py::arg("deps"), py::arg("name"))
         .def("matmul", &ark::Model::matmul, py::arg("input"), py::arg("other"),
              py::arg("output"), py::arg("trans_input"), py::arg("trans_other"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("mul",
              py::overload_cast<ark::Tensor, ark::Tensor, ark::Tensor,
-                               const std::string &, const std::string &>(
-                 &ark::Model::mul),
+                               const std::string &>(&ark::Model::mul),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("mul",
              py::overload_cast<ark::Tensor, float, ark::Tensor,
-                               const std::string &, const std::string &>(
-                 &ark::Model::mul),
+                               const std::string &>(&ark::Model::mul),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("noop", &ark::Model::noop, py::arg("input"), py::arg("name"))
         .def("reduce_max", &ark::Model::reduce_max, py::arg("input"),
              py::arg("axis"), py::arg("keepdims"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("reduce_mean", &ark::Model::reduce_mean, py::arg("input"),
              py::arg("axis"), py::arg("keepdims"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("reduce_sum", &ark::Model::reduce_sum, py::arg("input"),
              py::arg("axis"), py::arg("keepdims"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("relu", &ark::Model::relu, py::arg("input"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("reshape", &ark::Model::reshape, py::arg("input"),
              py::arg("shape"), py::arg("allowzero"), py::arg("name"))
         .def("rope", &ark::Model::rope, py::arg("input"), py::arg("other"),
-             py::arg("output"), py::arg("config"), py::arg("name"))
+             py::arg("output"), py::arg("name"))
         .def("rsqrt", &ark::Model::rsqrt, py::arg("input"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("sharding", &ark::Model::sharding, py::arg("input"),
              py::arg("axis"), py::arg("dim_per_shard"), py::arg("name"))
         .def("sigmoid", &ark::Model::sigmoid, py::arg("input"),
-             py::arg("output"), py::arg("config"), py::arg("name"))
+             py::arg("output"), py::arg("name"))
         .def("sqrt", &ark::Model::sqrt, py::arg("input"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("sub",
              py::overload_cast<ark::Tensor, ark::Tensor, ark::Tensor,
-                               const std::string &, const std::string &>(
-                 &ark::Model::sub),
+                               const std::string &>(&ark::Model::sub),
              py::arg("input"), py::arg("other"), py::arg("output"),
-             py::arg("config"), py::arg("name"))
+             py::arg("name"))
         .def("sub",
              py::overload_cast<ark::Tensor, float, ark::Tensor,
-                               const std::string &, const std::string &>(
-                 &ark::Model::sub),
+                               const std::string &>(&ark::Model::sub),
              py::arg("input"), py::arg("other"), py::arg("output"),
              py::arg("name"))
         .def("tensor",
diff --git a/python/plan_manager_py.cpp b/python/plan_manager_py.cpp
deleted file mode 100644
index 34aa0b77c..000000000
--- a/python/plan_manager_py.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#include <pybind11/operators.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
-
-#include <ark/plan_manager.hpp>
-
-namespace py = pybind11;
-
-void register_plan_manager(py::module &m) {
-    py::class_<ark::PlanManager>(m, "_PlanManager")
-        .def(py::init<ark::Model&, const std::string&>());
-}

From 28b83953ae26b8554fc8b822df8e96dd8bf04091 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Tue, 6 Aug 2024 14:33:23 -0700
Subject: [PATCH 53/79] Update runtime.py

---
 python/ark/runtime.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 96c6f470a..e40750260 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -98,7 +98,7 @@ def launch(
                     _RuntimeState.executor.destroy()
 
             _RuntimeState.executor = Executor(
-                gpu_id,
+                device_id,
                 stream,
                 "ArkRuntime",
                 plan,

From 11901c4a3f49469ede51e992b8b1d2fc1f2c1e3b Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 7 Aug 2024 09:36:45 +0000
Subject: [PATCH 54/79] fix

---
 python/ark/runtime.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index e40750260..495fc1c24 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -101,7 +101,7 @@ def launch(
                 device_id,
                 stream,
                 "ArkRuntime",
-                plan,
+                str(plan),
                 loop_mode,
             )
             self.executor = _RuntimeState.executor

From c0cbf19c4181cb697d5e3edc9db1198160bde788 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 8 Aug 2024 08:06:34 +0000
Subject: [PATCH 55/79] lint

---
 examples/llama/model_7b_b1_s2048.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py
index d4a080c84..73d349ccc 100644
--- a/examples/llama/model_7b_b1_s2048.py
+++ b/examples/llama/model_7b_b1_s2048.py
@@ -369,7 +369,9 @@ def forward(self, x):
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 x1 = self.w1(x)
-            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            ):
                 x1 = Silu()(x1)
         with ark.PlannerContext(
             warp_range=[0, 8],
@@ -384,7 +386,9 @@ def forward(self, x):
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 x2 = self.w3(x)
-            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            ):
                 x3 = ark.mul(x1, x2)
         x4 = self.w2(x3)
         return x4
@@ -504,7 +508,9 @@ def forward(
             ):
                 if freqs_cis is not None:
                     xq = ark.rope(xq, freqs_cis)
-            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            ):
                 xq = ark.transpose(xq, [0, 2, 1, 3])
 
         with ark.PlannerContext(
@@ -526,7 +532,9 @@ def forward(
                 if freqs_cis is not None:
                     xk = ark.rope(xk, freqs_cis)
             keys = xk
-            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            ):
                 keys = ark.transpose(keys, [0, 2, 1, 3])
 
         with ark.PlannerContext(
@@ -567,7 +575,9 @@ def forward(
                 config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]}
             ):
                 scores = ark.matmul(xq, keys, transpose_other=True)
-            with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}):
+            with ark.PlannerContext(
+                config={"SramBytes": 0, "Tile": [256, 128]}
+            ):
                 scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim))
 
         if mask is not None:

From 8583d1bfd24699ec65e8c7933e9cb564de08844d Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sat, 10 Aug 2024 09:17:15 +0000
Subject: [PATCH 56/79] updates

---
 ark/api/executor.cpp                  |   4 +
 ark/include/ark/executor.hpp          |   5 +
 examples/tutorial/planner_tutorial.py |  13 +-
 python/CMakeLists.txt                 |  13 ++
 python/ark/__init__.py                |   5 +-
 python/ark/data_type.py               |   2 +-
 python/ark/init.py                    |   2 +-
 python/ark/model.py                   |   2 +-
 python/ark/module.py                  |   4 +-
 python/ark/ops.py                     |  97 +++----------
 python/ark/tensor.py                  |  98 +++++--------
 python/ark_py.cpp                     |   2 -
 python/executor_py.cpp                | 188 ++++++++++++++-----------
 python/tensor_py.cpp                  |  33 ++++-
 python/unittest/test.py               |   2 +-
 python/unittest/test_runtime.py       | 192 +++++++++++++-------------
 python/unittest/test_tensor.py        |  23 +++
 17 files changed, 349 insertions(+), 336 deletions(-)
 create mode 100644 python/unittest/test_tensor.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index e77eada96..50686c434 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -154,6 +154,8 @@ class Executor::Impl {
 
     Stream stream() const { return reinterpret_cast<Stream>(stream_raw_); }
 
+    std::shared_ptr<GpuMemory> buffer() const { return buffer_; }
+
     std::string plan() const { return plan_json_.dump_pretty(); }
 
     void compile();
@@ -934,6 +936,8 @@ int Executor::device_id() const { return impl_->device_id(); }
 
 Stream Executor::stream() const { return impl_->stream(); }
 
+std::shared_ptr<GpuMemory> Executor::buffer() const { return impl_->buffer(); }
+
 std::string Executor::plan() const { return impl_->plan(); }
 
 void Executor::compile() { impl_->compile(); }
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 14ca87618..02a67cd26 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -15,6 +15,8 @@ namespace ark {
 
 using Stream = void *;
 
+class GpuMemory;
+
 /// Convenience class for executing a model.
 class Executor {
    public:
@@ -31,6 +33,9 @@ class Executor {
     /// Return the stream of the executor.
     Stream stream() const;
 
+    /// Return the buffer of the executor.
+    std::shared_ptr<GpuMemory> buffer() const;
+
     /// Return the plan string.
     std::string plan() const;
 
diff --git a/examples/tutorial/planner_tutorial.py b/examples/tutorial/planner_tutorial.py
index 1f6c3ac58..6153aaf8e 100644
--- a/examples/tutorial/planner_tutorial.py
+++ b/examples/tutorial/planner_tutorial.py
@@ -69,14 +69,13 @@ def perf():
 
     shape = (32, 2048, 2048)
 
-    # input = torch.randn(*shape).to("cuda:0")
-    input = ark.tensor(shape)
+    input = torch.randn(*shape).to("cuda:0")
 
-    output = Softmax()(input)
+    output = Softmax()(ark.Tensor.from_torch(input))
 
-    # if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5):
-    #     print("Correct result")
-    # else:
-    #     print("Incorrect result")
+    if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5):
+        print("Correct result")
+    else:
+        print("Incorrect result")
 
     print(f"Performance: {(perf() * 1e3):.3f} ms/iter")
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index bd25d01e6..2e160f8d1 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -21,3 +21,16 @@ pybind11_add_module(ark_py ${BIND_SOURCES})
 set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core)
 target_link_libraries(ark_py PRIVATE ark_static)
 target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS})
+target_include_directories(ark_py PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../ark)
+
+if(ARK_USE_CUDA)
+    target_include_directories(ark_py SYSTEM PRIVATE
+        ${CUDAToolkit_INCLUDE_DIRS}
+    )
+endif()
+
+if(ARK_USE_ROCM)
+    target_include_directories(ark_py SYSTEM PRIVATE
+        /opt/rocm/include
+    )
+endif()
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index c20b50b8c..68b03ab29 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -1,12 +1,15 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+import sys
 import os
 
 if os.environ.get("ARK_ROOT", None) is None:
     os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__))
 
-from . import _ark_core
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+import _ark_core
 from .model import Model
 
 
diff --git a/python/ark/data_type.py b/python/ark/data_type.py
index 8ab982106..41c4201c3 100644
--- a/python/ark/data_type.py
+++ b/python/ark/data_type.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 import numpy
-from . import _ark_core
+import _ark_core
 
 try:
     import torch
diff --git a/python/ark/init.py b/python/ark/init.py
index 32f530791..dbf7c1569 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from . import _ark_core
+import _ark_core
 from .model import Model
 from .runtime import _RuntimeState
 
diff --git a/python/ark/model.py b/python/ark/model.py
index 87af88f49..e6208fc16 100644
--- a/python/ark/model.py
+++ b/python/ark/model.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 from typing import NewType
-from ._ark_core import _Model
+from _ark_core import _Model
 
 _ModelState = NewType("_ModelState", None)
 
diff --git a/python/ark/module.py b/python/ark/module.py
index faeeea40d..d797da72c 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -5,7 +5,7 @@
 import numpy as np
 from typing import Any, Dict, List, Union
 from .tensor import Tensor, Parameter
-from .runtime import Runtime, DefaultPlanner
+from .runtime import Runtime, Planner
 from .ops import tensor
 from .data_type import DataType
 
@@ -183,7 +183,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:
             self.built_forward = True
 
         with Runtime.get_runtime() as rt:
-            rt.launch(plan=DefaultPlanner().plan())
+            rt.launch(plan=Planner().plan())
             for tns, arg in zip(self.forward_input_tensor_args, args):
                 tns.copy(arg)
             for key, value in self.forward_input_tensor_kwargs.items():
diff --git a/python/ark/ops.py b/python/ark/ops.py
index f890e5d1b..7d98f51c2 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -1,7 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import json
 from typing import Any, Dict, List, Iterable, Union
 
 from .tensor import Dims, Tensor, Parameter, NullTensor
@@ -13,12 +12,6 @@ def _is_list_or_tuple(obj):
     return isinstance(obj, list) or isinstance(obj, tuple)
 
 
-def _config_to_str(config: Union[str, Dict[str, Any]]) -> str:
-    if isinstance(config, str):
-        return config
-    return json.dumps(config)
-
-
 def _tensor(
     shape: Iterable[int],
     dtype: DataType = fp32,
@@ -59,7 +52,6 @@ def add(
     input: Union[Tensor, float],
     other: Union[Tensor, float],
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "add",
 ) -> Union[Tensor, float]:
     """
@@ -83,14 +75,12 @@ def add(
         return input + other
     else:
         return Tensor(
-            Model.get_model().copy(
-                input + other, output._tensor, _config_to_str(config), name
-            )
+            Model.get_model().copy(input + other, output._tensor, name)
         )
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().add(a, b, output, _config_to_str(config), name),
+        Model.get_model().add(a, b, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -99,16 +89,13 @@ def cast(
     input: Tensor,
     dtype: DataType,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "cast",
 ) -> Tensor:
     """Type casting."""
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().cast(
-            input._tensor, dtype.ctype(), output, _config_to_str(config), name
-        ),
+        Model.get_model().cast(input._tensor, dtype.ctype(), output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -130,7 +117,6 @@ def constant(
 def copy(
     input: Union[Tensor, float],
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "copy",
 ) -> Tensor:
     """Data caopy."""
@@ -139,7 +125,7 @@ def copy(
     if isinstance(input, Tensor):
         intput = intput._tensor
     return Tensor(
-        Model.get_model().copy(intput, output, _config_to_str(config), name),
+        Model.get_model().copy(intput, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -148,7 +134,6 @@ def div(
     input: Tensor,
     other: Union[Tensor, float],
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "div",
 ) -> Tensor:
     """
@@ -164,9 +149,7 @@ def div(
             raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
     return Tensor(
-        Model.get_model().div(
-            input._tensor, other, output, _config_to_str(config), name
-        ),
+        Model.get_model().div(input._tensor, other, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -175,7 +158,6 @@ def embedding(
     input: Tensor,
     weight: Tensor,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "embedding",
 ) -> Tensor:
     """Embedding layer."""
@@ -185,7 +167,7 @@ def embedding(
         output = output._tensor
     return Tensor(
         Model.get_model().embedding(
-            input._tensor, weight._tensor, output, _config_to_str(config), name
+            input._tensor, weight._tensor, output, name
         ),
         runtime_id=input.runtime_id,
     )
@@ -194,7 +176,6 @@ def embedding(
 def exp(
     input: Tensor,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "exp",
 ) -> Tensor:
     """
@@ -205,9 +186,7 @@ def exp(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().exp(
-            input._tensor, output, _config_to_str(config), name
-        ),
+        Model.get_model().exp(input._tensor, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -215,7 +194,6 @@ def exp(
 def gelu(
     input: Tensor,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "gelu",
 ) -> Tensor:
     """
@@ -229,9 +207,7 @@ def gelu(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().gelu(
-            input._tensor, output, _config_to_str(config), name
-        ),
+        Model.get_model().gelu(input._tensor, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -263,7 +239,6 @@ def matmul(
     output: Tensor = NullTensor,
     transpose_input: bool = False,
     transpose_other: bool = False,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "matmul",
 ) -> Tensor:
     """
@@ -286,7 +261,6 @@ def matmul(
             output,
             transpose_input,
             transpose_other,
-            _config_to_str(config),
             name,
         ),
         runtime_id=input.runtime_id,
@@ -297,7 +271,6 @@ def mul(
     input: Tensor,
     other: Union[Tensor, float],
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "mul",
 ) -> Tensor:
     """
@@ -313,9 +286,7 @@ def mul(
             raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
     return Tensor(
-        Model.get_model().mul(
-            input._tensor, other, output, _config_to_str(config), name
-        ),
+        Model.get_model().mul(input._tensor, other, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -332,7 +303,6 @@ def reduce_max(
     axis: int,
     keepdims: bool = True,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "reduce_max",
 ) -> Tensor:
     """
@@ -345,7 +315,7 @@ def reduce_max(
         output = output._tensor
     return Tensor(
         Model.get_model().reduce_max(
-            input._tensor, axis, keepdims, output, _config_to_str(config), name
+            input._tensor, axis, keepdims, output, name
         ),
         runtime_id=input.runtime_id,
     )
@@ -356,7 +326,6 @@ def reduce_mean(
     axis: int,
     keepdims: bool = True,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "reduce_mean",
 ) -> Tensor:
     """
@@ -369,7 +338,7 @@ def reduce_mean(
         output = output._tensor
     return Tensor(
         Model.get_model().reduce_mean(
-            input._tensor, axis, keepdims, output, _config_to_str(config), name
+            input._tensor, axis, keepdims, output, name
         ),
         runtime_id=input.runtime_id,
     )
@@ -380,7 +349,6 @@ def reduce_sum(
     axis: int,
     keepdims: bool = True,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "reduce_sum",
 ) -> Tensor:
     """
@@ -395,7 +363,7 @@ def reduce_sum(
         output = output._tensor
     return Tensor(
         Model.get_model().reduce_sum(
-            input._tensor, axis, keepdims, output, _config_to_str(config), name
+            input._tensor, axis, keepdims, output, name
         ),
         runtime_id=input.runtime_id,
     )
@@ -404,7 +372,6 @@ def reduce_sum(
 def relu(
     input: Tensor,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "relu",
 ) -> Tensor:
     """
@@ -416,9 +383,7 @@ def relu(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().relu(
-            input._tensor, output, _config_to_str(config), name
-        ),
+        Model.get_model().relu(input._tensor, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -457,7 +422,6 @@ def rope(
     input: Tensor,
     other: Tensor,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "rope",
 ) -> Tensor:
     """
@@ -470,9 +434,7 @@ def rope(
     if input.runtime_id != other.runtime_id:
         raise ValueError("Tensors must be on the same runtime")
     return Tensor(
-        Model.get_model().rope(
-            input._tensor, other._tensor, output, _config_to_str(config), name
-        ),
+        Model.get_model().rope(input._tensor, other._tensor, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -480,7 +442,6 @@ def rope(
 def rsqrt(
     input: Tensor,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "rsqrt",
 ) -> Tensor:
     """
@@ -491,9 +452,7 @@ def rsqrt(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().rsqrt(
-            input._tensor, output, _config_to_str(config), name
-        ),
+        Model.get_model().rsqrt(input._tensor, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -521,7 +480,6 @@ def sharding(
 def sigmoid(
     input: Tensor,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "sigmoid",
 ) -> Tensor:
     """
@@ -533,9 +491,7 @@ def sigmoid(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().sigmoid(
-            input._tensor, output, _config_to_str(config), name
-        ),
+        Model.get_model().sigmoid(input._tensor, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -543,7 +499,6 @@ def sigmoid(
 def sqrt(
     input: Tensor,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "sqrt",
 ) -> Tensor:
     """
@@ -554,9 +509,7 @@ def sqrt(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().sqrt(
-            input._tensor, output, _config_to_str(config), name
-        ),
+        Model.get_model().sqrt(input._tensor, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -565,7 +518,6 @@ def sub(
     input: Tensor,
     other: Union[Tensor, float],
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "sub",
 ) -> Tensor:
     """
@@ -581,9 +533,7 @@ def sub(
             raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
     return Tensor(
-        Model.get_model().sub(
-            input._tensor, other, output, _config_to_str(config), name
-        ),
+        Model.get_model().sub(input._tensor, other, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -613,7 +563,6 @@ def transpose(
     input: Tensor,
     perm: Iterable[int],
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "transpose",
 ) -> Tensor:
     """
@@ -633,9 +582,7 @@ def transpose(
     if len(perm) > 4:
         raise ValueError("Only support perm up to 4 dimensions")
     return Tensor(
-        Model.get_model().transpose(
-            input._tensor, perm, output, _config_to_str(config), name
-        ),
+        Model.get_model().transpose(input._tensor, perm, output, name),
         runtime_id=input.runtime_id,
     )
 
@@ -648,7 +595,6 @@ def mean(
     axis: int,
     keepdims: bool = True,
     output: Tensor = NullTensor,
-    config: Union[str, Dict[str, Any]] = "",
     name: str = "mean",
 ) -> Tensor:
     """Alias of reduce_mean."""
@@ -764,9 +710,10 @@ def all_reduce(
     "reshape",
     "identity",
     "sharding",
-    "reduce_sum",
-    "reduce_mean",
+    "noop",
     "reduce_max",
+    "reduce_mean",
+    "reduce_sum",
     "layernorm",
     "softmax",
     "transpose",
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index eed7a4259..089d3eaed 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -4,7 +4,7 @@
 import numpy as np
 from typing import Callable, List, Union, Type
 
-from ._ark_core import _Dims, _Tensor, _NullTensor
+from _ark_core import _Dims, _Tensor, _NullTensor
 from .data_type import DataType
 from .runtime import Runtime
 from .model import Model
@@ -102,63 +102,6 @@ def to_numpy(
         rt.executor.tensor_read(self._tensor, ndarray, stream)
         return ndarray
 
-    def to_torch(
-        self, tensor: torch.Tensor = None, stream: int = 0
-    ) -> torch.Tensor:
-        """ """
-        if _no_torch:
-            raise ImportError("torch is not available")
-        rt = Runtime.get_runtime(self.runtime_id)
-        if not rt.launched():
-            raise RuntimeError(
-                "Tensor is not allocated yet. `Tensor.to_torch()` is "
-                "usable only after you call `Runtime.launch()`."
-            )
-        torch_type = self.dtype().to_torch()
-        if tensor is None:
-            dev_name = f"cuda:{rt.executor.device_id()}"
-            tensor = torch.zeros(
-                self.shape(), dtype=torch_type, device=torch.device(dev_name)
-            )
-        elif list(tensor.shape) != self.shape():
-            raise ValueError(
-                f"torch tensor shape {list(tensor.shape)} "
-                f"does not match the tensor {self.shape()}"
-            )
-        elif tensor.dtype != torch_type:
-            raise ValueError(
-                f"torch tensor dtype {tensor.dtype} "
-                f"does not match the tensor {torch_type}"
-            )
-        elif not tensor.is_contiguous():
-            raise ValueError("torch tensor is not contiguous in memory")
-        elif tensor.numel() != self.nelems():
-            raise ValueError(
-                f"torch tensor size {tensor.numel()} "
-                f"does not match the tensor {self.nelems()}"
-            )
-        tensor_bytes = self.nelems() * self.dtype().element_size()
-        rt.executor.tensor_read(
-            self._tensor, tensor.data_ptr(), tensor_bytes, stream, True
-        )
-        return tensor
-
-    def get_torch_view(self) -> torch.Tensor:
-        """
-        Returns a torch tensor that shares the same memory with the device tensor.
-        """
-        if _no_torch:
-            raise ImportError("torch is not available")
-        rt = Runtime.get_runtime(self.runtime_id)
-        if not rt.launched():
-            raise RuntimeError(
-                "Tensor is not allocated yet. `Tensor.get_torch_view()` is "
-                "usable only after you call `Runtime.launch()`."
-            )
-        dl_tensor = rt.executor.get_dl_tensor(self._tensor)
-        torch_view = torch.utils.dlpack.from_dlpack(dl_tensor)
-        return torch_view
-
     def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor":
         """
         Copies the tensor from a host numpy array to the device.
@@ -177,6 +120,37 @@ def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor":
         rt.executor.tensor_write(self._tensor, ndarray, stream)
         return self
 
+    def to_dlpack(self):
+        """
+        Returns a DLPack tensor that shares the same memory with the device tensor.
+        """
+        rt = Runtime.get_runtime(self.runtime_id)
+        if not rt.launched():
+            raise RuntimeError(
+                "Tensor is not allocated yet. `Tensor.to_dlpack()` is "
+                "usable only after you call `Runtime.launch()`."
+            )
+        return rt.executor.tensor_to_dlpack(self._tensor)
+
+    @staticmethod
+    def from_dlpack(ext_tensor, runtime_id: int = -1) -> "Tensor":
+        """
+        Copies the tensor from a DLPack tensor to the device.
+        """
+        return Tensor(_Tensor(ext_tensor), runtime_id=runtime_id)
+
+    def to_torch(self) -> torch.Tensor:
+        """
+        Returns a torch tensor that shares the same memory with the device tensor.
+        """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        dl_capsule = self.to_dlpack()
+        torch_view = torch.utils.dlpack.from_dlpack(dl_capsule)
+        # Keep dl_capsule alive not to free the memory
+        torch_view.__ark_buffer__ = dl_capsule
+        return torch_view
+
     @staticmethod
     def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor":
         """
@@ -188,10 +162,10 @@ def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor":
             raise ValueError("Torch tensor must be contiguous.")
         elif tensor.device.type == "cpu":
             raise ValueError("Torch tensor must be on a device.")
-        ark_dtype = DataType.from_torch(tensor.dtype)
-        dl_capsule = torch.utils.dlpack.to_dlpack(tensor)
-        ark_tensor = _Tensor(dl_capsule, ark_dtype.ctype())
-        return Tensor(ark_tensor, runtime_id=runtime_id)
+        return Tensor.from_dlpack(
+            torch.utils.dlpack.to_dlpack(tensor),
+            runtime_id=runtime_id,
+        )
 
     def copy(
         self, data: Union[np.ndarray, torch.Tensor], stream: int = 0
diff --git a/python/ark_py.cpp b/python/ark_py.cpp
index 75788ba55..1bc4255d6 100644
--- a/python/ark_py.cpp
+++ b/python/ark_py.cpp
@@ -7,7 +7,6 @@
 
 namespace py = pybind11;
 
-extern void register_plan_manager(py::module &m);
 extern void register_data_type(py::module &m);
 extern void register_dims(py::module &m);
 extern void register_error(py::module &m);
@@ -23,7 +22,6 @@ extern void register_version(py::module &m);
 PYBIND11_MODULE(_ark_core, m) {
     m.doc() = "Bind ARK C++ APIs to Python";
 
-    register_plan_manager(m);
     register_data_type(m);
     register_dims(m);
     register_error(m);
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 126970d89..d90825e21 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -8,8 +8,10 @@
 
 #include <ark/executor.hpp>
 #include <ark/model.hpp>
-#include <iostream>
-#include <stdexcept>
+
+#include "gpu/gpu_memory.hpp"
+#include "logging.hpp"
+
 namespace py = pybind11;
 
 static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
@@ -42,37 +44,37 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
                      reinterpret_cast<ark::Stream>(stream), is_d2d);
 }
 
-static DLDataType get_dl_dtype(const ark::DataType &ark_data_type) {
-    DLDataType dl_data_type;
-    dl_data_type.lanes = 1;
-    if (ark_data_type == ark::FP32) {
-        dl_data_type.code = kDLFloat;
-        dl_data_type.bits = 32;
-    } else if (ark_data_type == ark::FP16) {
-        dl_data_type.code = kDLFloat;
-        dl_data_type.bits = 16;
-    } else if (ark_data_type == ark::BF16) {
-        dl_data_type.code = kDLBfloat;
-        dl_data_type.bits = 16;
-    } else if (ark_data_type == ark::INT32) {
-        dl_data_type.code = kDLInt;
-        dl_data_type.bits = 32;
-    } else if (ark_data_type == ark::UINT32) {
-        dl_data_type.code = kDLUInt;
-        dl_data_type.bits = 32;
-    } else if (ark_data_type == ark::INT8) {
-        dl_data_type.code = kDLInt;
-        dl_data_type.bits = 8;
-    } else if (ark_data_type == ark::UINT8) {
-        dl_data_type.code = kDLUInt;
-        dl_data_type.bits = 8;
-    } else if (ark_data_type == ark::BYTE) {
-        dl_data_type.code = kDLUInt;
-        dl_data_type.bits = 8;
+static DLDataType to_dl_dtype(const ark::DataType &ark_dtype) {
+    DLDataType dl_dtype;
+    dl_dtype.lanes = 1;
+    if (ark_dtype == ark::FP32) {
+        dl_dtype.code = kDLFloat;
+        dl_dtype.bits = 32;
+    } else if (ark_dtype == ark::FP16) {
+        dl_dtype.code = kDLFloat;
+        dl_dtype.bits = 16;
+    } else if (ark_dtype == ark::BF16) {
+        dl_dtype.code = kDLBfloat;
+        dl_dtype.bits = 16;
+    } else if (ark_dtype == ark::INT32) {
+        dl_dtype.code = kDLInt;
+        dl_dtype.bits = 32;
+    } else if (ark_dtype == ark::UINT32) {
+        dl_dtype.code = kDLUInt;
+        dl_dtype.bits = 32;
+    } else if (ark_dtype == ark::INT8) {
+        dl_dtype.code = kDLInt;
+        dl_dtype.bits = 8;
+    } else if (ark_dtype == ark::UINT8) {
+        dl_dtype.code = kDLUInt;
+        dl_dtype.bits = 8;
+    } else if (ark_dtype == ark::BYTE) {
+        dl_dtype.code = kDLUInt;
+        dl_dtype.bits = 8;
     } else {
-        throw std::runtime_error("unexpected error");
+        ERR(ark::InternalError, "unexpected");
     }
-    return dl_data_type;
+    return dl_dtype;
 }
 
 static DLDeviceType get_device_type() {
@@ -85,66 +87,84 @@ static DLDeviceType get_device_type() {
 #endif
 }
 
-static DLManagedTensor *to_dlpack(ark::Executor &exe,
-                                  const ark::Tensor &tensor) {
-    DLTensor dl_tensor;
-    dl_tensor.data = reinterpret_cast<void *>(exe.tensor_address(tensor));
-    size_t offset_in_elements =
-        tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0];
-    dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes();
-    dl_tensor.device.device_type = get_device_type();
-    dl_tensor.device.device_id = static_cast<int32_t>(exe.device_id());
-    dl_tensor.ndim = static_cast<int32_t>(tensor.shape().ndims());
-    dl_tensor.dtype = get_dl_dtype(tensor.data_type());
-
-    dl_tensor.shape =
-        tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
-    dl_tensor.strides =
-        tensor.strides().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
-    auto shape = tensor.shape();
-    if (dl_tensor.shape) {
-        for (int i = 0; i < dl_tensor.ndim; ++i) {
-            dl_tensor.shape[i] = shape[i];
-        }
-    }
-    if (dl_tensor.strides) {
-        dl_tensor.strides[dl_tensor.ndim - 1] = 1;
-        for (int i = dl_tensor.ndim - 2; i >= 0; --i) {
-            dl_tensor.strides[i] =
-                dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1];
+namespace ark {
+
+class SharedTensor {
+   public:
+    SharedTensor(Executor &exe, const Tensor &tensor);
+    ~SharedTensor() = default;
+
+    DLTensor dl_tensor() const;
+
+   private:
+    std::shared_ptr<GpuMemory> buffer_;
+    void *data_;
+    int device_id_;
+    DataType dtype_;
+    std::shared_ptr<std::vector<int64_t>> shape_;
+    std::shared_ptr<std::vector<int64_t>> strides_;
+    std::shared_ptr<std::vector<int64_t>> offsets_;
+};
+
+SharedTensor::SharedTensor(Executor &exe, const Tensor &tensor) {
+    buffer_ = exe.buffer();
+    data_ = reinterpret_cast<void *>(exe.tensor_address(tensor));
+    device_id_ = exe.device_id();
+    dtype_ = tensor.data_type();
+    shape_ = std::make_shared<std::vector<int64_t>>(tensor.shape().vector());
+    offsets_ =
+        std::make_shared<std::vector<int64_t>>(tensor.offsets().vector());
+
+    strides_ = std::make_shared<std::vector<int64_t>>();
+    if (!shape_->empty()) {
+        int ndims = static_cast<int>(shape_->size());
+        strides_->resize(shape_->size());
+        strides_->back() = 1;
+        auto tmp = tensor.strides().vector();
+        for (int i = ndims - 2; i >= 0; --i) {
+            (*strides_)[i] = (*strides_)[i + 1] * tmp[i + 1];
         }
     }
-    DLManagedTensor *dl_managed_tensor = new DLManagedTensor();
-    dl_managed_tensor->dl_tensor = dl_tensor;
-    dl_managed_tensor->manager_ctx = nullptr;
-    dl_managed_tensor->deleter = [](DLManagedTensor *self) {
-        if (self->dl_tensor.shape) {
-            delete[] self->dl_tensor.shape;
-            self->dl_tensor.shape = nullptr;
-        }
-        if (self->dl_tensor.strides) {
-            delete[] self->dl_tensor.strides;
-            self->dl_tensor.strides = nullptr;
-        }
-    };
-    return dl_managed_tensor;
 }
 
-void free_capsule(PyObject *capsule) {
-    const char *name = PyCapsule_GetName(capsule);
-    auto *dl_managed_tensor =
-        static_cast<DLManagedTensor *>(PyCapsule_GetPointer(capsule, name));
-    if (dl_managed_tensor) {
-        dl_managed_tensor->deleter(dl_managed_tensor);
-        dl_managed_tensor = nullptr;
-    }
+DLTensor SharedTensor::dl_tensor() const {
+    DLTensor dl_tensor;
+    dl_tensor.data = data_;
+    size_t offset_in_elements = offsets_->empty() ? 0 : offsets_->at(0);
+    dl_tensor.byte_offset = offset_in_elements * dtype_.bytes();
+    dl_tensor.device.device_type = get_device_type();
+    dl_tensor.device.device_id = device_id_;
+    dl_tensor.ndim = static_cast<int32_t>(shape_->size());
+    dl_tensor.dtype = to_dl_dtype(dtype_);
+    dl_tensor.shape = shape_->data();
+    dl_tensor.strides = strides_->data();
+    return dl_tensor;
 }
 
-py::capsule to_dlpack_capsule(ark::Executor &self, const ark::Tensor &tensor) {
-    DLManagedTensor *dl_managed_tensor = to_dlpack(self, tensor);
+}  // namespace ark
+
+static py::capsule tensor_to_dlpack(ark::Executor &self, const ark::Tensor &tensor) {
+    auto shared_tensor = new ark::SharedTensor(self, tensor);
+    DLManagedTensor *dl_managed_tensor = new DLManagedTensor();
+    dl_managed_tensor->dl_tensor = shared_tensor->dl_tensor();
+    dl_managed_tensor->manager_ctx = shared_tensor;
+    dl_managed_tensor->deleter = [](DLManagedTensor *self) {
+        if (self->manager_ctx) {
+            delete static_cast<ark::SharedTensor *>(self->manager_ctx);
+            self->manager_ctx = nullptr;
+        }
+    };
     const char *capsule_name = "dltensor";
     PyObject *dl_capsule = PyCapsule_New(static_cast<void *>(dl_managed_tensor),
-                                         capsule_name, free_capsule);
+                                         capsule_name, [](PyObject *capsule) {
+            const char *name = PyCapsule_GetName(capsule);
+            auto *dl_managed_tensor = static_cast<DLManagedTensor *>(
+                PyCapsule_GetPointer(capsule, name));
+            if (dl_managed_tensor) {
+                dl_managed_tensor->deleter(dl_managed_tensor);
+                dl_managed_tensor = nullptr;
+            }
+        });
     return py::reinterpret_steal<py::capsule>(dl_capsule);
 }
 
@@ -191,5 +211,5 @@ void register_executor(py::module &m) {
                                size_t, uintptr_t, bool>(&tensor_write),
              py::arg("tensor"), py::arg("address"), py::arg("bytes"),
              py::arg("stream"), py::arg("is_d2d"))
-        .def("get_dl_tensor", &to_dlpack_capsule);
+        .def("tensor_to_dlpack", &tensor_to_dlpack);
 }
diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp
index 16eb03421..e7f06592d 100644
--- a/python/tensor_py.cpp
+++ b/python/tensor_py.cpp
@@ -8,6 +8,8 @@
 
 #include <ark/tensor.hpp>
 
+#include "logging.hpp"
+
 namespace py = pybind11;
 
 struct DLTensorMetadata {
@@ -40,12 +42,37 @@ static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor* dl_tensor) {
     return metadata;
 }
 
+static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) {
+    if (dl_dtype.lanes != 1) {
+        ERR(ark::UnsupportedError, "unsupported data type");
+    }
+    ark::DataType ark_dtype;
+    if (dl_dtype.code == kDLFloat && dl_dtype.bits == 32) {
+        ark_dtype = ark::FP32;
+    } else if (dl_dtype.code == kDLFloat && dl_dtype.bits == 16) {
+        ark_dtype = ark::FP16;
+    } else if (dl_dtype.code == kDLBfloat && dl_dtype.bits == 16) {
+        ark_dtype = ark::BF16;
+    } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 32) {
+        ark_dtype = ark::INT32;
+    } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 32) {
+        ark_dtype = ark::UINT32;
+    } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 8) {
+        ark_dtype = ark::INT8;
+    } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 8) {
+        ark_dtype = ark::UINT8;
+    } else {
+        ERR(ark::UnsupportedError, "unsupported data type");
+    }
+    return ark_dtype;
+}
+
 void register_tensor(py::module& m) {
     py::class_<ark::Tensor>(m, "_Tensor")
-        .def(py::init([](py::capsule capsule, const ark::DataType& dtype) {
+        .def(py::init([](py::capsule capsule) {
             DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule;
             if (!dl_tensor) {
-                throw std::runtime_error(
+                ERR(ark::InvalidUsageError,
                     "Capsule does not contain a DLManagedTensor");
             }
             DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor);
@@ -53,7 +80,7 @@ void register_tensor(py::module& m) {
             void* data_ptr = metadata.data_ptr;
             auto shape = metadata.shape;
 
-            return new ark::Tensor(data_ptr, device_id, shape, dtype);
+            return ark::Tensor(data_ptr, device_id, shape, from_dl_dtype(metadata.dtype));
         }))
         .def("id", &ark::Tensor::id)
         .def("shape", &ark::Tensor::shape, py::return_value_policy::reference)
diff --git a/python/unittest/test.py b/python/unittest/test.py
index 238b16fb6..d56932b83 100644
--- a/python/unittest/test.py
+++ b/python/unittest/test.py
@@ -10,4 +10,4 @@
 from test_error import *
 from test_model import *
 from test_runtime import *
-from test_conversion import *
+from test_tensor import *
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index 8c00b51f8..c3d15d1b9 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -20,99 +20,99 @@ def test_runtime_relaunch():
         assert rt.launched() == True
 
 
-def test_multiple_runtime_launch():
-    ark.init()
-    num_runtimes = 5
-    for i in range(num_runtimes):
-        rt = ark.Runtime.get_runtime(i)
-        assert rt.launched() == False
-        rt.launch(plan=empty_plan, device_id=i)
-        assert rt.launched() == True
-    for i in range(num_runtimes):
-        rt = ark.Runtime.get_runtime(i)
-        assert rt.launched() == True
-    ark.Runtime.delete_all_runtimes()
-
-
-def test_stop_runtime():
-    ark.init()
-    rt1 = ark.Runtime.get_runtime(1)
-    rt1.launch(plan=empty_plan, device_id=1)
-    rt2 = ark.Runtime.get_runtime(2)
-    rt2.launch(plan=empty_plan, device_id=2)
-    rt1.stop()
-    rt1.reset()
-    assert rt1.state == ark.Runtime.State.Init
-    assert rt2.state == ark.Runtime.State.LaunchedNotRunning
-    ark.Runtime.delete_all_runtimes()
-
-
-def test_reset_runtime():
-    ark.init()
-    rt1 = ark.Runtime.get_runtime(0)
-    rt1.launch(plan=empty_plan, device_id=1)
-    rt2 = ark.Runtime.get_runtime(1)
-    rt2.launch(plan=empty_plan, device_id=2)
-    rt1.reset()
-    assert rt1.launched() == False
-    assert rt2.launched() == True
-    rt1.launch(plan=empty_plan)
-    assert rt1.launched() == True
-    ark.Runtime.delete_all_runtimes()
-
-
-def test_multiple_runtimes_complex():
-    ark.init()
-    num_runtimes = 3
-    runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)]
-    default_runtime = ark.Runtime.get_runtime()
-    runtime_list.append(default_runtime)
-    for i, rt in enumerate(runtime_list):
-        rt.launch(plan=empty_plan, device_id=i)
-        assert rt.launched() == True
-    runtime_list[0].stop()
-    assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
-    for rt in runtime_list[1:]:
-        assert rt.launched() == True
-    runtime_list[1].reset()
-    assert runtime_list[1].state == ark.Runtime.State.Init
-    assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
-    assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning
-    runtime_list[1].launch(plan=empty_plan, device_id=1)
-    for rt in runtime_list:
-        assert rt.launched() == True
-    ark.Runtime.delete_all_runtimes()
-
-
-def test_runtime_state_after_reset():
-    ark.init()
-    rt = ark.Runtime.get_runtime()
-    rt.launch(plan=empty_plan)
-    rt.reset()
-    assert rt.launched() == False
-    assert rt.running() == False
-    ark.Runtime.delete_all_runtimes()
-
-
-def test_see_runtime_statuses():
-    ark.init()
-    num_runtimes = 3
-    runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)]
-    runtime_statuses = ark.Runtime.see_runtime_statuses()
-    assert len(runtime_statuses) == num_runtimes
-    for i in range(num_runtimes):
-        assert i in runtime_statuses
-    for i, rt in enumerate(runtimes):
-        assert runtime_statuses[i] == rt
-    ark.Runtime.delete_all_runtimes()
-
-
-def test_multiple_runtimes_init():
-    ark.init()
-    runtimes = [ark.Runtime.get_runtime(i) for i in range(3)]
-    for rt in runtimes:
-        assert rt.state == ark.Runtime.State.Init
-    ark.init()
-    runtimes = ark.Runtime.see_runtime_statuses()
-    assert len(runtimes) == 0
-    ark.Runtime.delete_all_runtimes()
+# def test_multiple_runtime_launch():
+#     ark.init()
+#     num_runtimes = 5
+#     for i in range(num_runtimes):
+#         rt = ark.Runtime.get_runtime(i)
+#         assert rt.launched() == False
+#         rt.launch(plan=empty_plan, device_id=i)
+#         assert rt.launched() == True
+#     for i in range(num_runtimes):
+#         rt = ark.Runtime.get_runtime(i)
+#         assert rt.launched() == True
+#     ark.Runtime.delete_all_runtimes()
+
+
+# def test_stop_runtime():
+#     ark.init()
+#     rt1 = ark.Runtime.get_runtime(1)
+#     rt1.launch(plan=empty_plan, device_id=1)
+#     rt2 = ark.Runtime.get_runtime(2)
+#     rt2.launch(plan=empty_plan, device_id=2)
+#     rt1.stop()
+#     rt1.reset()
+#     assert rt1.state == ark.Runtime.State.Init
+#     assert rt2.state == ark.Runtime.State.LaunchedNotRunning
+#     ark.Runtime.delete_all_runtimes()
+
+
+# def test_reset_runtime():
+#     ark.init()
+#     rt1 = ark.Runtime.get_runtime(0)
+#     rt1.launch(plan=empty_plan, device_id=1)
+#     rt2 = ark.Runtime.get_runtime(1)
+#     rt2.launch(plan=empty_plan, device_id=2)
+#     rt1.reset()
+#     assert rt1.launched() == False
+#     assert rt2.launched() == True
+#     rt1.launch(plan=empty_plan)
+#     assert rt1.launched() == True
+#     ark.Runtime.delete_all_runtimes()
+
+
+# def test_multiple_runtimes_complex():
+#     ark.init()
+#     num_runtimes = 3
+#     runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)]
+#     default_runtime = ark.Runtime.get_runtime()
+#     runtime_list.append(default_runtime)
+#     for i, rt in enumerate(runtime_list):
+#         rt.launch(plan=empty_plan, device_id=i)
+#         assert rt.launched() == True
+#     runtime_list[0].stop()
+#     assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
+#     for rt in runtime_list[1:]:
+#         assert rt.launched() == True
+#     runtime_list[1].reset()
+#     assert runtime_list[1].state == ark.Runtime.State.Init
+#     assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
+#     assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning
+#     runtime_list[1].launch(plan=empty_plan, device_id=1)
+#     for rt in runtime_list:
+#         assert rt.launched() == True
+#     ark.Runtime.delete_all_runtimes()
+
+
+# def test_runtime_state_after_reset():
+#     ark.init()
+#     rt = ark.Runtime.get_runtime()
+#     rt.launch(plan=empty_plan)
+#     rt.reset()
+#     assert rt.launched() == False
+#     assert rt.running() == False
+#     ark.Runtime.delete_all_runtimes()
+
+
+# def test_see_runtime_statuses():
+#     ark.init()
+#     num_runtimes = 3
+#     runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)]
+#     runtime_statuses = ark.Runtime.see_runtime_statuses()
+#     assert len(runtime_statuses) == num_runtimes
+#     for i in range(num_runtimes):
+#         assert i in runtime_statuses
+#     for i, rt in enumerate(runtimes):
+#         assert runtime_statuses[i] == rt
+#     ark.Runtime.delete_all_runtimes()
+
+
+# def test_multiple_runtimes_init():
+#     ark.init()
+#     runtimes = [ark.Runtime.get_runtime(i) for i in range(3)]
+#     for rt in runtimes:
+#         assert rt.state == ark.Runtime.State.Init
+#     ark.init()
+#     runtimes = ark.Runtime.see_runtime_statuses()
+#     assert len(runtimes) == 0
+#     ark.Runtime.delete_all_runtimes()
diff --git a/python/unittest/test_tensor.py b/python/unittest/test_tensor.py
new file mode 100644
index 000000000..1acad43ee
--- /dev/null
+++ b/python/unittest/test_tensor.py
@@ -0,0 +1,23 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from unittest_common import pytest_ark
+import ark
+
+
+@pytest_ark(need_torch=True)
+def test_tensor_torch():
+    import torch
+
+    ones = torch.ones(2, 1024, device=torch.device("cuda:0"))
+
+    t = ark.Tensor.from_torch(ones)
+    t = ark.mul(t, 5)
+
+    with ark.Runtime() as rt:
+        rt.launch()
+        rt.run()
+
+        x = t.to_torch()
+
+    assert torch.allclose(x, ones * 5)

From 8c2562b3b7ddeb5736bd10be28768222b7ad9a56 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 11 Aug 2024 21:22:35 +0000
Subject: [PATCH 57/79] remove runtime ID

---
 ark/api/tensor.cpp    |   4 +-
 python/ark/init.py    |   5 +-
 python/ark/ops.py     | 132 ++++++++++--------------------------------
 python/ark/runtime.py | 106 ++++++---------------------------
 python/ark/tensor.py  |  25 +++-----
 5 files changed, 61 insertions(+), 211 deletions(-)

diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp
index 4d33bd9f1..4b5bdfd55 100644
--- a/ark/api/tensor.cpp
+++ b/ark/api/tensor.cpp
@@ -57,14 +57,14 @@ Dims Tensor::padded_shape() const {
     return Dims();
 }
 
-const DataType& Tensor::data_type() const {
+const DataType &Tensor::data_type() const {
     if (ref_) {
         return DataType::from_name(ref_->data_type()->type_name());
     }
     return NONE;
 }
 
-std::ostream& operator<<(std::ostream& os, const Tensor& tensor) {
+std::ostream &operator<<(std::ostream &os, const Tensor &tensor) {
     if (tensor.is_null()) {
         os << "null";
     } else {
diff --git a/python/ark/init.py b/python/ark/init.py
index dbf7c1569..a4a67e85d 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -9,6 +9,7 @@
 def init():
     """Initializes ARK."""
     Model.reset()
-    if _RuntimeState.runtime:
-        _RuntimeState.delete_all()
+    if _RuntimeState.runtime is not None:
+        del _RuntimeState.runtime
+        _RuntimeState.runtime = None
     _ark_core.init()
diff --git a/python/ark/ops.py b/python/ark/ops.py
index 7d98f51c2..5fe144150 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from typing import Any, Dict, List, Iterable, Union
+from typing import List, Iterable, Union
 
 from .tensor import Dims, Tensor, Parameter, NullTensor
 from .data_type import DataType, fp32
@@ -61,8 +61,6 @@ def add(
     tensor_add = ark.add(tensor1, tensor2)
     """
     if isinstance(input, Tensor) and isinstance(other, Tensor):
-        if input.runtime_id != other.runtime_id:
-            raise ValueError("Tensors must be on the same runtime")
         a = input._tensor
         b = other._tensor
     elif isinstance(input, Tensor):
@@ -79,10 +77,7 @@ def add(
         )
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(
-        Model.get_model().add(a, b, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().add(a, b, output, name))
 
 
 def cast(
@@ -95,8 +90,7 @@ def cast(
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().cast(input._tensor, dtype.ctype(), output, name),
-        runtime_id=input.runtime_id,
+        Model.get_model().cast(input._tensor, dtype.ctype(), output, name)
     )
 
 
@@ -105,12 +99,10 @@ def constant(
     shape: Iterable[int],
     dtype: DataType = fp32,
     name: str = "constant",
-    runtime_id: int = -1,
 ) -> Tensor:
     """Constant."""
     return Tensor(
-        Model.get_model().constant(value, Dims(shape), dtype.ctype(), name),
-        runtime_id=runtime_id,
+        Model.get_model().constant(value, Dims(shape), dtype.ctype(), name)
     )
 
 
@@ -124,10 +116,7 @@ def copy(
         output = output._tensor
     if isinstance(input, Tensor):
         intput = intput._tensor
-    return Tensor(
-        Model.get_model().copy(intput, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().copy(intput, output, name))
 
 
 def div(
@@ -145,13 +134,8 @@ def div(
     if output is not NullTensor:
         output = output._tensor
     if isinstance(other, Tensor):
-        if input.runtime_id != other.runtime_id:
-            raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
-    return Tensor(
-        Model.get_model().div(input._tensor, other, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().div(input._tensor, other, output, name))
 
 
 def embedding(
@@ -161,15 +145,10 @@ def embedding(
     name: str = "embedding",
 ) -> Tensor:
     """Embedding layer."""
-    if input.runtime_id != weight.runtime_id:
-        raise ValueError("Tensors must be on the same runtime")
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().embedding(
-            input._tensor, weight._tensor, output, name
-        ),
-        runtime_id=input.runtime_id,
+        Model.get_model().embedding(input._tensor, weight._tensor, output, name)
     )
 
 
@@ -185,10 +164,7 @@ def exp(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(
-        Model.get_model().exp(input._tensor, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().exp(input._tensor, output, name))
 
 
 def gelu(
@@ -206,10 +182,7 @@ def gelu(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(
-        Model.get_model().gelu(input._tensor, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().gelu(input._tensor, output, name))
 
 
 def identity(
@@ -224,13 +197,8 @@ def identity(
     for dep in deps:
         if not isinstance(dep, Tensor):
             raise TypeError("All dependencies should be a tensor")
-        if input.runtime_id != dep.runtime_id:
-            raise ValueError("All tensors must be on the same runtime")
         dep_tensors.append(dep._tensor)
-    return Tensor(
-        Model.get_model().identity(input._tensor, dep_tensors, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().identity(input._tensor, dep_tensors, name))
 
 
 def matmul(
@@ -250,8 +218,6 @@ def matmul(
     Usage:
     tensor_matmul = ark.matmul(tensor1, tensor2)
     """
-    if input.runtime_id != other.runtime_id:
-        raise ValueError("Tensors must be on the same runtime")
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
@@ -262,8 +228,7 @@ def matmul(
             transpose_input,
             transpose_other,
             name,
-        ),
-        runtime_id=input.runtime_id,
+        )
     )
 
 
@@ -282,13 +247,8 @@ def mul(
     if output is not NullTensor:
         output = output._tensor
     if isinstance(other, Tensor):
-        if input.runtime_id != other.runtime_id:
-            raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
-    return Tensor(
-        Model.get_model().mul(input._tensor, other, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().mul(input._tensor, other, output, name))
 
 
 def noop(input: Tensor, name: str = "noop"):
@@ -316,8 +276,7 @@ def reduce_max(
     return Tensor(
         Model.get_model().reduce_max(
             input._tensor, axis, keepdims, output, name
-        ),
-        runtime_id=input.runtime_id,
+        )
     )
 
 
@@ -339,8 +298,7 @@ def reduce_mean(
     return Tensor(
         Model.get_model().reduce_mean(
             input._tensor, axis, keepdims, output, name
-        ),
-        runtime_id=input.runtime_id,
+        )
     )
 
 
@@ -364,8 +322,7 @@ def reduce_sum(
     return Tensor(
         Model.get_model().reduce_sum(
             input._tensor, axis, keepdims, output, name
-        ),
-        runtime_id=input.runtime_id,
+        )
     )
 
 
@@ -382,10 +339,7 @@ def relu(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(
-        Model.get_model().relu(input._tensor, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().relu(input._tensor, output, name))
 
 
 def reshape(
@@ -413,8 +367,7 @@ def reshape(
     if len(shape) > 4:
         raise ValueError("Only support tensors with up to 4 dimensions")
     return Tensor(
-        Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name),
-        runtime_id=input.runtime_id,
+        Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name)
     )
 
 
@@ -431,11 +384,8 @@ def rope(
     """
     if output is not NullTensor:
         output = output._tensor
-    if input.runtime_id != other.runtime_id:
-        raise ValueError("Tensors must be on the same runtime")
     return Tensor(
-        Model.get_model().rope(input._tensor, other._tensor, output, name),
-        runtime_id=input.runtime_id,
+        Model.get_model().rope(input._tensor, other._tensor, output, name)
     )
 
 
@@ -451,10 +401,7 @@ def rsqrt(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(
-        Model.get_model().rsqrt(input._tensor, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().rsqrt(input._tensor, output, name))
 
 
 def sharding(
@@ -472,9 +419,7 @@ def sharding(
     _tensor_list = Model.get_model().sharding(
         input._tensor, axis, dim_per_shard, name
     )
-    return [
-        Tensor(_tensor, runtime_id=input.runtime_id) for _tensor in _tensor_list
-    ]
+    return [Tensor(_tensor) for _tensor in _tensor_list]
 
 
 def sigmoid(
@@ -490,10 +435,7 @@ def sigmoid(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(
-        Model.get_model().sigmoid(input._tensor, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().sigmoid(input._tensor, output, name))
 
 
 def sqrt(
@@ -508,10 +450,7 @@ def sqrt(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(
-        Model.get_model().sqrt(input._tensor, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().sqrt(input._tensor, output, name))
 
 
 def sub(
@@ -529,13 +468,8 @@ def sub(
     if output is not NullTensor:
         output = output._tensor
     if isinstance(other, Tensor):
-        if input.runtime_id != other.runtime_id:
-            raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
-    return Tensor(
-        Model.get_model().sub(input._tensor, other, output, name),
-        runtime_id=input.runtime_id,
-    )
+    return Tensor(Model.get_model().sub(input._tensor, other, output, name))
 
 
 def tensor(
@@ -546,7 +480,6 @@ def tensor(
     padded_shape: Iterable[int] = [],
     rank: int = -1,
     name: str = "",
-    runtime_id: int = -1,
 ) -> Tensor:
     """
     Construct a tensor with given shape and data type.
@@ -582,8 +515,7 @@ def transpose(
     if len(perm) > 4:
         raise ValueError("Only support perm up to 4 dimensions")
     return Tensor(
-        Model.get_model().transpose(input._tensor, perm, output, name),
-        runtime_id=input.runtime_id,
+        Model.get_model().transpose(input._tensor, perm, output, name)
     )
 
 
@@ -598,19 +530,17 @@ def mean(
     name: str = "mean",
 ) -> Tensor:
     """Alias of reduce_mean."""
-    return reduce_mean(input, axis, keepdims, output, config, name)
+    return reduce_mean(input, axis, keepdims, output, name)
 
 
 def ones(
     shape: Iterable[int],
     dtype: DataType = fp32,
-    name: str = "ones",
-    runtime_id: int = -1,
+    name: str = "ones"
 ) -> Tensor:
     """Ones."""
     return Tensor(
-        Model.get_model().constant(1, Dims(shape), dtype.ctype(), name),
-        runtime_id=runtime_id,
+        Model.get_model().constant(1, Dims(shape), dtype.ctype(), name)
     )
 
 
@@ -621,14 +551,12 @@ def parameter(
     offsets: Iterable[int] = [],
     padded_shape: Iterable[int] = [],
     name: str = "",
-    runtime_id: int = -1,
 ) -> Parameter:
     """
     Construct a parameter with given shape and data type.
     """
     return Parameter(
-        _tensor(shape, dtype, strides, offsets, padded_shape, name),
-        runtime_id=runtime_id,
+        _tensor(shape, dtype, strides, offsets, padded_shape, name)
     )
 
 
@@ -665,12 +593,10 @@ def zeros(
     shape: Iterable[int],
     dtype: DataType = fp32,
     name: str = "zeros",
-    runtime_id: int = -1,
 ) -> Tensor:
     """Zeros."""
     return Tensor(
-        Model.get_model().constant(0, Dims(shape), dtype.ctype(), name),
-        runtime_id=runtime_id,
+        Model.get_model().constant(0, Dims(shape), dtype.ctype(), name)
     )
 
 
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index c2e507bca..671953df1 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -14,36 +14,7 @@ class _RuntimeState:
     The _RuntimeState class is used to store the state of the model.
     """
 
-    runtime: Dict[int, "Runtime"] = {}
-
-    @staticmethod
-    def reset_all():
-        """
-        Resets all runtimes.
-        """
-        runtime_ids = list(_RuntimeState.runtime.keys())
-        for runtime_id in runtime_ids:
-            _RuntimeState.runtime[runtime_id].reset()
-
-    @staticmethod
-    def delete_all():
-        """
-        Deletes all runtimes.
-        """
-        runtime_ids = list(_RuntimeState.runtime.keys())
-        for runtime_id in runtime_ids:
-            _RuntimeState.runtime[runtime_id].reset(delete=True)
-
-    @staticmethod
-    def print_runtime_states():
-        """
-        Print runtimes and their corresponding states.
-        """
-        print(f"{'Runtime ID':<12} | {'Status':<20}")
-        print(f"{'-'*12} | {'-'*20}")
-        for runtime_id, runtime in _RuntimeState.runtime.items():
-            runtime_id = "-1(Default)" if runtime_id == -1 else runtime_id
-            print(f"{runtime_id:<12} | {runtime.state:<20}")
+    runtime = None
 
 
 class Executor(_Executor):
@@ -64,11 +35,10 @@ class State(Enum):
         LaunchedNotRunning = 1
         Running = 2
 
-    def __init__(self, runtime_id: int = -1):
-        self.runtime_id = runtime_id
+    def __init__(self):
         self.executor: Executor = None
         self.state: Runtime.State = Runtime.State.Init
-        _RuntimeState.runtime[runtime_id] = self
+        _RuntimeState.runtime = self
 
     def get_state(self) -> "Runtime.State":
         """
@@ -77,36 +47,16 @@ def get_state(self) -> "Runtime.State":
         return self.state
 
     @staticmethod
-    def exists(runtime_id: int) -> bool:
-        """
-        Check if a runtime exists with the given ID.
+    def get_runtime() -> "Runtime":
         """
-        return runtime_id in _RuntimeState.runtime
-
-    @staticmethod
-    def get_all_ids() -> List[int]:
-        """
-        Get a list of all existing runtime IDs.
-        """
-        return list(_RuntimeState.runtime.keys())
-
-    @staticmethod
-    def get_runtime(runtime_id=-1) -> "Runtime":
-        """
-        Get the runtime by ID. If runtime_id is not provided, use a default ID of -1.
-        If the runtime does not exist, create a new runtime with the given ID.
-        """
-        if runtime_id not in _RuntimeState.runtime:
-            _RuntimeState.runtime[runtime_id] = Runtime(runtime_id)
-        return _RuntimeState.runtime[runtime_id]
-
-    @staticmethod
-    def see_runtime_statuses() -> "Dict[int, Runtime]":
-        """
-        Returns the runtime dictionary containing all of the runtimes.
+        Get the runtime.
+        If the runtime does not exist, create a new runtime.
         """
+        if _RuntimeState.runtime is None:
+            _RuntimeState.runtime = Runtime()
         return _RuntimeState.runtime
 
+
     def __enter__(self):
         return self
 
@@ -142,7 +92,7 @@ def launch(
         """
         if self.launched():
             logging.warning(
-                f"Runtime {self.runtime_id} is already launched, skip launching"
+                f"Runtime is already launched, skip launching"
             )
             return
         plan = Planner(device_id).plan() if plan is None else plan
@@ -152,7 +102,7 @@ def launch(
             if self.executor is not None:
                 if not self.executor.destroyed():
                     logging.warning(
-                        f"Runtime {self.runtime_id}, has already been launched. Destroying the old executor"
+                        f"Runtime has already been launched. Destroying the old executor"
                     )
                     self.executor.destroy()
             self.executor = Executor(
@@ -171,8 +121,8 @@ def run(self, iter=1, non_blocking=False):
         Run the ARK program for iter iterations and wait for the kernel to finish.
         """
         if self.state != Runtime.State.LaunchedNotRunning:
-            logging.error(f"ARK runtime {self.runtime_id} is not launched")
-            raise RuntimeError(f"ARK runtime {self.runtime_id} is not launched")
+            logging.error(f"ARK runtime is not launched")
+            raise RuntimeError(f"ARK runtime is not launched")
         self.state = Runtime.State.Running
         self.executor.run(iter)
         if not non_blocking:
@@ -193,7 +143,7 @@ def wait(self):
         """
         if self.state != Runtime.State.Running:
             logging.warning(
-                f"ARK runtime {self.runtime_id} is not running, skip waiting"
+                f"ARK runtime is not running, skip waiting"
             )
             return
         self.executor.wait()
@@ -206,7 +156,7 @@ def stop(self) -> float:
         """
         if not self.launched():
             logging.warning(
-                f"ARK runtime {self.runtime_id} is never launched, skip stopping"
+                f"ARK runtime is never launched, skip stopping"
             )
             return
         elapsed = self.executor.stop()
@@ -215,7 +165,7 @@ def stop(self) -> float:
 
     def reset(self, delete=False):
         """
-        Reset the runtime. If delete is True, delete the runtime associated with the runtime_id.
+        Reset the runtime. If delete is True, delete the runtime.
         """
         if self.launched():
             self.stop()
@@ -225,25 +175,5 @@ def reset(self, delete=False):
             self.executor = None
         self.state = Runtime.State.Init
         if delete:
-            del _RuntimeState.runtime[self.runtime_id]
-
-    @staticmethod
-    def reset_all_runtimes():
-        """
-        Reset all runtimes.
-        """
-        _RuntimeState.reset_all()
-
-    @staticmethod
-    def delete_all_runtimes():
-        """
-        Delete all runtimes.
-        """
-        _RuntimeState.delete_all()
-
-    @staticmethod
-    def print_runtime_states():
-        """
-        Print runtimes and their corresponding states.
-        """
-        _RuntimeState.print_runtime_states()
+            del _RuntimeState.runtime
+            _RuntimeState.runtime = None
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 089d3eaed..a950c3d1d 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -33,18 +33,15 @@ def __init__(
         self,
         _tensor: _Tensor,
         initializer: Initializer = None,
-        runtime_id: int = -1,
     ):
         """
         Initializes a new instance of the Tensor class.
         Args:
             _tensor (_ark_core._Tensor): The underlying _Tensor object.
             intializer (Initializer): The initializer for the Tensor.
-            runtime_id (int): The ID of the Runtime to use. Defaults to -1, which is the default Runtime.
         """
         self._tensor = _tensor
         self.initializer: Initializer = initializer
-        self.runtime_id = runtime_id
 
     def shape(self) -> List[int]:
         """
@@ -83,7 +80,7 @@ def to_numpy(
             raise ValueError(
                 f"Tensor data type {self.dtype().__name__} is not supported by numpy."
             )
-        rt = Runtime.get_runtime(self.runtime_id)
+        rt = Runtime.get_runtime()
         if not rt.launched():
             raise RuntimeError(
                 "Tensor is not allocated yet. `Tensor.to_numpy()` is "
@@ -106,7 +103,7 @@ def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor":
         """
         Copies the tensor from a host numpy array to the device.
         """
-        rt = Runtime.get_runtime(self.runtime_id)
+        rt = Runtime.get_runtime()
         if not rt.launched():
             raise RuntimeError(
                 "Tensor is not allocated yet. `Tensor.from_numpy()` is "
@@ -124,7 +121,7 @@ def to_dlpack(self):
         """
         Returns a DLPack tensor that shares the same memory with the device tensor.
         """
-        rt = Runtime.get_runtime(self.runtime_id)
+        rt = Runtime.get_runtime()
         if not rt.launched():
             raise RuntimeError(
                 "Tensor is not allocated yet. `Tensor.to_dlpack()` is "
@@ -133,11 +130,11 @@ def to_dlpack(self):
         return rt.executor.tensor_to_dlpack(self._tensor)
 
     @staticmethod
-    def from_dlpack(ext_tensor, runtime_id: int = -1) -> "Tensor":
+    def from_dlpack(ext_tensor) -> "Tensor":
         """
         Copies the tensor from a DLPack tensor to the device.
         """
-        return Tensor(_Tensor(ext_tensor), runtime_id=runtime_id)
+        return Tensor(_Tensor(ext_tensor))
 
     def to_torch(self) -> torch.Tensor:
         """
@@ -152,7 +149,7 @@ def to_torch(self) -> torch.Tensor:
         return torch_view
 
     @staticmethod
-    def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor":
+    def from_torch(tensor: torch.Tensor) -> "Tensor":
         """
         Returns an ARK tensor that shares the same memory with the torch tensor.
         """
@@ -162,10 +159,7 @@ def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor":
             raise ValueError("Torch tensor must be contiguous.")
         elif tensor.device.type == "cpu":
             raise ValueError("Torch tensor must be on a device.")
-        return Tensor.from_dlpack(
-            torch.utils.dlpack.to_dlpack(tensor),
-            runtime_id=runtime_id,
-        )
+        return Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor))
 
     def copy(
         self, data: Union[np.ndarray, torch.Tensor], stream: int = 0
@@ -174,7 +168,7 @@ def copy(
         Copies data into this tensor. The data type may differ,
         but the size must match.
         """
-        rt = Runtime.get_runtime(self.runtime_id)
+        rt = Runtime.get_runtime()
         if not rt.launched():
             raise RuntimeError(
                 "Tensor is not allocated yet. `Tensor.from_numpy()` is "
@@ -218,9 +212,8 @@ class Parameter(Tensor):
     A tensor as a parameter.
     """
 
-    def __init__(self, _tensor: _Tensor, runtime_id: int = -1):
+    def __init__(self, _tensor: _Tensor):
         """
         Initializes a new instance of the Parameter class.
         """
         super().__init__(_tensor)
-        self.runtime_id = runtime_id

From 9b265c7cc1d70c46919c389952cf37467e917632 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 11 Aug 2024 21:38:14 +0000
Subject: [PATCH 58/79] Fix communication

---
 ark/api/executor.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 50686c434..86243f10d 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -405,20 +405,20 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             continue;
         } else {
             buffer_id_to_offset[buf_info->buffer->id()] = offset;
+            for (const auto &tag_info : buf_info->buffer->send_tags()) {
+                remote_rank_to_send_tags_and_offsets[tag_info.first]
+                    .first.push_back(tag_info.second);
+                remote_rank_to_send_tags_and_offsets[tag_info.first]
+                    .second.push_back(offset);
+            }
+            for (const auto &tag_info : buf_info->buffer->recv_tags()) {
+                remote_rank_to_recv_tags_and_offsets[tag_info.first]
+                    .first.push_back(tag_info.second);
+                remote_rank_to_recv_tags_and_offsets[tag_info.first]
+                    .second.push_back(offset);
+            }
             offset += buf_info->bytes;
         }
-        for (const auto &tag_info : buf_info->buffer->send_tags()) {
-            remote_rank_to_send_tags_and_offsets[tag_info.first]
-                .first.push_back(tag_info.second);
-            remote_rank_to_send_tags_and_offsets[tag_info.first]
-                .second.push_back(offset);
-        }
-        for (const auto &tag_info : buf_info->buffer->recv_tags()) {
-            remote_rank_to_recv_tags_and_offsets[tag_info.first]
-                .first.push_back(tag_info.second);
-            remote_rank_to_recv_tags_and_offsets[tag_info.first]
-                .second.push_back(offset);
-        }
     }
     total_bytes_ = offset;
 

From 5d5342a27cd6a5d761757a92a074c86c3e0e3a62 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 11 Aug 2024 22:06:54 +0000
Subject: [PATCH 59/79] Add `Tensor::torch_strides` method

---
 ark/api/tensor.cpp         | 17 +++++++++++++++++
 ark/include/ark/tensor.hpp |  2 ++
 python/ark/ops.py          |  4 +---
 python/ark/runtime.py      | 13 +++----------
 python/executor_py.cpp     | 13 ++-----------
 python/tensor_py.cpp       | 15 ++++++---------
 6 files changed, 31 insertions(+), 33 deletions(-)

diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp
index 4b5bdfd55..4fb60aef6 100644
--- a/ark/api/tensor.cpp
+++ b/ark/api/tensor.cpp
@@ -64,6 +64,23 @@ const DataType &Tensor::data_type() const {
     return NONE;
 }
 
+Dims Tensor::torch_strides() const {
+    if (ref_) {
+        Dims st = ref_->strides();
+        int ndims = st.ndims();
+        std::vector<DimType> tmp;
+        for (int i = 1; i < ndims; ++i) {
+            tmp.push_back(st[i]);
+        }
+        tmp.push_back(1);
+        for (int i = ndims - 2; i >= 0; --i) {
+            tmp[i] *= tmp[i + 1];
+        }
+        return Dims(tmp);
+    }
+    return Dims();
+}
+
 std::ostream &operator<<(std::ostream &os, const Tensor &tensor) {
     if (tensor.is_null()) {
         os << "null";
diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp
index d13748175..5e463f99d 100644
--- a/ark/include/ark/tensor.hpp
+++ b/ark/include/ark/tensor.hpp
@@ -52,6 +52,8 @@ class Tensor {
     Dims padded_shape() const;
 
     const DataType &data_type() const;
+
+    Dims torch_strides() const;
 };
 
 const Tensor NullTensor;
diff --git a/python/ark/ops.py b/python/ark/ops.py
index 5fe144150..f8b75a70b 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -534,9 +534,7 @@ def mean(
 
 
 def ones(
-    shape: Iterable[int],
-    dtype: DataType = fp32,
-    name: str = "ones"
+    shape: Iterable[int], dtype: DataType = fp32, name: str = "ones"
 ) -> Tensor:
     """Ones."""
     return Tensor(
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 671953df1..712addc29 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -56,7 +56,6 @@ def get_runtime() -> "Runtime":
             _RuntimeState.runtime = Runtime()
         return _RuntimeState.runtime
 
-
     def __enter__(self):
         return self
 
@@ -91,9 +90,7 @@ def launch(
         initialized. The executor will compile the cuda kernels and launch the ARK runtime.
         """
         if self.launched():
-            logging.warning(
-                f"Runtime is already launched, skip launching"
-            )
+            logging.warning(f"Runtime is already launched, skip launching")
             return
         plan = Planner(device_id).plan() if plan is None else plan
         # If the RuntimeState is init, we need to create a new executor and
@@ -142,9 +139,7 @@ def wait(self):
         Wait for the kernel to finish.
         """
         if self.state != Runtime.State.Running:
-            logging.warning(
-                f"ARK runtime is not running, skip waiting"
-            )
+            logging.warning(f"ARK runtime is not running, skip waiting")
             return
         self.executor.wait()
         self.state = Runtime.State.LaunchedNotRunning
@@ -155,9 +150,7 @@ def stop(self) -> float:
         Once this is called, we need to call `launch()` again to run the model again.
         """
         if not self.launched():
-            logging.warning(
-                f"ARK runtime is never launched, skip stopping"
-            )
+            logging.warning(f"ARK runtime is never launched, skip stopping")
             return
         elapsed = self.executor.stop()
         self.state = Runtime.State.LaunchedNotRunning
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index d90825e21..f42e59ee9 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -112,19 +112,10 @@ SharedTensor::SharedTensor(Executor &exe, const Tensor &tensor) {
     device_id_ = exe.device_id();
     dtype_ = tensor.data_type();
     shape_ = std::make_shared<std::vector<int64_t>>(tensor.shape().vector());
+    strides_ =
+        std::make_shared<std::vector<int64_t>>(tensor.torch_strides().vector());
     offsets_ =
         std::make_shared<std::vector<int64_t>>(tensor.offsets().vector());
-
-    strides_ = std::make_shared<std::vector<int64_t>>();
-    if (!shape_->empty()) {
-        int ndims = static_cast<int>(shape_->size());
-        strides_->resize(shape_->size());
-        strides_->back() = 1;
-        auto tmp = tensor.strides().vector();
-        for (int i = ndims - 2; i >= 0; --i) {
-            (*strides_)[i] = (*strides_)[i + 1] * tmp[i + 1];
-        }
-    }
 }
 
 DLTensor SharedTensor::dl_tensor() const {
diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp
index e7f06592d..5abb35c66 100644
--- a/python/tensor_py.cpp
+++ b/python/tensor_py.cpp
@@ -83,15 +83,12 @@ void register_tensor(py::module& m) {
             return ark::Tensor(data_ptr, device_id, shape, from_dl_dtype(metadata.dtype));
         }))
         .def("id", &ark::Tensor::id)
-        .def("shape", &ark::Tensor::shape, py::return_value_policy::reference)
-        .def("strides", &ark::Tensor::strides,
-             py::return_value_policy::reference)
-        .def("offsets", &ark::Tensor::offsets,
-             py::return_value_policy::reference)
-        .def("padded_shape", &ark::Tensor::padded_shape,
-             py::return_value_policy::reference)
-        .def("data_type", &ark::Tensor::data_type,
-             py::return_value_policy::reference);
+        .def("shape", &ark::Tensor::shape)
+        .def("strides", &ark::Tensor::strides)
+        .def("offsets", &ark::Tensor::offsets)
+        .def("padded_shape", &ark::Tensor::padded_shape)
+        .def("data_type", &ark::Tensor::data_type)
+        .def("torch_strides", &ark::Tensor::torch_strides);
 
     m.attr("_NullTensor") = &ark::NullTensor;
 }

From 598cb78b351de2471e3d2386374504f4820adcd4 Mon Sep 17 00:00:00 2001
From: Noli Gerawork <ngerawor@andrew.cmu.edu>
Date: Sun, 11 Aug 2024 18:18:51 -0400
Subject: [PATCH 60/79] Torch Support (#237)

- Adds `RuntimeModule`.
- Adds support for running multiple consecutive plans.
- Pass buffers (externally allocated or from a previous plan), as kernel
arguments
- Adds gradient computation logic for ARK tensors/parameters

---------

Co-authored-by: Changho Hwang <changhohwang@microsoft.com>
---
 ark/api/executor.cpp                     | 114 +++++++++++++---
 ark/api/tensor.cpp                       |   7 +-
 ark/codegen.cpp                          | 101 +++++++++++---
 ark/codegen.hpp                          |   4 +-
 ark/include/ark/error.hpp                |   2 +-
 ark/include/ark/executor.hpp             |   5 +-
 ark/include/kernels/kernel_template.in   |  18 +--
 docs/env.md                              |   4 +
 examples/tutorial/model_test_tutorial.py | 163 +++++++++++++++++++++++
 examples/tutorial/torch_tutorial.py      |  27 ----
 python/ark/init.py                       |   4 +-
 python/ark/module.py                     | 144 +++++++++++---------
 python/ark/runtime.py                    |  21 ++-
 python/ark/tensor.py                     |  55 +++++++-
 python/ark/torch_mock.py                 |  20 +++
 python/executor_py.cpp                   |   3 +-
 python/unittest/test_runtime.py          | 154 ++++++++-------------
 17 files changed, 592 insertions(+), 254 deletions(-)
 create mode 100644 examples/tutorial/model_test_tutorial.py
 delete mode 100644 examples/tutorial/torch_tutorial.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 86243f10d..4634ed6fd 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -25,9 +25,9 @@
 #include "gpu/gpu_manager.hpp"
 #include "logging.hpp"
 #include "model/model_buffer.hpp"
-#include "model_buffer_manager.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
+#include "model_buffer_manager.hpp"
 #include "utils/utils_net.hpp"
 
 #if defined(ARK_CUDA)
@@ -148,16 +148,17 @@ class Executor::Impl {
     Impl(int device_id, Stream stream, const std::string &name, bool loop_mode);
     ~Impl();
 
-    void init(const PlanJson& plan);
+    void init(const PlanJson &plan);
 
     int device_id() const { return device_id_; }
 
     Stream stream() const { return reinterpret_cast<Stream>(stream_raw_); }
 
-    std::shared_ptr<GpuMemory> buffer() const { return buffer_; }
+    std::shared_ptr<GpuMemory> buffer() const { return buffers_.back(); }
 
     std::string plan() const { return plan_json_.dump_pretty(); }
 
+    void add_plan(const std::string &plan);
     void compile();
     void launch();
     void run(int iter);
@@ -165,7 +166,7 @@ class Executor::Impl {
     float stop(int64_t max_spin_count);
     void barrier();
 
-    uintptr_t tensor_address(const Tensor &tensor) const;
+    void *tensor_address(const Tensor &tensor) const;
 
     void tensor_read(const Tensor &tensor, void *data, size_t bytes,
                      Stream stream, bool is_d2d) const;
@@ -175,6 +176,8 @@ class Executor::Impl {
    private:
     void init_communicator();
     std::map<size_t, size_t> init_buffers(const Json &plan_json);
+    std::map<size_t, void *> init_buffer_addrs(
+        void *buffer_base, const std::map<size_t, size_t> &buffer_id_to_offset);
     std::set<int> init_remote_ranks(const Json &plan_json) const;
     void init_channels(const std::set<int> &remote_ranks);
 
@@ -183,6 +186,8 @@ class Executor::Impl {
     std::string name_;
     bool loop_mode_;
 
+    bool is_buffer_allocated_;
+
     gpuStream stream_raw_;
 
     int rank_;
@@ -193,12 +198,16 @@ class Executor::Impl {
     float elapsed_msec_ = -1;
 
     PlanJson plan_json_;
+    std::vector<void *> external_buffers_;
+    std::vector<std::string> external_args_;
+    std::map<size_t, std::string> buffer_id_to_name_;
     std::map<size_t, size_t> buffer_id_to_offset_;
+    std::map<size_t, void *> buffer_id_to_addr_;
     size_t total_bytes_;
     std::shared_ptr<CodeGenerator> codegen_;
     std::shared_ptr<GpuEvent> timer_begin_;
     std::shared_ptr<GpuEvent> timer_end_;
-    std::shared_ptr<GpuMemory> buffer_;
+    std::vector<std::shared_ptr<GpuMemory>> buffers_;
     std::shared_ptr<GpuHostMemory> flag_;
     std::shared_ptr<GpuStream> stream_;
     std::shared_ptr<GpuKernel> kernel_;
@@ -239,11 +248,12 @@ void Executor::Impl::init(const PlanJson &plan_json) {
         ERR(InvalidUsageError, "Invalid rank ", rank_, " with world size ",
             world_size_);
     }
-    if (world_size_ > 1) {
+    if (world_size_ > 1 && !comm_) {
         init_communicator();
     }
 
     auto gpu_manager = GpuManager::get_instance(device_id_);
+
     if (!gpu_manager->info().arch->belongs_to(
             Arch::from_name(plan_json.at("Architecture")))) {
         LOG(WARN, "Architecture name of the plan `",
@@ -260,12 +270,20 @@ void Executor::Impl::init(const PlanJson &plan_json) {
             std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", ";
     }
 
-    codegen_ = std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_,
-                                               name_);
-
     timer_begin_ = gpu_manager->create_event();
     timer_end_ = gpu_manager->create_event();
-    buffer_ = gpu_manager->malloc(total_bytes_, 65536);
+    if (total_bytes_ > 0) {
+        buffers_.push_back(gpu_manager->malloc(total_bytes_, 65536));
+        is_buffer_allocated_ = true;
+    }
+
+    buffer_id_to_addr_ =
+        init_buffer_addrs(buffers_.back()->ref(), buffer_id_to_offset_);
+
+    codegen_ = std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_,
+                                               external_args_,
+                                               buffer_id_to_name_, name_);
+
     flag_ = gpu_manager->malloc_host(
         sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined);
 
@@ -282,6 +300,8 @@ void Executor::Impl::init(const PlanJson &plan_json) {
 
     std::string kernel_name;
     if (loop_mode_) {
+        // should we add an identifier to specify which plan the kernel executes
+        // i.e. ark_loop_kernel_2 for the second plan
         kernel_name = "ark_loop_kernel";
     } else {
         kernel_name = "ark_kernel";
@@ -304,6 +324,21 @@ void Executor::Impl::init_communicator() {
     comm_ = std::make_shared<mscclpp::Communicator>(bootstrap);
 }
 
+std::map<size_t, void *> Executor::Impl::init_buffer_addrs(
+    void *buffer_base, const std::map<size_t, size_t> &buffer_id_to_offset) {
+    std::map<size_t, void *> buffer_id_to_addr;
+    // Reuse existing buffer addresses for new plans that use previous tensors
+    // from earlier plans
+    if (!buffer_id_to_addr_.empty()) {
+        buffer_id_to_addr = buffer_id_to_addr_;
+    }
+    for (const auto &kv : buffer_id_to_offset) {
+        buffer_id_to_addr[kv.first] =
+            static_cast<char *>(buffer_base) + kv.second;
+    }
+    return buffer_id_to_addr;
+}
+
 std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
     class BufferInfo {
        public:
@@ -402,6 +437,23 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
                 ERR(InvalidUsageError,
                     "PyTorch tensor and model execution are on different GPUs");
             }
+            external_buffers_.push_back(buf_info->buffer->external_data());
+            const auto [it, inserted] = buffer_id_to_name_.try_emplace(
+                buf_info->buffer->id(),
+                "extern_buf_" + std::to_string(buf_info->buffer->id()));
+            external_args_.push_back(it->second);
+            continue;
+        }
+        // if we are adding a plan and come across a buffer from a previous
+        // plan, we utilize the buffer offset from the previous plan
+        if (buffer_id_to_offset_.find(buf_info->buffer->id()) !=
+            buffer_id_to_offset_.end()) {
+            external_buffers_.push_back(
+                buffer_id_to_addr_[buf_info->buffer->id()]);
+            const std::string name =
+                "extern_buf_" + std::to_string(buf_info->buffer->id());
+            external_args_.push_back(name);
+            buffer_id_to_name_[buf_info->buffer->id()] = name;
             continue;
         } else {
             buffer_id_to_offset[buf_info->buffer->id()] = offset;
@@ -552,7 +604,9 @@ std::set<int> Executor::Impl::init_remote_ranks(const Json &plan_json) const {
 }
 
 void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
-    proxy_service_ = std::make_shared<mscclpp::ProxyService>();
+    if (!proxy_service_) {
+        proxy_service_ = std::make_shared<mscclpp::ProxyService>();
+    }
 
     int num_ranks_per_node = get_env().num_ranks_per_host;
     auto rank_to_node = [&](int rank) { return rank / num_ranks_per_node; };
@@ -569,8 +623,8 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
     if (!get_env().disable_ib) {
         all_transports |= IBs[device_id_];
     }
-    mscclpp::RegisteredMemory regmem =
-        comm_->registerMemory(buffer_->ref(), buffer_->bytes(), all_transports);
+    mscclpp::RegisteredMemory regmem = comm_->registerMemory(
+        buffers_.back()->ref(), buffers_.back()->bytes(), all_transports);
 
     std::map<int, std::vector<mscclpp::NonblockingFuture<
                       std::shared_ptr<mscclpp::Connection>>>>
@@ -644,6 +698,15 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
     }
 }
 
+void Executor::Impl::add_plan(const std::string &plan) {
+    external_buffers_.clear();
+    external_args_.clear();
+    buffer_id_to_name_.clear();
+    total_bytes_ = 0;
+    is_buffer_allocated_ = false;
+    init(Json::parse(plan));
+}
+
 void Executor::Impl::compile() { kernel_->compile(); }
 
 void Executor::Impl::launch() {
@@ -708,9 +771,12 @@ void Executor::Impl::launch() {
     if (loop_mode_) {
         // Initialize loop flags.
         atomicStoreRelaxed(flag_->ref<int>(), 0);
-        void *buf_ptr = buffer_->ref();
         void *flag_ptr = flag_->ref();
+        void *buf_ptr = buffers_.back()->ref();
         std::vector<void *> args = {&buf_ptr, &flag_ptr};
+        for (auto &buffer : external_buffers_) {
+            args.push_back(&buffer);
+        }
         kernel_->launch(stream_raw_, args);
     }
     is_recording_ = true;
@@ -724,9 +790,12 @@ void Executor::Impl::run(int iter) {
         }
         atomicStoreRelaxed(flag_->ref<int>(), iter);
     } else {
-        void *buf_ptr = buffer_->ref();
+        void *buf_ptr = buffers_.back()->ref();
         int i = 0;
         std::vector<void *> args = {&buf_ptr, reinterpret_cast<void *>(&i)};
+        for (auto &buffer : external_buffers_) {
+            args.push_back(&buffer);
+        }
         for (; i < iter; i++) {
             kernel_->launch(stream_raw_, args);
         }
@@ -793,13 +862,12 @@ void Executor::Impl::barrier() {
     }
 }
 
-uintptr_t Executor::Impl::tensor_address(const Tensor &tensor) const {
+void *Executor::Impl::tensor_address(const Tensor &tensor) const {
     size_t buffer_id = tensor.ref()->buffer()->id();
-    if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) {
+    if (buffer_id_to_addr_.find(buffer_id) == buffer_id_to_addr_.end()) {
         ERR(InternalError, "Invalid buffer ID: ", buffer_id);
     }
-    size_t offset = buffer_id_to_offset_.at(buffer_id);
-    return reinterpret_cast<uintptr_t>(buffer_->ref(offset));
+    return buffer_id_to_addr_.at(buffer_id);
 }
 
 void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
@@ -830,7 +898,7 @@ void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
             ") mismatches the tensor data bytes (", tensor_data_bytes, ").");
     }
     auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost;
-    void *src = reinterpret_cast<void *>(tensor_address(tensor));
+    void *src = tensor_address(tensor);
     if (tensor.strides() == tensor.shape()) {
         GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_raw));
     } else {
@@ -888,7 +956,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data,
     size_t tensor_bytes =
         tensor.strides().nelems() * tensor.data_type().bytes();
     auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice;
-    void *dst = reinterpret_cast<void *>(tensor_address(tensor));
+    void *dst = tensor_address(tensor);
     if (tensor.strides() == tensor.shape()) {
         GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_raw));
     } else {
@@ -940,6 +1008,8 @@ std::shared_ptr<GpuMemory> Executor::buffer() const { return impl_->buffer(); }
 
 std::string Executor::plan() const { return impl_->plan(); }
 
+void Executor::add_plan(const std::string &plan) { impl_->add_plan(plan); }
+
 void Executor::compile() { impl_->compile(); }
 
 void Executor::launch() { impl_->launch(); }
@@ -961,7 +1031,7 @@ void Executor::destroy() {
 
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
-uintptr_t Executor::tensor_address(const Tensor &tensor) const {
+void *Executor::tensor_address(const Tensor &tensor) const {
     return impl_->tensor_address(tensor);
 }
 
diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp
index 4fb60aef6..084ce6383 100644
--- a/ark/api/tensor.cpp
+++ b/ark/api/tensor.cpp
@@ -10,15 +10,14 @@
 namespace ark {
 
 Tensor::Tensor(void* data_ptr, int32_t device_id,
-               const std::vector<int64_t>& shape,
-               const DataType& dtype) {
+               const std::vector<int64_t>& shape, const DataType& dtype) {
     size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1,
                                                 std::multiplies<int64_t>()) *
                                 dtype.bytes();
     auto buffer =
         std::make_shared<ModelBuffer>(data_ptr, external_data_size, device_id);
-    auto tensor = std::make_shared<ModelTensor>(dtype.ref(), buffer, Dims(shape),
-                                                Dims(shape), Dims(), Dims());
+    auto tensor = std::make_shared<ModelTensor>(
+        dtype.ref(), buffer, Dims(shape), Dims(shape), Dims(), Dims());
     ref_ = tensor;
 }
 
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index 1619b863f..2bd36d679 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -10,10 +10,10 @@
 #include "file_io.h"
 #include "logging.hpp"
 #include "model/model_buffer.hpp"
-#include "model_buffer_manager.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_op.hpp"
 #include "model/model_tensor.hpp"
+#include "model_buffer_manager.hpp"
 #include "range.hpp"
 #include "utils/utils_math.hpp"
 
@@ -25,7 +25,18 @@ static std::string replace(
         size_t pos = 0;
         while ((pos = result.find(kv.first, pos)) != std::string::npos) {
             result.replace(pos, kv.first.length(), kv.second);
-            pos += kv.second.length();
+            if ((kv.first == "@GLOBAL_ARGS@" || kv.first == "@FUNCTION_ARGS@" ||
+                 kv.first == "@ARG_TYPES@") &&
+                kv.second.empty()) {
+                size_t comma_pos = pos;
+                if (comma_pos >= 2 && result.substr(comma_pos - 2, 2) == ", ") {
+                    result.erase(comma_pos - 2, 2);
+                    pos -= 2;
+                }
+
+            } else {
+                pos += kv.second.length();
+            }
         }
     }
     return result;
@@ -44,6 +55,8 @@ class CodeGenerator::Impl {
    public:
     Impl(const PlanJson &plan,
          const std::map<size_t, size_t> &buffer_id_to_offset,
+         const std::vector<std::string> &external_args,
+         const std::map<size_t, std::string> &buffer_id_to_name,
          const std::string &name);
     ~Impl() = default;
 
@@ -69,6 +82,8 @@ class CodeGenerator::Impl {
     friend class CodeGenerator;
 
     std::map<size_t, size_t> buffer_id_to_offset_;
+    std::vector<std::string> external_args_;
+    std::map<size_t, std::string> buffer_id_to_name_;
     std::string name_;
     int rank_;
     int world_size_;
@@ -77,10 +92,15 @@ class CodeGenerator::Impl {
     std::string code_;
 };
 
-CodeGenerator::Impl::Impl(const PlanJson &plan,
-                          const std::map<size_t, size_t> &buffer_id_to_offset,
-                          const std::string &name)
-    : buffer_id_to_offset_(buffer_id_to_offset), name_(name) {
+CodeGenerator::Impl::Impl(
+    const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
+    const std::vector<std::string> &external_args,
+    const std::map<size_t, std::string> &buffer_id_to_name,
+    const std::string &name)
+    : buffer_id_to_offset_(buffer_id_to_offset),
+      external_args_(external_args),
+      buffer_id_to_name_(buffer_id_to_name),
+      name_(name) {
     rank_ = plan.at("Rank");
     world_size_ = plan.at("WorldSize");
     num_procs_ = plan.at("NumProcessors");
@@ -169,6 +189,30 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
     if (!is_file(template_path)) {
         ERR(InternalError, "kernel template file not found: ", template_path);
     }
+
+    // Generate the global arguments
+    std::stringstream global_args_ss, function_args_ss, arg_types_ss;
+    for (const auto &arg : external_args_) {
+        global_args_ss << "void *" << arg << ", ";
+        function_args_ss << arg << ", ";
+        arg_types_ss << "void *, ";
+    }
+    std::string global_args = global_args_ss.str();
+    std::string function_args = function_args_ss.str();
+    std::string arg_types = arg_types_ss.str();
+    if (!global_args.empty()) {
+        global_args.pop_back();
+        global_args.pop_back();
+    }
+    if (!function_args.empty()) {
+        function_args.pop_back();
+        function_args.pop_back();
+    }
+    if (!arg_types.empty()) {
+        arg_types.pop_back();
+        arg_types.pop_back();
+    }
+
     std::string template_code = read_file(template_path);
     std::map<std::string, std::string> replacements = {
         {"@NUM_BLOCKS@", std::to_string(num_procs_)},
@@ -176,6 +220,9 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
         {"@DEFINITIONS@", definitions_ss.str()},
         {"@BODY@", body_ss.str()},
         {"@NAME@", (name_.empty() ? "" : "_" + name_)},
+        {"@GLOBAL_ARGS@", global_args},
+        {"@FUNCTION_ARGS@", function_args},
+        {"@ARG_TYPES@", arg_types},
     };
     code_ = replace(template_code, replacements);
 }
@@ -215,7 +262,7 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
         ss << this->def_op(op_json, task_json["Id"], op_idx++);
     }
     ss << "__device__ void t" << task_json["Id"]
-       << "(char* _buf, int _idx, int _spw) {\n";
+       << "(char *_buf, int _idx, int _spw, @GLOBAL_ARGS@) {\n";
     op_idx = 0;
     for (auto &op_json : task_json["Ops"]) {
         auto op = ModelOp::deserialize(op_json);
@@ -225,25 +272,32 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
             auto &arg = impl_args[i];
             if (arg.type_name() == "TENSOR") {
                 auto tns = arg.value<ModelTensorRef>();
-                if (tns->buffer()->is_external()) {
-                    void *buf_addr =
-                        ModelBufferManager::get_instance().get_buffer(
-                            tns->buffer()->id());
-                    ss << "(" << tns->data_type()->type_str() << "*)"
-                       << buf_addr;
-                } else {
-                    size_t buffer_offset =
-                        buffer_id_to_offset_.at(tns->buffer()->id());
+                size_t buffer_id = tns->buffer()->id();
+                if (buffer_id_to_name_.find(buffer_id) ==
+                    buffer_id_to_name_.end()) {
+                    size_t buffer_offset = buffer_id_to_offset_.at(buffer_id);
                     size_t offset = buffer_offset + ModelOffset(tns).value();
                     ss << "(" << tns->data_type()->type_str() << "*)&_buf["
                        << offset << "]";
+                } else {
+                    ss << "(" << tns->data_type()->type_str() << "*)"
+                       << buffer_id_to_name_.at(buffer_id);
                 }
             } else if (arg.type_name() == "OFFSET") {
                 auto moff = arg.value<ModelOffset>();
-                size_t buffer_offset =
-                    buffer_id_to_offset_.at(moff.buffer_id());
-                size_t offset = buffer_offset + moff.value();
-                ss << offset;
+                size_t buffer_id = moff.buffer_id();
+                if (buffer_id_to_name_.find(buffer_id) ==
+                    buffer_id_to_name_.end()) {
+                    size_t buffer_offset = buffer_id_to_offset_.at(buffer_id);
+                    size_t offset = buffer_offset + moff.value();
+                    ss << offset;
+                } else {
+                    const std::string &buffer_name =
+                        buffer_id_to_name_.at(buffer_id);
+                    size_t offset = moff.value();
+                    ss << "(uint64_t)((char*)" << buffer_name << " + " << offset
+                       << ")";
+                }
             } else {
                 ss << arg.serialize().begin().value();
             }
@@ -274,7 +328,7 @@ std::string CodeGenerator::Impl::task_seq(
     ss << "task_seq<" << proc_b << ", " << proc_e << ", " << proc_s << ", "
        << proc_cur << ", " << task_b << ", " << task_e << ", " << task_s << ", "
        << task_gran << ", " << num_slots << ", " << slot_num_warps << ", "
-       << slot_sram_bytes << ", t" << task_id << ">(_buf);\n";
+       << slot_sram_bytes << ", t" << task_id << ">(_buf, @FUNCTION_ARGS@);\n";
     return ss.str();
 }
 
@@ -444,8 +498,11 @@ std::string CodeGenerator::Impl::sync_process_range(const Range<size_t> &range,
 
 CodeGenerator::CodeGenerator(
     const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
+    const std::vector<std::string> &external_args,
+    const std::map<size_t, std::string> &buffer_id_to_name,
     const std::string &name)
-    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, name)) {}
+    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, external_args,
+                                   buffer_id_to_name, name)) {}
 
 std::string CodeGenerator::code() const { return impl_->code_; }
 
diff --git a/ark/codegen.hpp b/ark/codegen.hpp
index 1ed8ec9f2..8a4eed270 100644
--- a/ark/codegen.hpp
+++ b/ark/codegen.hpp
@@ -8,8 +8,8 @@
 #include <memory>
 #include <string>
 
-#include "model_buffer_manager.hpp"
 #include "model/model_json.hpp"
+#include "model_buffer_manager.hpp"
 
 namespace ark {
 
@@ -17,6 +17,8 @@ class CodeGenerator {
    public:
     CodeGenerator(const PlanJson &plan,
                   const std::map<size_t, size_t> &buffer_id_to_offset,
+                  const std::vector<std::string> &external_args,
+                  const std::map<size_t, std::string> &buffer_id_to_name,
                   const std::string &name = "ark_kernel");
 
     ~CodeGenerator() = default;
diff --git a/ark/include/ark/error.hpp b/ark/include/ark/error.hpp
index 965b1c0bc..1fbec0c01 100644
--- a/ark/include/ark/error.hpp
+++ b/ark/include/ark/error.hpp
@@ -44,4 +44,4 @@ REGISTER_ERROR_TYPE(UnitTestError)
 
 }  // namespace ark
 
-#endif  // ARK_ERROR_HPP
+#endif  // ARK_ERROR_HPP
\ No newline at end of file
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 02a67cd26..d44ac2302 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -39,6 +39,9 @@ class Executor {
     /// Return the plan string.
     std::string plan() const;
 
+    /// Add a plan to the executor.
+    void add_plan(const std::string &plan);
+
     /// Compile the model. This must be called before `launch()`.
     void compile();
 
@@ -67,7 +70,7 @@ class Executor {
     bool destroyed() const;
 
     /// Return the raw virtual address of the tensor.
-    uintptr_t tensor_address(const Tensor &tensor) const;
+    void *tensor_address(const Tensor &tensor) const;
 
     template <typename T>
     void tensor_read(const Tensor &tensor, std::vector<T> &data,
diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in
index a8a56f141..a05e143d3 100644
--- a/ark/include/kernels/kernel_template.in
+++ b/ark/include/kernels/kernel_template.in
@@ -6,8 +6,8 @@ using namespace ark;
 template <size_t ProcBegin, size_t ProcEnd, size_t ProcStep, size_t ProcCurrent,
           size_t TaskBegin, size_t TaskEnd, size_t TaskStep, size_t TaskGranularity,
           size_t NumSlots, size_t SlotNumWarps, size_t SlotSramBytes,
-          void (*task)(char*, int, int)>
-__forceinline__ __device__ void task_seq(char *_buf) {
+          void (*task)(char*, int, int, @ARG_TYPES@)>
+__forceinline__ __device__ void task_seq(char *_buf, @GLOBAL_ARGS@) {
   if (math::geq<ProcBegin>(blockIdx.x) && math::le<ProcEnd>(blockIdx.x) &&
       ((blockIdx.x - ProcBegin) % ProcStep == 0)) {
     constexpr size_t SlotNumThreads = SlotNumWarps * Arch::ThreadsPerWarp;
@@ -23,7 +23,7 @@ __forceinline__ __device__ void task_seq(char *_buf) {
       size_t task_id = task_id_base + TaskStep *
         (t % TaskGranularity + t / TaskGranularity * TaskGranularity * NumProcs);
       if (task_id >= TaskEnd) break;
-      task(_buf, task_id, SramBytesPerWarp);
+      task(_buf, task_id, SramBytesPerWarp, @FUNCTION_ARGS@);
     }
   }
 }
@@ -33,12 +33,12 @@ __device__ sync::State ARK_LOOP_SYNC_STATE;
 
 @DEFINITIONS@
 
-__device__ void ark_body(char *_buf, int _iter) {
+__device__ void ark_body(char *_buf, int _iter, @GLOBAL_ARGS@) {
 @BODY@
 }
 
 extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1)
-void ark_loop_kernel@NAME@(char *_buf, int *_iter) {
+void ark_loop_kernel@NAME@(char *_buf, int *_iter, @GLOBAL_ARGS@) {
   int *shared_mem = (int *)_ARK_SMEM;
   for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {
     shared_mem[i] = 0;
@@ -52,10 +52,10 @@ void ark_loop_kernel@NAME@(char *_buf, int *_iter) {
     sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
     if (ARK_ITER < 0) return;
 
-    ark_body(_buf, 0);
+    ark_body(_buf, 0, @FUNCTION_ARGS@);
     for (int _i = 1; _i < ARK_ITER; ++_i) {
       sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE);
-      ark_body(_buf, _i);
+      ark_body(_buf, _i, @FUNCTION_ARGS@);
     }
     if (threadIdx.x == 0) {
       __threadfence_system();
@@ -69,10 +69,10 @@ void ark_loop_kernel@NAME@(char *_buf, int *_iter) {
 }
 
 extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1)
-void ark_kernel@NAME@(char *_buf, int _iter) {
+void ark_kernel@NAME@(char *_buf, int _iter, @GLOBAL_ARGS@) {
   int *shared_mem = (int *)_ARK_SMEM;
   for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {
     shared_mem[i] = 0;
   }
-  ark_body(_buf, _iter);
+  ark_body(_buf, _iter, @FUNCTION_ARGS@);
 }
diff --git a/docs/env.md b/docs/env.md
index 2d5839c3b..95330a032 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -27,3 +27,7 @@
 - `ARK_DISABLE_IB` (Default: `0`; Options: `0`, `1`)
 
     If set to `1`, disable ibverbs networking (i.e., disable multi-node execution).
+
+- `ARK_IGNORE_BINARY_CACHE` (Default: `1`; Options: `0`, `1`)
+
+    If set to `1`, ignore the binary cache and force ARK to recompile binaries on each run.
diff --git a/examples/tutorial/model_test_tutorial.py b/examples/tutorial/model_test_tutorial.py
new file mode 100644
index 000000000..ac5a7b2a9
--- /dev/null
+++ b/examples/tutorial/model_test_tutorial.py
@@ -0,0 +1,163 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import ark
+import torch
+import torch.optim as optim
+
+
+# Set random seed for reproducibility.
+torch.manual_seed(42)
+
+# Let's first define a linear layer using ARK.
+class ARKLinear(ark.Module):
+    def __init__(self, weight):
+        super().__init__()
+        self.weight = weight
+
+    def forward(self, input):
+        self.saved_input = input
+        output = ark.matmul(input, self.weight, transpose_other=True)
+        return output
+
+    def backward(self, grad_output):
+        grad_weight = ark.matmul(
+            grad_output, self.saved_input, transpose_input=True
+        )
+        grad_input = ark.matmul(grad_output, self.weight, transpose_other=False)
+        self.weight.update_gradient(grad_weight)
+        return grad_input, grad_weight
+
+
+# Let's use our previous module to define a double linear layer.
+class MyARKModule(ark.Module):
+    def __init__(self, weight0, weight1):
+        super().__init__()
+        self.linear1 = ARKLinear(weight0)
+        self.linear2 = ARKLinear(weight1)
+
+    def forward(self, x):
+        x = self.linear1.forward(x)
+        x = self.linear2.forward(x)
+        return x
+
+    def backward(self, grad_output):
+        grad_x, grad_weight2 = self.linear2.backward(grad_output)
+        grad_x, grad_weight1 = self.linear1.backward(grad_x)
+        return grad_x, grad_weight1, grad_weight2
+
+
+# Define a PyTorch model.
+class SimpleModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.layers = torch.nn.Sequential(
+            torch.nn.Linear(256, 256, bias=False),  # Layer 0
+            torch.nn.Linear(256, 256, bias=False),  # Layer 1
+            torch.nn.Linear(256, 256, bias=False),  # Layer 2
+            torch.nn.Linear(256, 256, bias=False),  # Layer 3
+            torch.nn.Linear(256, 256, bias=False),  # Layer 4
+            torch.nn.ReLU(),  # Activation
+        )
+
+    def forward(self, x):
+        return self.layers(x)
+
+
+# Function to compare the gradients of two models of the same architecture and parameter order.
+def compare_grad(ark_model, torch_model, atol=1e-4, rtol=1e-2):
+    ark_params = list(ark_model.named_parameters())
+    torch_params = list(torch_model.named_parameters())
+    for (ark_name, ark_param), (torch_name, torch_param) in zip(
+        ark_params, torch_params
+    ):
+        if (ark_param.grad is None) ^ (torch_param.grad is None):
+            print("Exactly one of the gradients is None")
+        else:
+            grads_equal = torch.allclose(
+                ark_param.grad, torch_param.grad, atol=atol, rtol=rtol
+            )
+            if not grads_equal:
+                print(
+                    f"Gradient for {ark_name} when compared to {torch_name} is different:"
+                )
+                print(f"ARK gradient: {ark_param.grad}")
+                print(f"Torch gradient: {torch_param.grad}")
+
+
+# For our ARK model we will replace the first two layers with ARK layers.
+def replace_layers_with_ark(model):
+    weight_0 = torch.nn.Parameter(
+        model.layers[0].weight.to("cuda:0").requires_grad_(True)
+    )
+    weight_1 = torch.nn.Parameter(
+        model.layers[1].weight.to("cuda:0").requires_grad_(True)
+    )
+    ark_module = ark.RuntimeModule(MyARKModule(weight_0, weight_1))
+    model.layers[0] = ark_module
+    del model.layers[1]
+
+    # Since we replaced the PyTorch layer with an ARK layer, we need to register the PyTorch parameters
+    # our ARK module utilizes with the original PyTorch model so ARK can leverage PyTorch's optimizers.
+    model.register_parameter("weight_0", weight_0)
+    model.register_parameter("weight_1", weight_1)
+
+    return model
+
+
+# Instantiate our models.
+pytorch_model = SimpleModel()
+ark_model = SimpleModel()
+
+
+# Ensure both models have the same weights.
+ark_model.load_state_dict(pytorch_model.state_dict())
+ark_model = replace_layers_with_ark(ark_model)
+
+
+# Move both models to GPU.
+pytorch_model.to("cuda:0")
+ark_model.to("cuda:0")
+
+# Now let's run the models on some random input.
+input_torch = torch.randn(128, 256).to("cuda:0").requires_grad_(True)
+input_ark = input_torch.clone().detach().requires_grad_(True)
+
+
+# Define an arbitrary target.
+target = torch.randn(128, 256).to("cuda:0")
+
+loss_fn = torch.nn.MSELoss()
+optim_torch = optim.SGD(pytorch_model.parameters(), lr=0.01)
+optim_ark = optim.SGD(ark_model.parameters(), lr=0.01)
+
+num_iters = 5
+for iter in range(num_iters):
+    print(f"Iteration {iter+1}/{num_iters}")
+
+    optim_torch.zero_grad()
+    optim_ark.zero_grad()
+
+    pytorch_output = pytorch_model(input_torch)
+    ark_output = ark_model(input_ark)
+
+    assert torch.allclose(pytorch_output, ark_output, atol=1e-4, rtol=1e-2)
+
+    # Compute losses.
+    torch_loss = loss_fn(pytorch_output, target)
+    ark_loss = loss_fn(ark_output, target)
+
+    # See how ARK's loss compares to PyTorch's loss.
+    print(f"\nPyTorch loss: {torch_loss.item()}")
+    print(f"\nARK loss: {ark_loss.item()}\n")
+    assert torch.allclose(torch_loss, ark_loss, atol=1e-4, rtol=1e-2)
+
+    # Perform a backward pass.
+    torch_loss.backward()
+    ark_loss.backward()
+
+    optim_torch.step()
+    optim_ark.step()
+
+    # Ensure gradients of both models are updated accordingly.
+    compare_grad(ark_model, pytorch_model)
diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py
deleted file mode 100644
index e9482a7cc..000000000
--- a/examples/tutorial/torch_tutorial.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import ark
-import torch
-
-
-class ArkAddModule(ark.RuntimeModule):
-    def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor:
-        return ark.add(x, y)
-
-
-# ARK module for addition
-module = ArkAddModule()
-
-# Define two torch arrays
-x = torch.ones(64) * 2
-y = torch.ones(64) * 3
-
-# Run the ARK module
-z = module(x, y)
-
-w = module(x, z)
-
-# Print the result
-print(z)  # 5
-print(w)  # 7
diff --git a/python/ark/init.py b/python/ark/init.py
index a4a67e85d..29627d645 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -6,10 +6,10 @@
 from .runtime import _RuntimeState
 
 
-def init():
+def init(keep_runtime: bool = False):
     """Initializes ARK."""
     Model.reset()
-    if _RuntimeState.runtime is not None:
+    if not keep_runtime and _RuntimeState.runtime is not None:
         del _RuntimeState.runtime
         _RuntimeState.runtime = None
     _ark_core.init()
diff --git a/python/ark/module.py b/python/ark/module.py
index d797da72c..0fdea23b6 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -3,9 +3,10 @@
 
 import logging
 import numpy as np
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, Union
 from .tensor import Tensor, Parameter
 from .runtime import Runtime, Planner
+from .init import init
 from .ops import tensor
 from .data_type import DataType
 
@@ -25,6 +26,7 @@ class Module:
     """
 
     def __init__(self):
+        super().__init__()
         # The submodules of the module.
         self.sub_modules: dict[str, "Module"] = dict()
         # The parameters of the module.
@@ -34,12 +36,16 @@ def __setattr__(self, __name: str, __value: Any) -> None:
         """
         When setting an attribute, if the attribute is a Module, add it to
         the sub_modules. If the attribute is a Tensor and this Tensor is a
-        parameter, add it to the parameters.
+        parameter, add it to the parameters. If the attribute is a
+        torch.nn.Parameter, convert it to an ARK Parameter before adding.
         """
         if isinstance(__value, Module):
             self.register_module(__name, __value)
         elif isinstance(__value, Parameter):
             self.register_parameter(__name, __value)
+        elif not _no_torch and isinstance(__value, torch.nn.Parameter):
+            __value = Parameter(__value)
+            self.register_parameter(__name, __value)
         super().__setattr__(__name, __value)
 
     def __call__(self, *args: Any, **kwargs: Any):
@@ -131,63 +137,81 @@ def _recursive_ark_to_torch(object):
     return object
 
 
-class RuntimeModule(Module):
-    def __init__(self):
-        if _no_torch:
-            raise ImportError("torch is not available")
-        super().__init__()
-        self.built_forward = False
-        self.built_backward = False
-        self.forward_input_tensor_args: List[Tensor] = []
-        self.forward_input_tensor_kwargs: Dict[str, Tensor] = {}
-        self.forward_input_args = []
-        self.forward_input_kwargs = {}
-        self.forward_output = None
-        self.backward_tensor_args = []
-        self.backward_tensor_kwargs = {}
-
-    def build_forward(self, *args: Any, **kwargs: Any) -> Any: ...
-
-    def build_backward(self, *args: Any, **kwargs: Any) -> Any: ...
-
-    def forward(self, *args: Any, **kwargs: Any) -> Any:
-        if not self.built_forward:
-            for arg in args:
-                if isinstance(arg, torch.Tensor):
-                    self.forward_input_tensor_args.append(
-                        tensor(
-                            list(arg.shape),
-                            DataType.from_torch(arg.dtype),
-                        )
-                    )
-                    self.forward_input_args.append(
-                        self.forward_input_tensor_args[-1]
-                    )
-                else:
-                    self.forward_input_args.append(arg)
-            for key, value in kwargs.items():
-                if isinstance(value, torch.Tensor):
-                    self.forward_input_tensor_kwargs[key] = tensor(
-                        list(value.shape),
-                        DataType.from_torch(value.dtype),
-                    )
-                    self.forward_input_kwargs[key] = (
-                        self.forward_input_tensor_kwargs[key]
-                    )
-                else:
-                    self.forward_input_kwargs[key] = value
-            self.forward_output = self.build_forward(
-                *self.forward_input_args,
-                **self.forward_input_kwargs,
-            )
-            self.built_forward = True
+class _ARKFunction(torch.autograd.Function):
+    """
+    Facilitates the integration of ARK modules with PyTorch's
+    autograd system by defining custom forward and backward passes that
+    utilize the user's defined ARK module.
+    """
 
-        with Runtime.get_runtime() as rt:
-            rt.launch(plan=Planner().plan())
-            for tns, arg in zip(self.forward_input_tensor_args, args):
-                tns.copy(arg)
-            for key, value in self.forward_input_tensor_kwargs.items():
-                value.copy(kwargs[key])
+    @staticmethod
+    def forward(ctx, ark_module, *args, **kwargs):
+        """
+        Returns a PyTorch tensor that is the result
+        of the forward pass of the ARK module.
+        """
+        init(keep_runtime=True)
+        ctx.ark_module = ark_module
+        input_args, input_kwargs = [], {}
+        input_requires_grad = 0
+        for arg in args:
+            if isinstance(arg, torch.Tensor):
+                input_args.append(Tensor.from_torch(arg))
+                if arg.requires_grad:
+                    input_requires_grad += 1
+            else:
+                input_args.append(arg)
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                input_kwargs[k] = Tensor.from_torch(v)
+                if v.requires_grad:
+                    input_requires_grad += 1
+            else:
+                input_kwargs[k] = v
+        ctx.num_inp_grad = input_requires_grad
+        output = ark_module.forward(*input_args, **input_kwargs)
+        rt = Runtime.get_runtime()
+        rt.launch()
+        rt.run()
+        output = output.get_torch_view()
+        rt.reset(persist=True)
+        return output
+
+    @staticmethod
+    def backward(ctx, *grad_outputs):
+        """
+        Converts the gradient outputs to ARK format, computes the gradients for the input
+        and parameters using the ARK module backwards pass, and updates the gradients of the corresponding
+        PyTorch parameters.
+        """
+        init(keep_runtime=True)
+        ark_grad_outputs = [Tensor.from_torch(grad) for grad in grad_outputs]
+        grads = ctx.ark_module.backward(*ark_grad_outputs)
+        grad_inputs, grad_weights = (
+            grads[:ctx.num_inp_grad],
+            grads[ctx.num_inp_grad:],
+        )
+        params_dict = ctx.ark_module.params_dict()
+        rt = Runtime.get_runtime()
+        rt.launch()
+        rt.run()
+        grad_inputs = [grad.get_torch_view() for grad in grad_inputs]
+        for _, param in params_dict.items():
+            if param.staged_tensor is not None:
+                pytorch_grad = param.staged_tensor.get_torch_view()
+                param.torch_param.grad = pytorch_grad
+        rt.reset(persist=True)
+        return (None, *grad_inputs)
+
+
+class RuntimeModule(torch.nn.Module):
+    """
+    Wraps an ARK module to be used as a PyTorch autograd function.
+    """
+
+    def __init__(self, ark_module):
+        super().__init__()
+        self.ark_module = ark_module
 
-            rt.run()
-            return _recursive_ark_to_torch(self.forward_output)
+    def forward(self, *args, **kwargs):
+        return _ARKFunction.apply(self.ark_module, *args, **kwargs)
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 712addc29..071eedd04 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -77,6 +77,14 @@ def running(self) -> bool:
         """
         return self.state == Runtime.State.Running
 
+    def add_plan(self, plan: Plan):
+        """
+        Add a plan to the executor.
+        """
+        if self.executor is None:
+            raise RuntimeError("Executor is not initialized")
+        self.executor.add_plan(str(plan))
+
     def launch(
         self,
         plan: Plan = None,
@@ -89,10 +97,15 @@ def launch(
         the CUDA kernels. The GPU context and the connection between GPUs will be
         initialized. The executor will compile the cuda kernels and launch the ARK runtime.
         """
+        plan = Planner(device_id).plan() if plan is None else plan
         if self.launched():
-            logging.warning(f"Runtime is already launched, skip launching")
+            # If the Runtime state is already launched and we are adding another plan
+            # to the executor, we compile the new kernel and launch the executor again.
+            self.executor.add_plan(str(plan))
+            self.executor.compile()
+            self.executor.launch()
             return
-        plan = Planner(device_id).plan() if plan is None else plan
+
         # If the RuntimeState is init, we need to create a new executor and
         # compile the kernels
         if self.state == Runtime.State.Init:
@@ -156,12 +169,14 @@ def stop(self) -> float:
         self.state = Runtime.State.LaunchedNotRunning
         return elapsed
 
-    def reset(self, delete=False):
+    def reset(self, delete=False, persist=False):
         """
         Reset the runtime. If delete is True, delete the runtime.
         """
         if self.launched():
             self.stop()
+        if persist:
+            return
         if self.executor is not None:
             if not self.executor.destroyed():
                 self.executor.destroy()
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index a950c3d1d..ba1af52db 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -33,15 +33,18 @@ def __init__(
         self,
         _tensor: _Tensor,
         initializer: Initializer = None,
+        requires_grad: bool = False,
     ):
         """
         Initializes a new instance of the Tensor class.
         Args:
             _tensor (_ark_core._Tensor): The underlying _Tensor object.
-            intializer (Initializer): The initializer for the Tensor.
+            initializer (Initializer): The initializer for the Tensor.
+            requires_grad (bool): Whether the tensor requires gradient. Defaults to True.
         """
         self._tensor = _tensor
         self.initializer: Initializer = initializer
+        self.requires_grad = requires_grad
 
     def shape(self) -> List[int]:
         """
@@ -171,7 +174,7 @@ def copy(
         rt = Runtime.get_runtime()
         if not rt.launched():
             raise RuntimeError(
-                "Tensor is not allocated yet. `Tensor.from_numpy()` is "
+                "Tensor is not allocated yet. `Tensor.copy()` is "
                 "usable only after you call `Runtime.launch()`."
             )
         tensor_bytes = self.nelems() * self.dtype().element_size()
@@ -187,6 +190,9 @@ def copy(
                 stream,
                 data.device.type == "cuda",
             )
+            data.requires_grad = self.requires_grad
+            if isinstance(self, Parameter):
+                self.torch_param = data
         elif isinstance(data, np.ndarray):
             if not data.flags["C_CONTIGUOUS"]:
                 data = np.ascontiguousarray(data)
@@ -207,13 +213,50 @@ def initialize(self) -> "Tensor":
         return self
 
 
-class Parameter(Tensor):
+class Parameter(Tensor, torch.nn.Parameter):
     """
     A tensor as a parameter.
     """
-
-    def __init__(self, _tensor: _Tensor):
+    def __init__(
+        self, tensor: Union[_Tensor, "torch.nn.Parameter"],
+    ):
         """
         Initializes a new instance of the Parameter class.
         """
-        super().__init__(_tensor)
+        if not _no_torch and isinstance(tensor, torch.nn.Parameter):
+            ark_tensor = Tensor.from_torch(tensor)
+            core_tensor = ark_tensor._tensor
+            self.torch_param = tensor
+            self.staged_tensor = None
+            Tensor.__init__(
+                self,
+                core_tensor,
+                requires_grad=tensor.requires_grad,
+            )
+        elif isinstance(tensor, _Tensor):
+            core_tensor = tensor
+            self.torch_param = None
+            self.staged_tensor = None
+            Tensor.__init__(
+                self, core_tensor, requires_grad=False
+            )
+        else:
+            raise TypeError(
+                "tensor must be an ARK tensor or a torch.nn.Parameter"
+            )
+
+    def update_gradient(self, ark_tensor: Tensor):
+        """
+        Stages an ARK tensor to be used for updating the gradient of its associated parameter.
+        """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        if self.torch_param is None:
+            raise ValueError(
+                "there is no PyTorch parameter associated with this ARK parameter"
+            )
+        if not self.torch_param.requires_grad:
+            raise ValueError("parameter does not require gradient updates")
+        if ark_tensor is None or not isinstance(ark_tensor, Tensor):
+            raise ValueError("cannot use non-ARK tensor to update ARK gradient")
+        self.staged_tensor = ark_tensor
diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py
index 68333e431..d0894a621 100644
--- a/python/ark/torch_mock.py
+++ b/python/ark/torch_mock.py
@@ -27,3 +27,23 @@ class ubyte: ...
 
 
 class Tensor: ...
+
+
+
+class nn:
+
+
+    class Module: ...
+    
+
+    class Parameter: ... 
+
+
+class autograd:
+
+
+    class Function: 
+
+
+        def apply(self, *args, **kwargs): ...
+
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index f42e59ee9..4b67b48a0 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -202,5 +202,6 @@ void register_executor(py::module &m) {
                                size_t, uintptr_t, bool>(&tensor_write),
              py::arg("tensor"), py::arg("address"), py::arg("bytes"),
              py::arg("stream"), py::arg("is_d2d"))
-        .def("tensor_to_dlpack", &tensor_to_dlpack);
+        .def("tensor_to_dlpack", &tensor_to_dlpack)
+        .def("add_plan", &ark::Executor::add_plan, py::arg("plan"));
 }
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index c3d15d1b9..b4fa838a1 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT license.
 
 import ark
+import numpy as np
 
 
 empty_plan = ark.Plan(None)
@@ -20,99 +21,62 @@ def test_runtime_relaunch():
         assert rt.launched() == True
 
 
-# def test_multiple_runtime_launch():
-#     ark.init()
-#     num_runtimes = 5
-#     for i in range(num_runtimes):
-#         rt = ark.Runtime.get_runtime(i)
-#         assert rt.launched() == False
-#         rt.launch(plan=empty_plan, device_id=i)
-#         assert rt.launched() == True
-#     for i in range(num_runtimes):
-#         rt = ark.Runtime.get_runtime(i)
-#         assert rt.launched() == True
-#     ark.Runtime.delete_all_runtimes()
-
-
-# def test_stop_runtime():
-#     ark.init()
-#     rt1 = ark.Runtime.get_runtime(1)
-#     rt1.launch(plan=empty_plan, device_id=1)
-#     rt2 = ark.Runtime.get_runtime(2)
-#     rt2.launch(plan=empty_plan, device_id=2)
-#     rt1.stop()
-#     rt1.reset()
-#     assert rt1.state == ark.Runtime.State.Init
-#     assert rt2.state == ark.Runtime.State.LaunchedNotRunning
-#     ark.Runtime.delete_all_runtimes()
-
-
-# def test_reset_runtime():
-#     ark.init()
-#     rt1 = ark.Runtime.get_runtime(0)
-#     rt1.launch(plan=empty_plan, device_id=1)
-#     rt2 = ark.Runtime.get_runtime(1)
-#     rt2.launch(plan=empty_plan, device_id=2)
-#     rt1.reset()
-#     assert rt1.launched() == False
-#     assert rt2.launched() == True
-#     rt1.launch(plan=empty_plan)
-#     assert rt1.launched() == True
-#     ark.Runtime.delete_all_runtimes()
-
-
-# def test_multiple_runtimes_complex():
-#     ark.init()
-#     num_runtimes = 3
-#     runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)]
-#     default_runtime = ark.Runtime.get_runtime()
-#     runtime_list.append(default_runtime)
-#     for i, rt in enumerate(runtime_list):
-#         rt.launch(plan=empty_plan, device_id=i)
-#         assert rt.launched() == True
-#     runtime_list[0].stop()
-#     assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
-#     for rt in runtime_list[1:]:
-#         assert rt.launched() == True
-#     runtime_list[1].reset()
-#     assert runtime_list[1].state == ark.Runtime.State.Init
-#     assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning
-#     assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning
-#     runtime_list[1].launch(plan=empty_plan, device_id=1)
-#     for rt in runtime_list:
-#         assert rt.launched() == True
-#     ark.Runtime.delete_all_runtimes()
-
-
-# def test_runtime_state_after_reset():
-#     ark.init()
-#     rt = ark.Runtime.get_runtime()
-#     rt.launch(plan=empty_plan)
-#     rt.reset()
-#     assert rt.launched() == False
-#     assert rt.running() == False
-#     ark.Runtime.delete_all_runtimes()
-
-
-# def test_see_runtime_statuses():
-#     ark.init()
-#     num_runtimes = 3
-#     runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)]
-#     runtime_statuses = ark.Runtime.see_runtime_statuses()
-#     assert len(runtime_statuses) == num_runtimes
-#     for i in range(num_runtimes):
-#         assert i in runtime_statuses
-#     for i, rt in enumerate(runtimes):
-#         assert runtime_statuses[i] == rt
-#     ark.Runtime.delete_all_runtimes()
-
+def test_add_plans():
+    ark.init()
+    M, N = 64, 64
+    input_tensor = ark.tensor([M, N], ark.fp16)
+    other_tensor = ark.tensor([M, N], ark.fp16)
+    output_tensor = ark.add(input_tensor, other_tensor)
+    runtime = ark.Runtime()
+    runtime.launch()
+    input_tensor_host = np.random.rand(M, N).astype(np.float16)
+    input_tensor.from_numpy(input_tensor_host)
+    other_tensor_host = np.random.rand(M, N).astype(np.float16)
+    other_tensor.from_numpy(other_tensor_host)
+    runtime.run()
+    output_tensor_host = output_tensor.to_numpy()
+    np.testing.assert_allclose(
+        output_tensor_host, input_tensor_host + other_tensor_host
+    )
+    runtime.reset(persist=True)
+    ark.init(keep_runtime=True)
+    prev_output = output_tensor
+    new_tensor = ark.tensor([M, N], ark.fp16)
+    final_output = ark.add(prev_output, new_tensor)
+    runtime.launch()
+    new_tensor_host = np.random.rand(M, N).astype(np.float16)
+    new_tensor.from_numpy(new_tensor_host)
+    runtime.run()
+    final_output_host = final_output.to_numpy()
+    np.testing.assert_allclose(
+        final_output_host, output_tensor_host + new_tensor_host
+    )
+    runtime.reset()
+
+def test_reuse_plans():
+    ark.init()
+    M, N = 64, 64
+    input_tensor = ark.tensor([M, N], ark.fp16)
+    other_tensor = ark.tensor([M, N], ark.fp16)
+    output_tensor = ark.add(input_tensor, other_tensor)
+    runtime = ark.Runtime()
+    runtime.launch()
+    input_tensor_host = np.random.rand(M, N).astype(np.float16)
+    input_tensor.from_numpy(input_tensor_host)
+    other_tensor_host = np.random.rand(M, N).astype(np.float16)
+    other_tensor.from_numpy(other_tensor_host)
+    runtime.run()
+    output_tensor_host = output_tensor.to_numpy()
+    np.testing.assert_allclose(
+        output_tensor_host, input_tensor_host + other_tensor_host
+    )
+    runtime.reset(persist=True)
+    ark.init(keep_runtime=True)
+    runtime.launch()
+    runtime.run()
+    output_tensor_host = output_tensor.to_numpy()
+    np.testing.assert_allclose(
+        output_tensor_host, input_tensor_host + other_tensor_host
+    )
+    runtime.reset()
 
-# def test_multiple_runtimes_init():
-#     ark.init()
-#     runtimes = [ark.Runtime.get_runtime(i) for i in range(3)]
-#     for rt in runtimes:
-#         assert rt.state == ark.Runtime.State.Init
-#     ark.init()
-#     runtimes = ark.Runtime.see_runtime_statuses()
-#     assert len(runtimes) == 0
-#     ark.Runtime.delete_all_runtimes()

From 28ce0275d79264bf6e11cd855a49d87c1ee782cf Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 11 Aug 2024 22:42:46 +0000
Subject: [PATCH 61/79] Minor changes

---
 ark/api/executor_test.cpp                |  2 +-
 examples/tutorial/model_test_tutorial.py |  1 +
 python/ark/init.py                       |  4 +--
 python/ark/module.py                     | 41 +++++++++++-------------
 python/ark/runtime.py                    |  8 ++---
 python/ark/tensor.py                     |  8 ++---
 python/ark/torch_mock.py                 | 10 ++----
 python/executor_py.cpp                   |  9 ++++--
 python/unittest/test_runtime.py          |  9 ++----
 9 files changed, 40 insertions(+), 52 deletions(-)

diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
index dad0e9d83..75d506ecb 100644
--- a/ark/api/executor_test.cpp
+++ b/ark/api/executor_test.cpp
@@ -88,7 +88,7 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
     ark::DefaultExecutor executor(m, 0);
     executor.compile();
     executor.launch();
-    UNITTEST_GT(executor.tensor_address(tensor), 0);
+    UNITTEST_NE(executor.tensor_address(tensor), nullptr);
 
     // Copy data from CPU array to ARK tensor
     executor.tensor_write(tensor, host_data.data(),
diff --git a/examples/tutorial/model_test_tutorial.py b/examples/tutorial/model_test_tutorial.py
index ac5a7b2a9..c83d0d15e 100644
--- a/examples/tutorial/model_test_tutorial.py
+++ b/examples/tutorial/model_test_tutorial.py
@@ -9,6 +9,7 @@
 # Set random seed for reproducibility.
 torch.manual_seed(42)
 
+
 # Let's first define a linear layer using ARK.
 class ARKLinear(ark.Module):
     def __init__(self, weight):
diff --git a/python/ark/init.py b/python/ark/init.py
index 29627d645..a4a67e85d 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -6,10 +6,10 @@
 from .runtime import _RuntimeState
 
 
-def init(keep_runtime: bool = False):
+def init():
     """Initializes ARK."""
     Model.reset()
-    if not keep_runtime and _RuntimeState.runtime is not None:
+    if _RuntimeState.runtime is not None:
         del _RuntimeState.runtime
         _RuntimeState.runtime = None
     _ark_core.init()
diff --git a/python/ark/module.py b/python/ark/module.py
index 0fdea23b6..0917ea1ed 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -5,10 +5,9 @@
 import numpy as np
 from typing import Any, Dict, Union
 from .tensor import Tensor, Parameter
-from .runtime import Runtime, Planner
+from .runtime import Runtime
 from .init import init
-from .ops import tensor
-from .data_type import DataType
+from .model import Model
 
 try:
     import torch
@@ -78,6 +77,7 @@ def load_state_dict(
         self,
         state_dict: Dict[str, Union[np.ndarray, torch.Tensor]],
         prefix: str = "",
+        stream: int = 0,
     ):
         """
         Loads a model from a state_dict and copy the parameters to the device GPU.
@@ -91,7 +91,7 @@ def load_state_dict(
             data = state_dict.get(name, None)
             if data is None:
                 continue
-            param.copy(data)
+            param.copy(data, stream=stream)
             all_keys.remove(name)
         if all_keys:
             logging.warning(
@@ -99,7 +99,10 @@ def load_state_dict(
             )
 
     def state_dict(
-        self, prefix: str = "", mode: str = "numpy"
+        self,
+        prefix: str = "",
+        mode: str = "numpy",
+        stream: int = 0,
     ) -> Dict[str, Union[np.ndarray, torch.Tensor]]:
         """
         Copies the parameters from the device GPU to the host and saves the
@@ -108,11 +111,13 @@ def state_dict(
         """
         if mode == "numpy":
             return {
-                k: v.to_numpy() for k, v in self.params_dict(prefix).items()
+                k: v.to_numpy(stream=stream)
+                for k, v in self.params_dict(prefix).items()
             }
         elif mode == "torch":
             return {
-                k: v.to_torch() for k, v in self.params_dict(prefix).items()
+                k: v.to_torch(stream=stream)
+                for k, v in self.params_dict(prefix).items()
             }
         raise ValueError(f"Unsupported mode: {mode}")
 
@@ -127,17 +132,7 @@ def initialize(self):
             module.initialize()
 
 
-def _recursive_ark_to_torch(object):
-    if isinstance(object, Tensor):
-        return object.to_torch()
-    if isinstance(object, dict):
-        return {k: _recursive_ark_to_torch(v) for k, v in object.items()}
-    if isinstance(object, list):
-        return [_recursive_ark_to_torch(v) for v in object]
-    return object
-
-
-class _ARKFunction(torch.autograd.Function):
+class _Function(torch.autograd.Function):
     """
     Facilitates the integration of ARK modules with PyTorch's
     autograd system by defining custom forward and backward passes that
@@ -150,7 +145,7 @@ def forward(ctx, ark_module, *args, **kwargs):
         Returns a PyTorch tensor that is the result
         of the forward pass of the ARK module.
         """
-        init(keep_runtime=True)
+        Model.reset()
         ctx.ark_module = ark_module
         input_args, input_kwargs = [], {}
         input_requires_grad = 0
@@ -184,12 +179,12 @@ def backward(ctx, *grad_outputs):
         and parameters using the ARK module backwards pass, and updates the gradients of the corresponding
         PyTorch parameters.
         """
-        init(keep_runtime=True)
+        Model.reset()
         ark_grad_outputs = [Tensor.from_torch(grad) for grad in grad_outputs]
         grads = ctx.ark_module.backward(*ark_grad_outputs)
         grad_inputs, grad_weights = (
-            grads[:ctx.num_inp_grad],
-            grads[ctx.num_inp_grad:],
+            grads[: ctx.num_inp_grad],
+            grads[ctx.num_inp_grad :],
         )
         params_dict = ctx.ark_module.params_dict()
         rt = Runtime.get_runtime()
@@ -214,4 +209,4 @@ def __init__(self, ark_module):
         self.ark_module = ark_module
 
     def forward(self, *args, **kwargs):
-        return _ARKFunction.apply(self.ark_module, *args, **kwargs)
+        return _Function.apply(self.ark_module, *args, **kwargs)
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 071eedd04..1523905d7 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -3,7 +3,6 @@
 
 import logging
 from enum import Enum
-from typing import Dict, List
 
 from _ark_core import _Executor
 from .planner import Planner, Plan
@@ -169,9 +168,9 @@ def stop(self) -> float:
         self.state = Runtime.State.LaunchedNotRunning
         return elapsed
 
-    def reset(self, delete=False, persist=False):
+    def reset(self, persist=False):
         """
-        Reset the runtime. If delete is True, delete the runtime.
+        Reset the runtime.
         """
         if self.launched():
             self.stop()
@@ -182,6 +181,3 @@ def reset(self, delete=False, persist=False):
                 self.executor.destroy()
             self.executor = None
         self.state = Runtime.State.Init
-        if delete:
-            del _RuntimeState.runtime
-            _RuntimeState.runtime = None
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index ba1af52db..3fda8b3b3 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -217,8 +217,10 @@ class Parameter(Tensor, torch.nn.Parameter):
     """
     A tensor as a parameter.
     """
+
     def __init__(
-        self, tensor: Union[_Tensor, "torch.nn.Parameter"],
+        self,
+        tensor: Union[_Tensor, "torch.nn.Parameter"],
     ):
         """
         Initializes a new instance of the Parameter class.
@@ -237,9 +239,7 @@ def __init__(
             core_tensor = tensor
             self.torch_param = None
             self.staged_tensor = None
-            Tensor.__init__(
-                self, core_tensor, requires_grad=False
-            )
+            Tensor.__init__(self, core_tensor, requires_grad=False)
         else:
             raise TypeError(
                 "tensor must be an ARK tensor or a torch.nn.Parameter"
diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py
index d0894a621..7a7de0ae6 100644
--- a/python/ark/torch_mock.py
+++ b/python/ark/torch_mock.py
@@ -29,21 +29,15 @@ class ubyte: ...
 class Tensor: ...
 
 
-
 class nn:
 
-
     class Module: ...
-    
 
-    class Parameter: ... 
+    class Parameter: ...
 
 
 class autograd:
 
-
-    class Function: 
-
+    class Function:
 
         def apply(self, *args, **kwargs): ...
-
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 4b67b48a0..e10277646 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -182,8 +182,13 @@ void register_executor(py::module &m) {
         .def("barrier", &ark::Executor::barrier)
         .def("destroy", &ark::Executor::destroy)
         .def("destroyed", &ark::Executor::destroyed)
-        .def("tensor_address", &ark::Executor::tensor_address,
-             py::arg("tensor"))
+        .def(
+            "tensor_address",
+            [](ark::Executor *self, const ark::Tensor &tensor) {
+                return reinterpret_cast<uintptr_t>(
+                    self->tensor_address(tensor));
+            },
+            py::arg("tensor"))
         .def("tensor_read",
              py::overload_cast<ark::Executor *, const ark::Tensor &, py::buffer,
                                uintptr_t>(&tensor_read),
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index b4fa838a1..b368bb93a 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -5,9 +5,6 @@
 import numpy as np
 
 
-empty_plan = ark.Plan(None)
-
-
 def test_runtime_relaunch():
     ark.init()
     with ark.Runtime.get_runtime() as rt:
@@ -39,7 +36,7 @@ def test_add_plans():
         output_tensor_host, input_tensor_host + other_tensor_host
     )
     runtime.reset(persist=True)
-    ark.init(keep_runtime=True)
+    ark.Model.reset()
     prev_output = output_tensor
     new_tensor = ark.tensor([M, N], ark.fp16)
     final_output = ark.add(prev_output, new_tensor)
@@ -53,6 +50,7 @@ def test_add_plans():
     )
     runtime.reset()
 
+
 def test_reuse_plans():
     ark.init()
     M, N = 64, 64
@@ -71,7 +69,7 @@ def test_reuse_plans():
         output_tensor_host, input_tensor_host + other_tensor_host
     )
     runtime.reset(persist=True)
-    ark.init(keep_runtime=True)
+    ark.Model.reset()
     runtime.launch()
     runtime.run()
     output_tensor_host = output_tensor.to_numpy()
@@ -79,4 +77,3 @@ def test_reuse_plans():
         output_tensor_host, input_tensor_host + other_tensor_host
     )
     runtime.reset()
-

From b8e13b43a28dd018a97ca1205d32f1ef7fe510e7 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Sun, 11 Aug 2024 23:33:46 +0000
Subject: [PATCH 62/79] a few fixes & more verfication

---
 ark/api/executor.cpp     | 29 ++++++++++++---------
 ark/model/model_json.cpp | 54 ++++++++++++++++++++++++++++++++++------
 2 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 4634ed6fd..c424271cc 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -154,7 +154,9 @@ class Executor::Impl {
 
     Stream stream() const { return reinterpret_cast<Stream>(stream_raw_); }
 
-    std::shared_ptr<GpuMemory> buffer() const { return buffers_.back(); }
+    std::shared_ptr<GpuMemory> buffer() const {
+        return buffers_.empty() ? nullptr : buffers_.back();
+    }
 
     std::string plan() const { return plan_json_.dump_pretty(); }
 
@@ -177,7 +179,8 @@ class Executor::Impl {
     void init_communicator();
     std::map<size_t, size_t> init_buffers(const Json &plan_json);
     std::map<size_t, void *> init_buffer_addrs(
-        void *buffer_base, const std::map<size_t, size_t> &buffer_id_to_offset);
+        std::shared_ptr<GpuMemory> buffer,
+        const std::map<size_t, size_t> &buffer_id_to_offset);
     std::set<int> init_remote_ranks(const Json &plan_json) const;
     void init_channels(const std::set<int> &remote_ranks);
 
@@ -275,11 +278,10 @@ void Executor::Impl::init(const PlanJson &plan_json) {
     if (total_bytes_ > 0) {
         buffers_.push_back(gpu_manager->malloc(total_bytes_, 65536));
         is_buffer_allocated_ = true;
+        buffer_id_to_addr_ =
+            init_buffer_addrs(buffers_.back(), buffer_id_to_offset_);
     }
 
-    buffer_id_to_addr_ =
-        init_buffer_addrs(buffers_.back()->ref(), buffer_id_to_offset_);
-
     codegen_ = std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_,
                                                external_args_,
                                                buffer_id_to_name_, name_);
@@ -293,7 +295,7 @@ void Executor::Impl::init(const PlanJson &plan_json) {
     size_t smem_block_total =
         static_cast<size_t>(gpu_manager->info().smem_block_total);
 
-    if (world_size_ > 1) {
+    if (world_size_ > 1 && total_bytes_ > 0) {
         auto remote_ranks = init_remote_ranks(plan_json_);
         init_channels(remote_ranks);
     }
@@ -325,7 +327,8 @@ void Executor::Impl::init_communicator() {
 }
 
 std::map<size_t, void *> Executor::Impl::init_buffer_addrs(
-    void *buffer_base, const std::map<size_t, size_t> &buffer_id_to_offset) {
+    std::shared_ptr<GpuMemory> buffer,
+    const std::map<size_t, size_t> &buffer_id_to_offset) {
     std::map<size_t, void *> buffer_id_to_addr;
     // Reuse existing buffer addresses for new plans that use previous tensors
     // from earlier plans
@@ -333,8 +336,7 @@ std::map<size_t, void *> Executor::Impl::init_buffer_addrs(
         buffer_id_to_addr = buffer_id_to_addr_;
     }
     for (const auto &kv : buffer_id_to_offset) {
-        buffer_id_to_addr[kv.first] =
-            static_cast<char *>(buffer_base) + kv.second;
+        buffer_id_to_addr[kv.first] = buffer->ref(kv.second);
     }
     return buffer_id_to_addr;
 }
@@ -772,7 +774,7 @@ void Executor::Impl::launch() {
         // Initialize loop flags.
         atomicStoreRelaxed(flag_->ref<int>(), 0);
         void *flag_ptr = flag_->ref();
-        void *buf_ptr = buffers_.back()->ref();
+        void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref();
         std::vector<void *> args = {&buf_ptr, &flag_ptr};
         for (auto &buffer : external_buffers_) {
             args.push_back(&buffer);
@@ -790,7 +792,7 @@ void Executor::Impl::run(int iter) {
         }
         atomicStoreRelaxed(flag_->ref<int>(), iter);
     } else {
-        void *buf_ptr = buffers_.back()->ref();
+        void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref();
         int i = 0;
         std::vector<void *> args = {&buf_ptr, reinterpret_cast<void *>(&i)};
         for (auto &buffer : external_buffers_) {
@@ -865,7 +867,10 @@ void Executor::Impl::barrier() {
 void *Executor::Impl::tensor_address(const Tensor &tensor) const {
     size_t buffer_id = tensor.ref()->buffer()->id();
     if (buffer_id_to_addr_.find(buffer_id) == buffer_id_to_addr_.end()) {
-        ERR(InternalError, "Invalid buffer ID: ", buffer_id);
+        ERR(InvalidUsageError, "Tensor has an unknown buffer ID ", buffer_id,
+            ". This is likely caused by accessing a tensor that is optimized "
+            "out by the compiler or not used in any plan passed to the "
+            "executor.");
     }
     return buffer_id_to_addr_.at(buffer_id);
 }
diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp
index c2099e2c9..dad62cb4e 100644
--- a/ark/model/model_json.cpp
+++ b/ark/model/model_json.cpp
@@ -5,6 +5,7 @@
 
 #include <sstream>
 
+#include "ark/dims.hpp"
 #include "logging.hpp"
 
 static std::stringstream &idnt(std::stringstream &ss, int indent) {
@@ -26,14 +27,46 @@ static void verify_format_json(const std::string &name, const Json &json,
                                const std::vector<std::string> &array_fields) {
     for (const auto &field : required_fields) {
         if (!json.contains(field)) {
-            ERR(ErrorType,
-                name + ": " + field + " not found. Given: " + json.dump());
+            ERR(ErrorType, name, ": ", field,
+                " not found. Given: ", json.dump());
         }
     }
     for (const auto &field : array_fields) {
         if (!json.at(field).is_array()) {
-            ERR(ErrorType, name + ": " + field +
-                               " is not an array. Given: " + json.dump());
+            ERR(ErrorType, name, ": ", field,
+                " is not an array. Given: ", json.dump());
+        }
+    }
+}
+
+template <typename ErrorType, bool ZeroNotAllowed>
+static void verify_format_dims(const std::string &name, const Json &json,
+                               const std::vector<std::string> &dims_fields) {
+    for (const auto &field : dims_fields) {
+        if (!json.at(field).is_array()) {
+            ERR(ErrorType, name, ": ", field,
+                " is not an array. Given: ", json.dump());
+        }
+        std::vector<DimType> dims;
+        try {
+            dims = json.at(field).get<std::vector<DimType>>();
+        } catch (const std::exception &e) {
+            ERR(ErrorType, name, ": ", field,
+                " is not an array of integers. Given: ", json.dump());
+        }
+        for (const auto &dim : dims) {
+            if (dim < 0) {
+                ERR(ErrorType, name, ": ", field,
+                    " contains negative value. Given: ", json.dump());
+            }
+        }
+        if (ZeroNotAllowed) {
+            for (const auto &dim : dims) {
+                if (dim == 0) {
+                    ERR(ErrorType, name, ": ", field,
+                        " contains zero value. Given: ", json.dump());
+                }
+            }
         }
     }
 }
@@ -52,10 +85,15 @@ static void verify_format_tensor(const Json &json) {
     const std::vector<std::string> required_fields = {
         "Id",      "DataType",    "Shape", "Strides",
         "Offsets", "PaddedShape", "Buffer"};
-    const std::vector<std::string> array_fields = {"Shape", "Strides",
-                                                   "Offsets", "PaddedShape"};
-    verify_format_json<ErrorType>("TensorJson", json, required_fields,
-                                  array_fields);
+    const std::vector<std::string> dims_fields = {"Shape", "Strides", "Offsets",
+                                                  "PaddedShape"};
+    verify_format_json<ErrorType>("TensorJson", json, required_fields, {});
+    verify_format_dims<ErrorType, false>("TensorJson", json,
+                                         {
+                                             "Offsets",
+                                         });
+    verify_format_dims<ErrorType, true>("TensorJson", json,
+                                        {"Shape", "Strides", "PaddedShape"});
     verify_format_buffer<ErrorType>(json.at("Buffer"));
 }
 

From 5ba79f9a72679f727e532927903f91a901c02048 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 12 Aug 2024 02:23:06 +0000
Subject: [PATCH 63/79] Align C++ Executor interface with Python interface

---
 ark/api/executor.cpp               | 131 +++++++++++++++--------------
 ark/api/executor_test.cpp          |  12 +--
 ark/include/ark/executor.hpp       |  21 ++---
 ark/ops/ops_communication_test.cpp |  12 +--
 ark/ops/ops_identity_test.cpp      |   1 -
 ark/ops/ops_reshape_test.cpp       |   1 -
 ark/ops/ops_scalar_test.cpp        |   3 -
 ark/ops/ops_tensor_test.cpp        |   3 -
 ark/ops/ops_test_common.cpp        |   1 -
 python/ark/__init__.py             |   2 +-
 python/ark/module.py               |  10 +--
 python/ark/runtime.py              |  70 +++++----------
 python/executor_py.cpp             |  20 ++---
 python/unittest/test_runtime.py    |   8 +-
 14 files changed, 125 insertions(+), 170 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index c424271cc..9d9d79a43 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -145,11 +145,9 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 
 class Executor::Impl {
    public:
-    Impl(int device_id, Stream stream, const std::string &name, bool loop_mode);
+    Impl() : plan_json_(), device_id_(-1) {};
     ~Impl();
 
-    void init(const PlanJson &plan);
-
     int device_id() const { return device_id_; }
 
     Stream stream() const { return reinterpret_cast<Stream>(stream_raw_); }
@@ -160,9 +158,11 @@ class Executor::Impl {
 
     std::string plan() const { return plan_json_.dump_pretty(); }
 
-    void add_plan(const std::string &plan);
-    void compile();
-    void launch();
+    const std::string &name() const { return name_; }
+
+    void compile(const std::string &plan, int device_id,
+                 const std::string &name);
+    void launch(Stream stream, bool loop_mode);
     void run(int iter);
     void wait(int64_t max_spin_count);
     float stop(int64_t max_spin_count);
@@ -175,7 +175,15 @@ class Executor::Impl {
     void tensor_write(const Tensor &tensor, const void *data, size_t bytes,
                       Stream stream, bool is_d2d) const;
 
+   protected:
+    friend class DefaultExecutor;
+
+    gpuStream stream_raw_;
+    bool loop_mode_;
+
    private:
+    void init(const PlanJson &plan_json, int device_id,
+              const std::string &name);
     void init_communicator();
     std::map<size_t, size_t> init_buffers(const Json &plan_json);
     std::map<size_t, void *> init_buffer_addrs(
@@ -184,14 +192,9 @@ class Executor::Impl {
     std::set<int> init_remote_ranks(const Json &plan_json) const;
     void init_channels(const std::set<int> &remote_ranks);
 
-   protected:
+    PlanJson plan_json_;
     int device_id_;
     std::string name_;
-    bool loop_mode_;
-
-    bool is_buffer_allocated_;
-
-    gpuStream stream_raw_;
 
     int rank_;
     int world_size_;
@@ -200,7 +203,6 @@ class Executor::Impl {
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
 
-    PlanJson plan_json_;
     std::vector<void *> external_buffers_;
     std::vector<std::string> external_args_;
     std::map<size_t, std::string> buffer_id_to_name_;
@@ -224,26 +226,25 @@ class Executor::Impl {
         rank_to_sm_channels_;
 };
 
-Executor::Impl::Impl(int device_id, Stream stream, const std::string &name,
-                     bool loop_mode)
-    : device_id_(device_id), name_(name), loop_mode_(loop_mode) {
-    if (device_id < 0) {
-        ERR(InvalidUsageError, "Invalid device ID ", device_id);
-    }
-    if (stream) {
-        stream_raw_ = reinterpret_cast<gpuStream>(stream);
-    } else {
-        stream_ = GpuManager::get_instance(device_id_)->create_stream();
-        stream_raw_ = stream_->get();
-    }
-}
-
 Executor::Impl::~Impl() {
     if (is_launched_) stop(-1);
 }
 
-void Executor::Impl::init(const PlanJson &plan_json) {
+void Executor::Impl::init(const PlanJson &plan_json, int device_id,
+                          const std::string &name) {
+    if (device_id < 0) {
+        ERR(InvalidUsageError, "Invalid device ID ", device_id);
+    }
+
     plan_json_ = plan_json;
+    device_id_ = device_id;
+    name_ = name;
+
+    external_buffers_.clear();
+    external_args_.clear();
+    buffer_id_to_name_.clear();
+    total_bytes_ = 0;
+
     rank_ = plan_json_["Rank"].get<int>();
     world_size_ = plan_json_["WorldSize"].get<int>();
 
@@ -277,7 +278,6 @@ void Executor::Impl::init(const PlanJson &plan_json) {
     timer_end_ = gpu_manager->create_event();
     if (total_bytes_ > 0) {
         buffers_.push_back(gpu_manager->malloc(total_bytes_, 65536));
-        is_buffer_allocated_ = true;
         buffer_id_to_addr_ =
             init_buffer_addrs(buffers_.back(), buffer_id_to_offset_);
     }
@@ -700,25 +700,32 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
     }
 }
 
-void Executor::Impl::add_plan(const std::string &plan) {
-    external_buffers_.clear();
-    external_args_.clear();
-    buffer_id_to_name_.clear();
-    total_bytes_ = 0;
-    is_buffer_allocated_ = false;
-    init(Json::parse(plan));
+void Executor::Impl::compile(const std::string &plan, int device_id,
+                             const std::string &name) {
+    if (is_launched_) {
+        ERR(InvalidUsageError, "Need to stop before re-compiling.");
+        return;
+    }
+    init(PlanJson::parse(plan), device_id, name);
+    kernel_->compile();
 }
 
-void Executor::Impl::compile() { kernel_->compile(); }
-
-void Executor::Impl::launch() {
-    if (!kernel_->is_compiled()) {
-        ERR(InvalidUsageError, "Need to compile first before initialization.");
+void Executor::Impl::launch(Stream stream, bool loop_mode) {
+    if ((kernel_ == nullptr) || !kernel_->is_compiled()) {
+        ERR(InvalidUsageError, "Need to compile first before launch.");
     }
     if (is_launched_) {
         LOG(WARN, "Ignore launching twice.");
         return;
     }
+    if (stream) {
+        stream_raw_ = reinterpret_cast<gpuStream>(stream);
+    } else {
+        stream_ = GpuManager::get_instance(device_id_)->create_stream();
+        stream_raw_ = stream_->get();
+    }
+    loop_mode_ = loop_mode;
+
     auto get_global_rt = [&](const std::string &symbol) {
         return reinterpret_cast<void *>(kernel_->get_global(symbol));
     };
@@ -773,8 +780,8 @@ void Executor::Impl::launch() {
     if (loop_mode_) {
         // Initialize loop flags.
         atomicStoreRelaxed(flag_->ref<int>(), 0);
-        void *flag_ptr = flag_->ref();
         void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref();
+        void *flag_ptr = flag_->ref();
         std::vector<void *> args = {&buf_ptr, &flag_ptr};
         for (auto &buffer : external_buffers_) {
             args.push_back(&buffer);
@@ -990,18 +997,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data,
     GLOG(gpuStreamSynchronize(copy_stream_raw));
 }
 
-Executor::Executor(int device_id, Stream stream, const std::string &name,
-                   const std::string &plan, bool loop_mode)
-    : impl_(std::make_unique<Executor::Impl>(device_id, stream, name,
-                                             loop_mode)) {
-    auto &plan_path = get_env().enforce_plan_path;
-    if (!plan_path.empty()) {
-        LOG(INFO, "Enforce executor plan path: ", plan_path);
-        impl_->init(Json::parse(read_file(plan_path)));
-    } else if (!plan.empty()) {
-        impl_->init(Json::parse(plan));
-    }
-}
+Executor::Executor() : impl_(std::make_unique<Executor::Impl>()) {}
 
 Executor::~Executor() = default;
 
@@ -1013,11 +1009,16 @@ std::shared_ptr<GpuMemory> Executor::buffer() const { return impl_->buffer(); }
 
 std::string Executor::plan() const { return impl_->plan(); }
 
-void Executor::add_plan(const std::string &plan) { impl_->add_plan(plan); }
+const std::string &Executor::name() const { return impl_->name(); }
 
-void Executor::compile() { impl_->compile(); }
+void Executor::compile(int device_id, const std::string &plan,
+                       const std::string &name) {
+    impl_->compile(device_id, plan, name);
+}
 
-void Executor::launch() { impl_->launch(); }
+void Executor::launch(Stream stream, bool loop_mode) {
+    impl_->launch(stream, loop_mode);
+}
 
 void Executor::run(int iter) { impl_->run(iter); }
 
@@ -1054,14 +1055,20 @@ DefaultExecutor::DefaultExecutor(
     const Model &model, int device_id, Stream stream,
     const std::vector<Planner::ConfigRule> &config_rules,
     const std::string &name, bool loop_mode)
-    : Executor((device_id < 0) ? (model.rank() % get_env().num_ranks_per_host)
-                               : device_id,
-               stream, name, "", loop_mode) {
-    Planner planner(model, impl_->device_id());
+    : Executor() {
+    device_id = (device_id < 0) ? (model.rank() % get_env().num_ranks_per_host)
+                                : device_id;
+    Planner planner(model, device_id);
     for (const auto &rule : config_rules) {
         planner.install_config_rule(rule);
     }
-    impl_->init(Json::parse(planner.plan()));
+    compile(device_id, planner.plan(), name);
+    impl_->stream_raw_ = reinterpret_cast<gpuStream>(stream);
+    impl_->loop_mode_ = loop_mode;
+}
+
+void DefaultExecutor::launch() {
+    Executor::launch(reinterpret_cast<Stream>(impl_->stream_raw_), impl_->loop_mode_);
 }
 
 }  // namespace ark
diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
index 75d506ecb..e54578dfc 100644
--- a/ark/api/executor_test.cpp
+++ b/ark/api/executor_test.cpp
@@ -20,7 +20,6 @@ ark::unittest::State test_executor() {
         UNITTEST_EQ(executor.device_id(), 0);
         UNITTEST_EQ(executor.stream(), stream);
 
-        executor.compile();
         executor.launch();
         executor.run(1);
         executor.wait();
@@ -31,7 +30,6 @@ ark::unittest::State test_executor() {
     }
     {
         ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
-        executor.compile();
         executor.launch();
         executor.run(1);
         executor.wait();
@@ -48,7 +46,6 @@ ark::unittest::State test_executor() {
         ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
         UNITTEST_THROW(executor.launch(), ark::InvalidUsageError);
 
-        executor.compile();
         executor.launch();
         executor.launch();  // Will be ignored with a warning.
         executor.run(1);
@@ -86,7 +83,6 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
     m.noop(tensor);
 
     ark::DefaultExecutor executor(m, 0);
-    executor.compile();
     executor.launch();
     UNITTEST_NE(executor.tensor_address(tensor), nullptr);
 
@@ -169,15 +165,15 @@ ark::unittest::State test_executor_tensor_read_write_stride_offset() {
 }
 
 ark::unittest::State test_executor_invalid() {
+    ark::Executor exe;
+
     // Invalid device ID.
-    UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""),
-                   ark::InvalidUsageError);
+    UNITTEST_THROW(exe.compile(-1, ""), ark::InvalidUsageError);
 
     // Invalid rank.
     ark::PlanJson plan;
     plan["Rank"] = 1;
-    UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan.dump(), true),
-                   ark::InvalidUsageError);
+    UNITTEST_THROW(exe.compile(0, plan.dump()), ark::InvalidUsageError);
 
     return ark::unittest::SUCCESS;
 }
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index d44ac2302..8e6577cd2 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -21,8 +21,7 @@ class GpuMemory;
 class Executor {
    public:
     /// Constructor.
-    Executor(int device_id, Stream stream, const std::string &name,
-             const std::string &plan, bool loop_mode = true);
+    Executor();
 
     /// Destructor.
     ~Executor();
@@ -39,23 +38,22 @@ class Executor {
     /// Return the plan string.
     std::string plan() const;
 
-    /// Add a plan to the executor.
-    void add_plan(const std::string &plan);
+    const std::string &name() const;
 
     /// Compile the model. This must be called before `launch()`.
-    void compile();
+    void compile(const std::string &plan, int device_id,
+                 const std::string &name = "executor");
 
-    /// Launch the model (not running yet). This must be called after
-    /// `compile()`.
-    void launch();
+    /// Launch the executor. This must be called after `compile()`.
+    void launch(Stream stream = nullptr, bool loop_mode = true);
 
-    /// Run the model for `iter` iterations.
+    /// Run the executor for `iter` iterations.
     void run(int iter);
 
     /// Wait for the previous run to finish.
     void wait(int64_t max_spin_count = -1);
 
-    /// Stop the model and return the elapsed time in milliseconds.
+    /// Stop the executor and return the elapsed time in milliseconds.
     /// Once this is called, we need to call `launch()` again to run the model
     /// again.
     float stop(int64_t max_spin_count = -1);
@@ -105,6 +103,9 @@ class DefaultExecutor : public Executor {
         const Model &model, int device_id = -1, Stream stream = nullptr,
         const std::vector<Planner::ConfigRule> &config_rules = {},
         const std::string &name = "DefaultExecutor", bool loop_mode = true);
+
+    /// Launch the default executor.
+    void launch();
 };
 
 }  // namespace ark
diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp
index 8cdad41b2..7a7fec523 100644
--- a/ark/ops/ops_communication_test.cpp
+++ b/ark/ops/ops_communication_test.cpp
@@ -25,7 +25,6 @@ ark::unittest::State test_communication_send_recv_unidir() {
             }
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             if (gpu_id == 0) {
                 std::vector<ark::half_t> data(1024);
@@ -68,7 +67,6 @@ ark::unittest::State test_communication_send_recv_unidir() {
             }
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             if (gpu_id == 1) {
                 std::vector<ark::half_t> data(1024);
@@ -117,7 +115,6 @@ ark::unittest::State test_communication_send_recv_bidir() {
             tns2 = model.recv(tns2_data, remote_gpu_id, tag);
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1));
@@ -161,7 +158,6 @@ ark::unittest::State test_communication_send_recv_bidir() {
             ark::Tensor sum = model.add(tns2, tns_data);
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1));
@@ -232,7 +228,6 @@ ark::unittest::State test_communication_send_recv_bidir_sm() {
             tns2 = model.recv(tns2_data, remote_gpu_id, tag);
 
             ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule});
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1));
@@ -276,7 +271,6 @@ ark::unittest::State test_communication_send_recv_bidir_sm() {
             ark::Tensor sum = model.add(tns2, tns_data);
 
             ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule});
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1));
@@ -319,7 +313,6 @@ ark::unittest::State test_communication_send_packet() {
             }
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             if (gpu_id == 0) {
                 std::vector<ark::half_t> data(1024);
@@ -362,7 +355,6 @@ ark::unittest::State test_communication_send_recv_reduce_packet() {
             model.recv_packet(shard_tensors[peer_gpu_id], peer_gpu_id, 1, 1);
 
             ark::DefaultExecutor exe(model, gpu_id);
-            exe.compile();
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), 1.0f);
@@ -433,8 +425,8 @@ ark::unittest::State test_communication_send_recv_reduce() {
 
             ark::Planner planner(model, gpu_id);
             planner.install_config_rule(config_rule);
-            ark::Executor exe(gpu_id, nullptr, "Executor", planner.plan());
-            exe.compile();
+            ark::Executor exe;
+            exe.compile(gpu_id, planner.plan());
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), 1.0f);
diff --git a/ark/ops/ops_identity_test.cpp b/ark/ops/ops_identity_test.cpp
index a6e49c9c0..eb8d3f4d4 100644
--- a/ark/ops/ops_identity_test.cpp
+++ b/ark/ops/ops_identity_test.cpp
@@ -58,7 +58,6 @@ ark::unittest::State test_ops_identity() {
 
     // Create an executor
     ark::DefaultExecutor exe(model);
-    exe.compile();
 
     int num_elem = 2 * 3 * 4 * 5;
 
diff --git a/ark/ops/ops_reshape_test.cpp b/ark/ops/ops_reshape_test.cpp
index 1128c955a..7bb8aa4be 100644
--- a/ark/ops/ops_reshape_test.cpp
+++ b/ark/ops/ops_reshape_test.cpp
@@ -9,7 +9,6 @@
 void test_reshape_checker(ark::Model &m, ark::Tensor t0, ark::Tensor t1,
                           const std::string &) {
     ark::DefaultExecutor exe(m);
-    exe.compile();
 
     std::vector<float> data_vec(t0.shape().nelems());
     std::iota(data_vec.begin(), data_vec.end(), 1.0f);
diff --git a/ark/ops/ops_scalar_test.cpp b/ark/ops/ops_scalar_test.cpp
index 6afc9e1ad..47a5b40bd 100644
--- a/ark/ops/ops_scalar_test.cpp
+++ b/ark/ops/ops_scalar_test.cpp
@@ -66,7 +66,6 @@ ark::unittest::State test_scalar_assign_fp16() {
         ark::Tensor t = m.constant(7, ark::Dims(4, 2, 50), ark::FP16);
 
         ark::DefaultExecutor exe(m);
-        exe.compile();
 
         exe.launch();
         exe.run(1);
@@ -84,7 +83,6 @@ ark::unittest::State test_scalar_assign_fp16() {
         ark::Tensor out = m.copy(7, t);
 
         ark::DefaultExecutor exe(m);
-        exe.compile();
 
         std::vector<ark::half_t> data(4 * 2 * 50, 3);
         exe.tensor_write(t, data);
@@ -109,7 +107,6 @@ ark::unittest::State test_scalar_assign_fp32() {
         ark::Tensor out = m.copy(7);
 
         ark::DefaultExecutor exe(m);
-        exe.compile();
 
         exe.launch();
         exe.run(1);
diff --git a/ark/ops/ops_tensor_test.cpp b/ark/ops/ops_tensor_test.cpp
index be6488ef1..a2c36fd8c 100644
--- a/ark/ops/ops_tensor_test.cpp
+++ b/ark/ops/ops_tensor_test.cpp
@@ -20,7 +20,6 @@ ark::unittest::State test_tensor_strides() {
 
     // Create an executor
     ark::DefaultExecutor exe(model);
-    exe.compile();
 
     // Fill buffer data: {1.0, 2.0, 3.0, 4.0}
     std::vector<float> data(shape.nelems());
@@ -53,7 +52,6 @@ ark::unittest::State test_tensor_memcpy() {
 
     // Create an executor
     ark::DefaultExecutor exe(model);
-    exe.compile();
 
     // Fill buffer data: {1.0, 2.0, 3.0, ..., 3024.0}
     std::vector<float> data(strides.nelems());
@@ -138,7 +136,6 @@ ark::unittest::State test_tensor_layout() {
 
     // Create an executor
     ark::DefaultExecutor exe(model);
-    exe.compile();
 
     // Fill tensor data: {1.0, 2.0, 3.0, ..., 120.0}
     std::vector<float> data(2 * 3 * 4 * 5);
diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp
index 4e94d06a7..42f7e670e 100644
--- a/ark/ops/ops_test_common.cpp
+++ b/ark/ops/ops_test_common.cpp
@@ -38,7 +38,6 @@ OpsTestResult op_test(
     const std::vector<Planner::ConfigRule> &config_rules,
     bool print_on_error) {
     DefaultExecutor exe(model, -1, nullptr, config_rules);
-    exe.compile();
 
     std::vector<std::shared_ptr<std::vector<char>>> inputs_data_storages;
     std::vector<void *> inputs_data_refs;
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index 68b03ab29..939c4837f 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -39,7 +39,7 @@ def set_world_size(world_size):
 from .init import init
 from .tensor import Dims, Tensor, Parameter
 from .module import Module, RuntimeModule
-from .runtime import Runtime
+from .runtime import *
 from .serialize import save, load
 from .data_type import (
     DataType,
diff --git a/python/ark/module.py b/python/ark/module.py
index 0917ea1ed..49d2ddf00 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -168,8 +168,8 @@ def forward(ctx, ark_module, *args, **kwargs):
         rt = Runtime.get_runtime()
         rt.launch()
         rt.run()
-        output = output.get_torch_view()
-        rt.reset(persist=True)
+        rt.stop()
+        output = output.to_torch()
         return output
 
     @staticmethod
@@ -190,12 +190,12 @@ def backward(ctx, *grad_outputs):
         rt = Runtime.get_runtime()
         rt.launch()
         rt.run()
-        grad_inputs = [grad.get_torch_view() for grad in grad_inputs]
+        rt.stop()
+        grad_inputs = [grad.to_torch() for grad in grad_inputs]
         for _, param in params_dict.items():
             if param.staged_tensor is not None:
-                pytorch_grad = param.staged_tensor.get_torch_view()
+                pytorch_grad = param.staged_tensor.to_torch()
                 param.torch_param.grad = pytorch_grad
-        rt.reset(persist=True)
         return (None, *grad_inputs)
 
 
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 1523905d7..f3baf3994 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -16,10 +16,6 @@ class _RuntimeState:
     runtime = None
 
 
-class Executor(_Executor):
-    pass
-
-
 class Runtime:
     """
     Convenience class for running a model.
@@ -35,16 +31,11 @@ class State(Enum):
         Running = 2
 
     def __init__(self):
-        self.executor: Executor = None
+        self.executor: _Executor = _Executor()
         self.state: Runtime.State = Runtime.State.Init
+        self.loop_mode = True
         _RuntimeState.runtime = self
 
-    def get_state(self) -> "Runtime.State":
-        """
-        Get the runtime state.
-        """
-        return self.state
-
     @staticmethod
     def get_runtime() -> "Runtime":
         """
@@ -76,14 +67,6 @@ def running(self) -> bool:
         """
         return self.state == Runtime.State.Running
 
-    def add_plan(self, plan: Plan):
-        """
-        Add a plan to the executor.
-        """
-        if self.executor is None:
-            raise RuntimeError("Executor is not initialized")
-        self.executor.add_plan(str(plan))
-
     def launch(
         self,
         plan: Plan = None,
@@ -96,33 +79,21 @@ def launch(
         the CUDA kernels. The GPU context and the connection between GPUs will be
         initialized. The executor will compile the cuda kernels and launch the ARK runtime.
         """
+        if device_id < 0:
+            logging.error(f"Invalid device_id: {device_id}")
+            raise ValueError(f"Invalid device_id: {device_id}")
         plan = Planner(device_id).plan() if plan is None else plan
+        plan_str = str(plan)
         if self.launched():
-            # If the Runtime state is already launched and we are adding another plan
-            # to the executor, we compile the new kernel and launch the executor again.
-            self.executor.add_plan(str(plan))
-            self.executor.compile()
-            self.executor.launch()
-            return
+            # Stop the current running model
+            self.stop()
+
+        # Recompile if the previous launch was not compiled with the same info
+        # or if this is the first launch
+        if plan_str != self.executor.plan() or device_id != self.executor.device_id():
+            self.executor.compile(plan_str, device_id)
 
-        # If the RuntimeState is init, we need to create a new executor and
-        # compile the kernels
-        if self.state == Runtime.State.Init:
-            if self.executor is not None:
-                if not self.executor.destroyed():
-                    logging.warning(
-                        f"Runtime has already been launched. Destroying the old executor"
-                    )
-                    self.executor.destroy()
-            self.executor = Executor(
-                device_id,
-                stream,
-                "ArkRuntime",
-                str(plan),
-                loop_mode,
-            )
-            self.executor.compile()
-        self.executor.launch()
+        self.executor.launch(stream, loop_mode)
         self.state = Runtime.State.LaunchedNotRunning
 
     def run(self, iter=1, non_blocking=False):
@@ -168,16 +139,15 @@ def stop(self) -> float:
         self.state = Runtime.State.LaunchedNotRunning
         return elapsed
 
-    def reset(self, persist=False):
+    def reset(self):
         """
         Reset the runtime.
         """
         if self.launched():
             self.stop()
-        if persist:
-            return
-        if self.executor is not None:
-            if not self.executor.destroyed():
-                self.executor.destroy()
-            self.executor = None
+        self.executor.destroy()
+        self.executor = _Executor()
         self.state = Runtime.State.Init
+
+
+__all__ = ["Runtime"]
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index e10277646..5b4e7959f 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -161,21 +161,20 @@ static py::capsule tensor_to_dlpack(ark::Executor &self, const ark::Tensor &tens
 
 void register_executor(py::module &m) {
     py::class_<ark::Executor>(m, "_Executor")
-        .def(py::init([](int device_id, uintptr_t stream,
-                         const std::string &name, const std::string &plan,
-                         bool loop_mode) {
-            return new ark::Executor(device_id,
-                                     reinterpret_cast<ark::Stream>(stream),
-                                     name, plan, loop_mode);
-        }))
+        .def(py::init<>())
         .def("device_id", &ark::Executor::device_id)
         .def("stream",
              [](ark::Executor *self) {
                  return reinterpret_cast<uintptr_t>(self->stream());
              })
         .def("plan", &ark::Executor::plan)
-        .def("compile", &ark::Executor::compile)
-        .def("launch", &ark::Executor::launch)
+        .def("name", &ark::Executor::name)
+        .def("compile", &ark::Executor::compile, py::arg("device_id"),
+             py::arg("plan"), py::arg("name") = "executor")
+        .def("launch", [](ark::Executor *self, uintptr_t stream, bool loop_mode) {
+                 self->launch(reinterpret_cast<ark::Stream>(stream), loop_mode);
+             },
+             py::arg("stream") = 0, py::arg("loop_mode") = true)
         .def("run", &ark::Executor::run, py::arg("iter"))
         .def("wait", &ark::Executor::wait, py::arg("max_spin_count") = -1)
         .def("stop", &ark::Executor::stop, py::arg("max_spin_count") = -1)
@@ -207,6 +206,5 @@ void register_executor(py::module &m) {
                                size_t, uintptr_t, bool>(&tensor_write),
              py::arg("tensor"), py::arg("address"), py::arg("bytes"),
              py::arg("stream"), py::arg("is_d2d"))
-        .def("tensor_to_dlpack", &tensor_to_dlpack)
-        .def("add_plan", &ark::Executor::add_plan, py::arg("plan"));
+        .def("tensor_to_dlpack", &tensor_to_dlpack);
 }
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index b368bb93a..356430d9a 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -18,7 +18,7 @@ def test_runtime_relaunch():
         assert rt.launched() == True
 
 
-def test_add_plans():
+def test_runtime_init():
     ark.init()
     M, N = 64, 64
     input_tensor = ark.tensor([M, N], ark.fp16)
@@ -35,7 +35,7 @@ def test_add_plans():
     np.testing.assert_allclose(
         output_tensor_host, input_tensor_host + other_tensor_host
     )
-    runtime.reset(persist=True)
+    runtime.stop()
     ark.Model.reset()
     prev_output = output_tensor
     new_tensor = ark.tensor([M, N], ark.fp16)
@@ -51,7 +51,7 @@ def test_add_plans():
     runtime.reset()
 
 
-def test_reuse_plans():
+def test_runtime_reuse_plans():
     ark.init()
     M, N = 64, 64
     input_tensor = ark.tensor([M, N], ark.fp16)
@@ -68,7 +68,7 @@ def test_reuse_plans():
     np.testing.assert_allclose(
         output_tensor_host, input_tensor_host + other_tensor_host
     )
-    runtime.reset(persist=True)
+    runtime.stop()
     ark.Model.reset()
     runtime.launch()
     runtime.run()

From b0176ad47191e7cfda6db5f36b4f2c6dddc8c0d6 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 12 Aug 2024 02:24:12 +0000
Subject: [PATCH 64/79] lint

---
 python/ark/runtime.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index f3baf3994..1490cdeb8 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -90,7 +90,10 @@ def launch(
 
         # Recompile if the previous launch was not compiled with the same info
         # or if this is the first launch
-        if plan_str != self.executor.plan() or device_id != self.executor.device_id():
+        if (
+            plan_str != self.executor.plan()
+            or device_id != self.executor.device_id()
+        ):
             self.executor.compile(plan_str, device_id)
 
         self.executor.launch(stream, loop_mode)

From 4db38e131056a476bc6bb3baeaf220ee96abcb10 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 12 Aug 2024 02:26:42 +0000
Subject: [PATCH 65/79] minor fix

---
 ark/api/executor.cpp               | 6 +++---
 ark/api/executor_test.cpp          | 4 ++--
 ark/ops/ops_communication_test.cpp | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 9d9d79a43..4505b2a35 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -1011,9 +1011,9 @@ std::string Executor::plan() const { return impl_->plan(); }
 
 const std::string &Executor::name() const { return impl_->name(); }
 
-void Executor::compile(int device_id, const std::string &plan,
+void Executor::compile(const std::string &plan, int device_id,
                        const std::string &name) {
-    impl_->compile(device_id, plan, name);
+    impl_->compile(plan, device_id, name);
 }
 
 void Executor::launch(Stream stream, bool loop_mode) {
@@ -1062,7 +1062,7 @@ DefaultExecutor::DefaultExecutor(
     for (const auto &rule : config_rules) {
         planner.install_config_rule(rule);
     }
-    compile(device_id, planner.plan(), name);
+    compile(planner.plan(), device_id, name);
     impl_->stream_raw_ = reinterpret_cast<gpuStream>(stream);
     impl_->loop_mode_ = loop_mode;
 }
diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
index e54578dfc..2cc3ee1c2 100644
--- a/ark/api/executor_test.cpp
+++ b/ark/api/executor_test.cpp
@@ -168,12 +168,12 @@ ark::unittest::State test_executor_invalid() {
     ark::Executor exe;
 
     // Invalid device ID.
-    UNITTEST_THROW(exe.compile(-1, ""), ark::InvalidUsageError);
+    UNITTEST_THROW(exe.compile("", -1), ark::InvalidUsageError);
 
     // Invalid rank.
     ark::PlanJson plan;
     plan["Rank"] = 1;
-    UNITTEST_THROW(exe.compile(0, plan.dump()), ark::InvalidUsageError);
+    UNITTEST_THROW(exe.compile(plan.dump(), 0), ark::InvalidUsageError);
 
     return ark::unittest::SUCCESS;
 }
diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp
index 7a7fec523..39c466909 100644
--- a/ark/ops/ops_communication_test.cpp
+++ b/ark/ops/ops_communication_test.cpp
@@ -426,7 +426,7 @@ ark::unittest::State test_communication_send_recv_reduce() {
             ark::Planner planner(model, gpu_id);
             planner.install_config_rule(config_rule);
             ark::Executor exe;
-            exe.compile(gpu_id, planner.plan());
+            exe.compile(planner.plan(), gpu_id);
 
             std::vector<ark::half_t> data(1024);
             std::iota(data.begin(), data.end(), 1.0f);

From 15d423bed4fa399b02aefef5d0a9ed06d78fde7a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 12 Aug 2024 03:39:05 +0000
Subject: [PATCH 66/79] more fixes

---
 ark/api/executor.cpp        | 38 +++++++++++++++++-----------------
 ark/api/executor_test.cpp   |  1 -
 ark/gpu/gpu_kernel.cpp      | 41 ++++++++++++++++++++-----------------
 ark/gpu/gpu_kernel.hpp      | 11 +++++-----
 ark/gpu/gpu_kernel_test.cpp |  4 ++--
 5 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 4505b2a35..626fed808 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -199,6 +199,8 @@ class Executor::Impl {
     int rank_;
     int world_size_;
 
+    std::string kernel_name_;
+
     bool is_launched_ = false;
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
@@ -300,21 +302,9 @@ void Executor::Impl::init(const PlanJson &plan_json, int device_id,
         init_channels(remote_ranks);
     }
 
-    std::string kernel_name;
-    if (loop_mode_) {
-        // should we add an identifier to specify which plan the kernel executes
-        // i.e. ark_loop_kernel_2 for the second plan
-        kernel_name = "ark_loop_kernel";
-    } else {
-        kernel_name = "ark_kernel";
-    }
-    if (!name_.empty()) {
-        kernel_name += "_" + name_;
-    }
-
-    kernel_ = std::shared_ptr<GpuKernel>(new GpuKernel(
-        device_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1},
-        std::max(smem_block_total, size_t(4)), kernel_name));
+    kernel_ = std::shared_ptr<GpuKernel>(
+        new GpuKernel(device_id_, codegen_->code(), {threads_per_block, 1, 1},
+                      {num_sm, 1, 1}, std::max(smem_block_total, size_t(4))));
 }
 
 void Executor::Impl::init_communicator() {
@@ -726,6 +716,17 @@ void Executor::Impl::launch(Stream stream, bool loop_mode) {
     }
     loop_mode_ = loop_mode;
 
+    if (loop_mode_) {
+        // should we add an identifier to specify which plan the kernel executes
+        // i.e. ark_loop_kernel_2 for the second plan
+        kernel_name_ = "ark_loop_kernel";
+    } else {
+        kernel_name_ = "ark_kernel";
+    }
+    if (!name_.empty()) {
+        kernel_name_ += "_" + name_;
+    }
+
     auto get_global_rt = [&](const std::string &symbol) {
         return reinterpret_cast<void *>(kernel_->get_global(symbol));
     };
@@ -786,7 +787,7 @@ void Executor::Impl::launch(Stream stream, bool loop_mode) {
         for (auto &buffer : external_buffers_) {
             args.push_back(&buffer);
         }
-        kernel_->launch(stream_raw_, args);
+        kernel_->launch(kernel_name_, stream_raw_, args);
     }
     is_recording_ = true;
     is_launched_ = true;
@@ -806,7 +807,7 @@ void Executor::Impl::run(int iter) {
             args.push_back(&buffer);
         }
         for (; i < iter; i++) {
-            kernel_->launch(stream_raw_, args);
+            kernel_->launch(kernel_name_, stream_raw_, args);
         }
     }
 }
@@ -822,9 +823,8 @@ void Executor::Impl::wait(int64_t max_spin_count) {
             gpuError res = gpuStreamQuery(stream_raw_);
             if (res == gpuSuccess) {
                 if (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
-                    LOG(WARN,
+                    ERR(InternalError,
                         "Stream is finished but the loop flag is still set.");
-                    break;
                 } else {
                     LOG(WARN,
                         "wait() is delayed by a stream query. Regarding "
diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
index 2cc3ee1c2..c8c96fa6d 100644
--- a/ark/api/executor_test.cpp
+++ b/ark/api/executor_test.cpp
@@ -44,7 +44,6 @@ ark::unittest::State test_executor() {
     }
     {
         ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
-        UNITTEST_THROW(executor.launch(), ark::InvalidUsageError);
 
         executor.launch();
         executor.launch();  // Will be ignored with a warning.
diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp
index d4412f80e..a474b32a7 100644
--- a/ark/gpu/gpu_kernel.cpp
+++ b/ark/gpu/gpu_kernel.cpp
@@ -15,24 +15,18 @@ namespace ark {
 
 GpuKernel::GpuKernel(int gpu_id, const std::string& code,
                      const std::array<int, 3>& block_dim,
-                     const std::array<int, 3>& grid_dim, size_t smem_bytes,
-                     const std::string& kernel_name) {
-    this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name);
+                     const std::array<int, 3>& grid_dim, size_t smem_bytes) {
+    this->init(gpu_id, code, block_dim, grid_dim, smem_bytes);
 }
 
 void GpuKernel::init(int gpu_id, const std::string& code,
                      const std::array<int, 3>& block_dim,
-                     const std::array<int, 3>& grid_dim, size_t smem_bytes,
-                     const std::string& kernel_name) {
+                     const std::array<int, 3>& grid_dim, size_t smem_bytes) {
     gpu_manager_ = GpuManager::get_instance(gpu_id);
     code_ = code;
     block_dim_ = block_dim;
     grid_dim_ = grid_dim;
     smem_bytes_ = smem_bytes;
-    kernel_name_ = kernel_name;
-    if (kernel_name_.size() == 0) {
-        ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_);
-    }
 }
 
 void GpuKernel::compile() {
@@ -45,21 +39,30 @@ void GpuKernel::compile() {
     }
     bin_ = gpu_compile({code_}, gpu_manager_->info().arch, max_reg_cnt);
     GLOG_DRV(gpuModuleLoadData(&module_, bin_.c_str()));
-    GLOG_DRV(gpuModuleGetFunction(&function_, module_, kernel_name_.c_str()));
-
-    int static_smem_size_bytes;
-    GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes,
-                                 gpuFuncAttributeSharedSizeBytes, function_));
-    int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes;
-    GLOG_DRV(gpuFuncSetAttribute(function_,
-                                 gpuFuncAttributeMaxDynamicSharedSizeBytes,
-                                 dynamic_smem_size_bytes));
 }
 
-void GpuKernel::launch(gpuStream stream, std::vector<void*>& args) {
+void GpuKernel::launch(const std::string& kernel_name, gpuStream stream,
+                       std::vector<void*>& args) {
     if (!this->is_compiled()) {
         ERR(InvalidUsageError, "Kernel is not compiled yet.");
     }
+    if (kernel_name.size() == 0) {
+        ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name);
+    }
+    if (kernel_name_ != kernel_name) {
+        GLOG_DRV(
+            gpuModuleGetFunction(&function_, module_, kernel_name.c_str()));
+
+        int static_smem_size_bytes;
+        GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes,
+                                     gpuFuncAttributeSharedSizeBytes,
+                                     function_));
+        int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes;
+        GLOG_DRV(gpuFuncSetAttribute(function_,
+                                     gpuFuncAttributeMaxDynamicSharedSizeBytes,
+                                     dynamic_smem_size_bytes));
+        kernel_name_ = kernel_name;
+    }
     gpu_manager_->launch(function_, grid_dim_, block_dim_, smem_bytes_, stream,
                          args.data(), nullptr);
     GLOG(gpuGetLastError());
diff --git a/ark/gpu/gpu_kernel.hpp b/ark/gpu/gpu_kernel.hpp
index 5308cfead..1e02cc7a1 100644
--- a/ark/gpu/gpu_kernel.hpp
+++ b/ark/gpu/gpu_kernel.hpp
@@ -18,19 +18,18 @@ class GpuKernel {
    public:
     GpuKernel(int gpu_id, const std::string& codes,
               const std::array<int, 3>& block_dim,
-              const std::array<int, 3>& grid_dim, size_t smem_bytes,
-              const std::string& kernel_name);
+              const std::array<int, 3>& grid_dim, size_t smem_bytes);
 
     void init(int gpu_id, const std::string& codes,
               const std::array<int, 3>& block_dim,
-              const std::array<int, 3>& grid_dim, size_t smem_bytes,
-              const std::string& kernel_name);
+              const std::array<int, 3>& grid_dim, size_t smem_bytes);
     void compile();
-    void launch(gpuStream stream, std::vector<void*>& args);
+    void launch(const std::string& kernel_name, gpuStream stream,
+                std::vector<void*>& args);
 
     gpuDeviceptr get_global(const std::string& name,
                             bool ignore_not_found = false) const;
-    bool is_compiled() const { return function_ != nullptr; }
+    bool is_compiled() const { return !bin_.empty(); }
 
    protected:
     std::shared_ptr<GpuManager> gpu_manager_;
diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp
index 342ef9656..10e2410a9 100644
--- a/ark/gpu/gpu_kernel_test.cpp
+++ b/ark/gpu/gpu_kernel_test.cpp
@@ -8,13 +8,13 @@
 const std::string void_kernel = "extern \"C\" __global__ void kernel() {}";
 
 ark::unittest::State test_gpu_kernel() {
-    ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel");
+    ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0);
     UNITTEST_TRUE(!kernel.is_compiled());
     kernel.compile();
     UNITTEST_TRUE(kernel.is_compiled());
     std::vector<void*> args;
     for (int i = 0; i < 10; i++) {
-        kernel.launch(nullptr, args);
+        kernel.launch("kernel", nullptr, args);
     }
     return ark::unittest::SUCCESS;
 }

From 802d84faf2bdff101262d61ebd4cc6992f10d87f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 12 Aug 2024 08:10:22 +0000
Subject: [PATCH 67/79] error handling

---
 ark/api/executor.cpp      | 7 ++++++-
 ark/api/executor_test.cpp | 6 +++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 626fed808..3fcecc12f 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -696,7 +696,12 @@ void Executor::Impl::compile(const std::string &plan, int device_id,
         ERR(InvalidUsageError, "Need to stop before re-compiling.");
         return;
     }
-    init(PlanJson::parse(plan), device_id, name);
+    try {
+        auto plan_json = Json::parse(plan);
+        init(plan_json, device_id, name);
+    } catch (const ::nlohmann::json::parse_error &e) {
+        ERR(InvalidUsageError, "Failed to parse the plan JSON: ", e.what());
+    }
     kernel_->compile();
 }
 
diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
index c8c96fa6d..fd036628f 100644
--- a/ark/api/executor_test.cpp
+++ b/ark/api/executor_test.cpp
@@ -166,8 +166,12 @@ ark::unittest::State test_executor_tensor_read_write_stride_offset() {
 ark::unittest::State test_executor_invalid() {
     ark::Executor exe;
 
+    // Invalid plan.
+    UNITTEST_THROW(exe.compile("not a json", 0), ark::InvalidUsageError);
+
     // Invalid device ID.
-    UNITTEST_THROW(exe.compile("", -1), ark::InvalidUsageError);
+    UNITTEST_THROW(exe.compile(ark::PlanJson().dump(), -1),
+                   ark::InvalidUsageError);
 
     // Invalid rank.
     ark::PlanJson plan;

From 18a391fec8f1b3ac3fa0ddbdd1409f737b89105c Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 12 Aug 2024 09:09:21 +0000
Subject: [PATCH 68/79] fix unit test

---
 ark/api/executor_test.cpp | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
index fd036628f..cf3495780 100644
--- a/ark/api/executor_test.cpp
+++ b/ark/api/executor_test.cpp
@@ -82,7 +82,7 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
     m.noop(tensor);
 
     ark::DefaultExecutor executor(m, 0);
-    executor.launch();
+
     UNITTEST_NE(executor.tensor_address(tensor), nullptr);
 
     // Copy data from CPU array to ARK tensor
@@ -102,20 +102,28 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
         dev_data[i] = -1;
     }
 
+    ark::gpuStream stream;
     UNITTEST_EQ(
-        ark::gpuMemcpy(dev_data.data(), dev_ptr, shape.nelems() * sizeof(float),
-                       ark::gpuMemcpyDeviceToHost),
+        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
         ark::gpuSuccess);
+
+    UNITTEST_EQ(ark::gpuMemcpyAsync(dev_data.data(), dev_ptr,
+                                    shape.nelems() * sizeof(float),
+                                    ark::gpuMemcpyDeviceToHost, stream),
+                ark::gpuSuccess);
+    UNITTEST_EQ(ark::gpuStreamSynchronize(stream), ark::gpuSuccess);
+
     for (size_t i = 0; i < dev_data.size(); ++i) {
         UNITTEST_EQ(dev_data[i], static_cast<float>(i));
         dev_data[i] = -1;
     }
 
     // Copy -1s back to GPU array
-    UNITTEST_EQ(
-        ark::gpuMemcpy(dev_ptr, dev_data.data(), shape.nelems() * sizeof(float),
-                       ark::gpuMemcpyHostToDevice),
-        ark::gpuSuccess);
+    UNITTEST_EQ(ark::gpuMemcpyAsync(dev_ptr, dev_data.data(),
+                                    shape.nelems() * sizeof(float),
+                                    ark::gpuMemcpyHostToDevice, stream),
+                ark::gpuSuccess);
+    UNITTEST_EQ(ark::gpuStreamSynchronize(stream), ark::gpuSuccess);
 
     // Copy data from GPU array to ARK tensor
     executor.tensor_write(tensor, dev_ptr, shape.nelems() * sizeof(float),
@@ -131,10 +139,6 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
     }
 
     // Provide a stream
-    ark::gpuStream stream;
-    UNITTEST_EQ(
-        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
-        ark::gpuSuccess);
     executor.tensor_read(tensor, host_data.data(),
                          shape.nelems() * sizeof(float), stream);
     executor.tensor_write(tensor, host_data.data(),

From 7ae0a65f0ce2aba8e28b825e224d4864e8eb012c Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 12 Aug 2024 20:25:43 +0000
Subject: [PATCH 69/79] updates

---
 ark/api/executor.cpp        | 6 ++----
 ark/env.cpp                 | 4 ++--
 ark/gpu/gpu_kernel_test.cpp | 1 +
 ark/include/ark/error.hpp   | 2 +-
 python/ark/tensor.py        | 5 ++++-
 5 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 3fcecc12f..162aaa1f0 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -3,14 +3,12 @@
 
 #include "ark/executor.hpp"
 
-#include <dlpack/dlpack.h>
-
 #include <cmath>
+#include <list>
 #include <memory>
 #include <mscclpp/core.hpp>
 #include <mscclpp/proxy_channel.hpp>
 #include <mscclpp/sm_channel.hpp>
-#include <tuple>
 
 #include "ark/data_type.hpp"
 #include "ark/model.hpp"
@@ -214,7 +212,7 @@ class Executor::Impl {
     std::shared_ptr<CodeGenerator> codegen_;
     std::shared_ptr<GpuEvent> timer_begin_;
     std::shared_ptr<GpuEvent> timer_end_;
-    std::vector<std::shared_ptr<GpuMemory>> buffers_;
+    std::list<std::shared_ptr<GpuMemory>> buffers_;
     std::shared_ptr<GpuHostMemory> flag_;
     std::shared_ptr<GpuStream> stream_;
     std::shared_ptr<GpuKernel> kernel_;
diff --git a/ark/env.cpp b/ark/env.cpp
index d8322378f..f9e7355ff 100644
--- a/ark/env.cpp
+++ b/ark/env.cpp
@@ -10,11 +10,11 @@
 #define DEFAULT_ARK_LOG_LEVEL "INFO"
 #define DEFAULT_ARK_ROOT "/usr/local/ark"
 #define DEFAULT_ARK_TMP "/tmp/ark"
-#define DEFAULT_ARK_KEEP_TMP true
+#define DEFAULT_ARK_KEEP_TMP false
 #define DEFAULT_ARK_HOSTFILE_NAME "hostfile"
 #define DEFAULT_ARK_NUM_RANKS_PER_HOST 8
 #define DEFAULT_ARK_DISABLE_IB false
-#define DEFAULT_ARK_IGNORE_BINARY_CACHE true
+#define DEFAULT_ARK_IGNORE_BINARY_CACHE false
 #define DEFAULT_ARK_ENFORCE_PLAN_PATH ""
 #define DEFAULT_ARK_MSCCLPP_PORT 50051
 
diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp
index 10e2410a9..7b9f7f176 100644
--- a/ark/gpu/gpu_kernel_test.cpp
+++ b/ark/gpu/gpu_kernel_test.cpp
@@ -13,6 +13,7 @@ ark::unittest::State test_gpu_kernel() {
     kernel.compile();
     UNITTEST_TRUE(kernel.is_compiled());
     std::vector<void*> args;
+    UNITTEST_THROW(kernel.launch("", nullptr, args), ark::InvalidUsageError);
     for (int i = 0; i < 10; i++) {
         kernel.launch("kernel", nullptr, args);
     }
diff --git a/ark/include/ark/error.hpp b/ark/include/ark/error.hpp
index 1fbec0c01..965b1c0bc 100644
--- a/ark/include/ark/error.hpp
+++ b/ark/include/ark/error.hpp
@@ -44,4 +44,4 @@ REGISTER_ERROR_TYPE(UnitTestError)
 
 }  // namespace ark
 
-#endif  // ARK_ERROR_HPP
\ No newline at end of file
+#endif  // ARK_ERROR_HPP
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 3fda8b3b3..9211f7d9d 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -162,7 +162,10 @@ def from_torch(tensor: torch.Tensor) -> "Tensor":
             raise ValueError("Torch tensor must be contiguous.")
         elif tensor.device.type == "cpu":
             raise ValueError("Torch tensor must be on a device.")
-        return Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor))
+        ark_tensor = Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor))
+        # Share ownership of the memory with the torch tensor
+        ark_tensor.__torch_buffer__ = tensor
+        return ark_tensor
 
     def copy(
         self, data: Union[np.ndarray, torch.Tensor], stream: int = 0

From 4cca6099ed496bdb2850ae5a9a2304143472d570 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Mon, 12 Aug 2024 21:01:30 +0000
Subject: [PATCH 70/79] fix unit test

---
 ark/utils/utils_net_test.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ark/utils/utils_net_test.cpp b/ark/utils/utils_net_test.cpp
index 4c3b6f162..95dda890c 100644
--- a/ark/utils/utils_net_test.cpp
+++ b/ark/utils/utils_net_test.cpp
@@ -12,6 +12,7 @@ ark::unittest::State test_ipc_hosts() {
     auto tmp_hostfile = tmp_dir + "/.test_ipc_hostfile";
     ark::write_file(tmp_hostfile, "127.0.0.1\n127.0.0.1\n127.0.0.1\n");
     ::setenv("ARK_HOSTFILE", tmp_hostfile.c_str(), 1);
+    ::setenv("ARK_KEEP_TMP", "1", 1);
     ark::init();
 
     UNITTEST_EQ(ark::get_host(0, true), "127.0.0.1");
@@ -31,6 +32,7 @@ ark::unittest::State test_ipc_hosts_unknown_host() {
     auto tmp_hostfile = tmp_dir + "/.test_ipc_hostfile";
     ark::write_file(tmp_hostfile, "unknown\nunknown\nunknown\n");
     ::setenv("ARK_HOSTFILE", tmp_hostfile.c_str(), 1);
+    ::setenv("ARK_KEEP_TMP", "1", 1);
     ark::init();
 
     UNITTEST_THROW(ark::get_host(0, true), ark::InvalidUsageError);

From 38367fa030cefd1126124ea695a39db3d12bf98e Mon Sep 17 00:00:00 2001
From: noli <nolawigerawork@gmail.com>
Date: Tue, 13 Aug 2024 09:40:57 +0000
Subject: [PATCH 71/79] adds placeholder op

---
 ark/api/executor.cpp             | 75 +++++++++++++++++++-------------
 ark/include/ark/executor.hpp     | 17 +++++---
 ark/include/ark/model.hpp        | 34 ++++++++++++++-
 ark/include/ark/tensor.hpp       | 11 +++++
 ark/model/model_op.cpp           |  2 +
 ark/model_buffer_manager.hpp     | 13 ++++--
 ark/ops/ops_placeholder.cpp      | 57 ++++++++++++++++++++++++
 ark/ops/ops_placeholder.hpp      | 23 ++++++++++
 ark/ops/ops_placeholder_test.cpp | 51 ++++++++++++++++++++++
 9 files changed, 241 insertions(+), 42 deletions(-)
 create mode 100644 ark/ops/ops_placeholder.cpp
 create mode 100644 ark/ops/ops_placeholder.hpp
 create mode 100644 ark/ops/ops_placeholder_test.cpp

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 162aaa1f0..7823c324c 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -26,6 +26,7 @@
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
 #include "model_buffer_manager.hpp"
+#include "unordered_map"
 #include "utils/utils_net.hpp"
 
 #if defined(ARK_CUDA)
@@ -143,7 +144,10 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 
 class Executor::Impl {
    public:
-    Impl() : plan_json_(), device_id_(-1) {};
+    Impl()
+        : plan_json_(),
+          device_id_(-1),
+          buffer_manager_(ModelBufferManager::get_instance()) {};
     ~Impl();
 
     int device_id() const { return device_id_; }
@@ -160,8 +164,10 @@ class Executor::Impl {
 
     void compile(const std::string &plan, int device_id,
                  const std::string &name);
-    void launch(Stream stream, bool loop_mode);
-    void run(int iter);
+    void launch(Stream stream, bool loop_mode,
+                const std::unordered_map<Tensor, void *> &external_tensors);
+    void run(int iter,
+             const std::unordered_map<Tensor, void *> &external_tensors);
     void wait(int64_t max_spin_count);
     float stop(int64_t max_spin_count);
     void barrier();
@@ -203,6 +209,7 @@ class Executor::Impl {
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
 
+    ModelBufferManager &buffer_manager_;
     std::vector<void *> external_buffers_;
     std::vector<std::string> external_args_;
     std::map<size_t, std::string> buffer_id_to_name_;
@@ -408,45 +415,40 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
     for (auto &kv : buffer_id_to_info) {
         auto &buf_info = kv.second;
         int r = buf_info->buffer->rank();
+        const size_t buf_id = buf_info->buffer->id();
         if (r != rank_ && r != -1) {
             // this is a remote buffer
             for (const auto &tag_info : buf_info->buffer->send_tags()) {
                 remote_rank_to_send_tag_to_buffer_id[buf_info->buffer->rank()]
-                                                    [tag_info.second] =
-                                                        buf_info->buffer->id();
+                                                    [tag_info.second] = buf_id;
             }
             for (const auto &tag_info : buf_info->buffer->recv_tags()) {
                 remote_rank_to_recv_tag_to_buffer_id[buf_info->buffer->rank()]
-                                                    [tag_info.second] =
-                                                        buf_info->buffer->id();
+                                                    [tag_info.second] = buf_id;
             }
             continue;
         }
-        if (buf_info->buffer->is_external()) {
+        if (buffer_manager_.is_external(buf_id)) {
             if (buf_info->buffer->device_id() != device_id_) {
                 ERR(InvalidUsageError,
                     "PyTorch tensor and model execution are on different GPUs");
             }
-            external_buffers_.push_back(buf_info->buffer->external_data());
+            external_buffers_.push_back(buffer_manager_.get_buffer(buf_id));
             const auto [it, inserted] = buffer_id_to_name_.try_emplace(
-                buf_info->buffer->id(),
-                "extern_buf_" + std::to_string(buf_info->buffer->id()));
+                buf_id, "extern_buf_" + std::to_string(buf_id));
             external_args_.push_back(it->second);
             continue;
         }
         // if we are adding a plan and come across a buffer from a previous
         // plan, we utilize the buffer offset from the previous plan
-        if (buffer_id_to_offset_.find(buf_info->buffer->id()) !=
-            buffer_id_to_offset_.end()) {
-            external_buffers_.push_back(
-                buffer_id_to_addr_[buf_info->buffer->id()]);
-            const std::string name =
-                "extern_buf_" + std::to_string(buf_info->buffer->id());
+        if (buffer_id_to_offset_.find(buf_id) != buffer_id_to_offset_.end()) {
+            external_buffers_.push_back(buffer_id_to_addr_[buf_id]);
+            const std::string name = "extern_buf_" + std::to_string(buf_id);
             external_args_.push_back(name);
-            buffer_id_to_name_[buf_info->buffer->id()] = name;
+            buffer_id_to_name_[buf_id] = name;
             continue;
         } else {
-            buffer_id_to_offset[buf_info->buffer->id()] = offset;
+            buffer_id_to_offset[buf_id] = offset;
             for (const auto &tag_info : buf_info->buffer->send_tags()) {
                 remote_rank_to_send_tags_and_offsets[tag_info.first]
                     .first.push_back(tag_info.second);
@@ -536,8 +538,9 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 1);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 2);
         for (int i = 0; i < len; ++i) {
-            if (!buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]
-                     ->buffer->is_external()) {
+            const size_t buf_id =
+                buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]->buffer->id();
+            if (!buffer_manager_.is_external(buf_id)) {
                 buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] =
                     offsets[i];
             }
@@ -556,8 +559,9 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 4);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 5);
         for (int i = 0; i < len; ++i) {
-            if (!buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]
-                     ->buffer->is_external()) {
+            const size_t buf_id =
+                buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]->buffer->id();
+            if (!buffer_manager_.is_external(buf_id)) {
                 buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] =
                     offsets[i];
             }
@@ -703,7 +707,9 @@ void Executor::Impl::compile(const std::string &plan, int device_id,
     kernel_->compile();
 }
 
-void Executor::Impl::launch(Stream stream, bool loop_mode) {
+void Executor::Impl::launch(
+    Stream stream, bool loop_mode,
+    const std::unordered_map<Tensor, void *> &external_tensors) {
     if ((kernel_ == nullptr) || !kernel_->is_compiled()) {
         ERR(InvalidUsageError, "Need to compile first before launch.");
     }
@@ -796,7 +802,8 @@ void Executor::Impl::launch(Stream stream, bool loop_mode) {
     is_launched_ = true;
 }
 
-void Executor::Impl::run(int iter) {
+void Executor::Impl::run(
+    int iter, const std::unordered_map<Tensor, void *> &external_tensors) {
     if (iter <= 0) return;
     if (loop_mode_) {
         while (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
@@ -888,7 +895,7 @@ void *Executor::Impl::tensor_address(const Tensor &tensor) const {
 void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
                                  Stream stream, bool is_d2d) const {
     GLOG(gpuSetDevice(device_id_));
-    if (tensor.ref()->buffer()->is_external()) {
+    if (buffer_manager_.is_external(tensor.ref()->buffer()->id())) {
         ERR(InvalidUsageError,
             "Reading data from a tensor preallocated by PyTorch is not "
             "supported. Use PyTorch's native methods.");
@@ -944,7 +951,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data,
                                   size_t bytes, Stream stream,
                                   bool is_d2d) const {
     GLOG(gpuSetDevice(device_id_));
-    if (tensor.ref()->buffer()->is_external()) {
+    if (buffer_manager_.is_external(tensor.ref()->buffer()->id())) {
         ERR(InvalidUsageError,
             "Writing data to a tensor preallocated by PyTorch is not "
             "supported. Use PyTorch's native methods.");
@@ -1019,11 +1026,16 @@ void Executor::compile(const std::string &plan, int device_id,
     impl_->compile(plan, device_id, name);
 }
 
-void Executor::launch(Stream stream, bool loop_mode) {
-    impl_->launch(stream, loop_mode);
+void Executor::launch(
+    Stream stream, bool loop_mode,
+    const std::unordered_map<Tensor, void *> &external_tensors) {
+    impl_->launch(stream, loop_mode, external_tensors);
 }
 
-void Executor::run(int iter) { impl_->run(iter); }
+void Executor::run(int iter,
+                   const std::unordered_map<Tensor, void *> &external_tensors) {
+    impl_->run(iter, external_tensors);
+}
 
 void Executor::wait(int64_t max_spin_count) { impl_->wait(max_spin_count); }
 
@@ -1071,7 +1083,8 @@ DefaultExecutor::DefaultExecutor(
 }
 
 void DefaultExecutor::launch() {
-    Executor::launch(reinterpret_cast<Stream>(impl_->stream_raw_), impl_->loop_mode_);
+    Executor::launch(reinterpret_cast<Stream>(impl_->stream_raw_),
+                     impl_->loop_mode_);
 }
 
 }  // namespace ark
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 8e6577cd2..8e5e5c852 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -9,6 +9,7 @@
 #include <ark/tensor.hpp>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace ark {
@@ -45,10 +46,13 @@ class Executor {
                  const std::string &name = "executor");
 
     /// Launch the executor. This must be called after `compile()`.
-    void launch(Stream stream = nullptr, bool loop_mode = true);
+    void launch(
+        Stream stream = nullptr, bool loop_mode = true,
+        const std::unordered_map<Tensor, void *> &external_tensors = {});
 
     /// Run the executor for `iter` iterations.
-    void run(int iter);
+    void run(int iter,
+             const std::unordered_map<Tensor, void *> &external_tensors = {});
 
     /// Wait for the previous run to finish.
     void wait(int64_t max_spin_count = -1);
@@ -99,10 +103,11 @@ class Model;
 
 class DefaultExecutor : public Executor {
    public:
-    DefaultExecutor(
-        const Model &model, int device_id = -1, Stream stream = nullptr,
-        const std::vector<Planner::ConfigRule> &config_rules = {},
-        const std::string &name = "DefaultExecutor", bool loop_mode = true);
+    DefaultExecutor(const Model &model, int device_id = -1,
+                    Stream stream = nullptr,
+                    const std::vector<Planner::ConfigRule> &config_rules = {},
+                    const std::string &name = "DefaultExecutor",
+                    bool loop_mode = true);
 
     /// Launch the default executor.
     void launch();
diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp
index 3c4f22e22..08b8fe639 100644
--- a/ark/include/ark/model.hpp
+++ b/ark/include/ark/model.hpp
@@ -76,6 +76,39 @@ class Model : public ModelGraph {
                   const Dims &padded_shape = {}, int rank = -1,
                   const std::string &name = "");
 
+    ///
+    /// Returns a tensor object associated with an external buffer.
+    ///
+    /// @param shape Shape of the tensor, where the data of interest is.
+    /// @param dtype Type of the tensor data.
+    /// @param strides Strides of each dimension of the tensor, which may be
+    /// different from the shape. @p strides can be considered as the actual
+    /// shape of the underlying data buffer.
+    /// @param offsets Offsets of the tensor. The data of interest starts at
+    /// @p offsets and ends at @p offsets + @p padded_shape.
+    /// @param padded_shape Padded shape of the tensor. Padding is used to
+    /// reserve extra space for the tensor when computation requires it.
+    /// Data on the padded region is allowed to be accessed by computation,
+    /// but it is not considered as the data of interest. The padded region is
+    /// initialized to zero only once when the Executor is launched. The padded
+    /// shape should be greater than or equal to the @p shape, and the
+    /// @p strides should be greater than or equal to the padded shape. If the
+    /// @p strides are not provided, they are set to the padded shape. If the
+    /// padded shape is not provided, it is set to the @p shape.
+    /// @param rank Rank of the tensor. -1 means the rank of this model.
+    /// @param name Name of the tensor.
+    /// @param external_data Pointer to an external data buffer. If provided,
+    /// this buffer is registered with the ModelBufferManager and associated
+    /// with the tensor.
+    /// @return Pointer to a tensor object that references the external buffer.
+    ///
+    ///
+    Tensor placeholder(const Dims &shape, const DataType &data_type,
+                       const Dims &strides = {}, const Dims &offsets = {},
+                       const Dims &padded_shape = {}, int rank = -1,
+                       const std::string &name = "",
+                       void *external_data = nullptr);
+
     Tensor refer(Tensor input, const Dims &shape = {}, const Dims &strides = {},
                  const Dims &offsets = {}, const Dims &padded_shape = {},
                  const std::string &name = "");
@@ -254,7 +287,6 @@ class Model : public ModelGraph {
 
     Tensor local_all_reduce(Tensor input, int gpu_id, int gpu_num,
                             const std::string &name = "");
-
 };
 
 }  // namespace ark
diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp
index 5e463f99d..816738c07 100644
--- a/ark/include/ark/tensor.hpp
+++ b/ark/include/ark/tensor.hpp
@@ -54,6 +54,8 @@ class Tensor {
     const DataType &data_type() const;
 
     Dims torch_strides() const;
+
+    friend struct std::hash<Tensor>;
 };
 
 const Tensor NullTensor;
@@ -62,4 +64,13 @@ std::ostream &operator<<(std::ostream &os, const Tensor &tensor);
 
 }  // namespace ark
 
+namespace std {
+template <>
+struct hash<ark::Tensor> {
+    size_t operator()(const ark::Tensor &t) const {
+        return hash<size_t>()(t.id());
+    }
+};
+}  // namespace std
+
 #endif  // ARK_TENSOR_HPP
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index 5db8576e8..8f222b75d 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -16,6 +16,7 @@
 #include "ops/ops_math.hpp"
 #include "ops/ops_matmul.hpp"
 #include "ops/ops_noop.hpp"
+#include "ops/ops_placeholder.hpp"
 #include "ops/ops_reduce.hpp"
 #include "ops/ops_refer.hpp"
 #include "ops/ops_reshape.hpp"
@@ -78,6 +79,7 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) {
         MODEL_OP_TYPE_REGISTER(Sqrt);
         MODEL_OP_TYPE_REGISTER(Sub);
         MODEL_OP_TYPE_REGISTER(Tensor);
+        MODEL_OP_TYPE_REGISTER(Placeholder);
         MODEL_OP_TYPE_REGISTER(Transpose);
         MODEL_OP_TYPE_REGISTER(SendPacket);
         MODEL_OP_TYPE_REGISTER(RecvPacket);
diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp
index 4baaec7fe..3e82b05f5 100644
--- a/ark/model_buffer_manager.hpp
+++ b/ark/model_buffer_manager.hpp
@@ -8,7 +8,8 @@
 #include <unordered_map>
 
 namespace ark {
-// Manages externally allocated buffers not in the ARK memory space.
+// Manages externally allocated buffers (buffers corresponding to Tensors that
+// are the output of a `placeholder` operation) outside of ARK's memory space.
 class ModelBufferManager {
    public:
     static ModelBufferManager& get_instance() {
@@ -16,11 +17,11 @@ class ModelBufferManager {
         return instance;
     }
 
-    void register_buffer(size_t id, void* data, size_t size) {
+    void register_buffer(const size_t id, void* const data, const size_t size) {
         buffers_[id] = std::make_tuple(data, size);
     }
 
-    void* get_buffer(size_t id) {
+    void* get_buffer(const size_t id) const {
         auto it = buffers_.find(id);
         if (it != buffers_.end()) {
             return std::get<0>(it->second);
@@ -28,7 +29,7 @@ class ModelBufferManager {
         return nullptr;
     }
 
-    size_t get_buffer_size(size_t id) {
+    size_t get_buffer_size(const size_t id) const {
         auto it = buffers_.find(id);
         if (it != buffers_.end()) {
             return std::get<1>(it->second);
@@ -36,6 +37,10 @@ class ModelBufferManager {
         return 0;
     }
 
+    bool is_external(const size_t id) const {
+        return buffers_.find(id) != buffers_.end();
+    }
+
     const std::unordered_map<size_t, std::tuple<void*, size_t>>& get_buffers()
         const {
         return buffers_;
diff --git a/ark/ops/ops_placeholder.cpp b/ark/ops/ops_placeholder.cpp
new file mode 100644
index 000000000..fbac73902
--- /dev/null
+++ b/ark/ops/ops_placeholder.cpp
@@ -0,0 +1,57 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_placeholder.hpp"
+
+#include "logging.hpp"
+#include "model_buffer_manager.hpp"
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpPlaceholder::ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
+                                       ModelDataType data_type,
+                                       const Dims &strides, const Dims &offsets,
+                                       const Dims &padded_shape,
+                                       void *external_data)
+    : ModelOp("Placeholder", true) {
+    if (!buffer) {
+        buffer = std::make_shared<ModelBuffer>();
+    }
+    const std::vector<DimType> &shape_vec = shape.vector();
+    DataType dtype = ModelDataType(data_type);
+
+    size_t external_data_size =
+        std::accumulate(shape_vec.begin(), shape_vec.end(), 1,
+                        std::multiplies<int64_t>()) *
+        dtype.bytes();
+
+    ModelBufferManager::get_instance().register_buffer(
+        buffer->id(), external_data, external_data_size);
+
+    ModelTensorRef tensor = std::make_shared<ModelTensor>(
+        data_type, buffer, shape, strides, offsets, padded_shape);
+
+    result_tensors_.emplace_back(tensor);
+
+    verify();
+}
+
+Tensor Model::placeholder(const Dims &shape, const DataType &data_type,
+                          const Dims &strides, const Dims &offsets,
+                          const Dims &padded_shape, int rank,
+                          const std::string &name, void *external_data) {
+    if (rank != -1) {
+        if (rank == this->rank()) {
+            rank = -1;
+        } else if (rank < 0 || rank >= this->world_size()) {
+            ERR(ModelError, "Invalid rank %d", rank);
+        }
+    }
+    return impl_
+        ->create_op<ModelOpPlaceholder>(
+            name, std::make_shared<ModelBuffer>(rank), shape, data_type.ref(),
+            strides, offsets, padded_shape, external_data)
+        ->result_tensors()[0];
+}
+}  // namespace ark
\ No newline at end of file
diff --git a/ark/ops/ops_placeholder.hpp b/ark/ops/ops_placeholder.hpp
new file mode 100644
index 000000000..7fb53f983
--- /dev/null
+++ b/ark/ops/ops_placeholder.hpp
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_PLACEHOLDER_HPP_
+#define ARK_OPS_PLACEHOLDER_HPP_
+
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpPlaceholder : public ModelOp {
+   public:
+    ModelOpPlaceholder() = default;
+    ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
+                       ModelDataType data_type, const Dims &strides,
+                       const Dims &offsets, const Dims &padded_shape,
+                       void *external_data = nullptr);
+};
+
+}  // namespace ark
+
+#endif  // ARK_OPS_PLACEHOLDER_HPP_
\ No newline at end of file
diff --git a/ark/ops/ops_placeholder_test.cpp b/ark/ops/ops_placeholder_test.cpp
new file mode 100644
index 000000000..37c047774
--- /dev/null
+++ b/ark/ops/ops_placeholder_test.cpp
@@ -0,0 +1,51 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include <cstring>
+#include <numeric>
+
+#include "ark/executor.hpp"
+#include "gpu/gpu.hpp"
+#include "logging.hpp"
+#include "model/model_node.hpp"
+#include "model/model_op.hpp"
+#include "ops_test_common.hpp"
+
+ark::unittest::State test_ops_placeholder_value_contiguous() {
+    ark::Model model;
+    ark::Dims shape{10, 1};
+
+    // Allocate GPU memory for the external buffer
+    float *d_ext_buffer = nullptr;
+    ark::gpuMalloc(&d_ext_buffer, shape.nelems() * sizeof(float));
+
+    // Initialize GPU Memory
+    std::vector<float> h_ext_buffer(shape.nelems());
+    std::iota(h_ext_buffer.begin(), h_ext_buffer.end(), 1.0f);
+    ark::gpuMemcpy(d_ext_buffer, h_ext_buffer.data(),
+                   shape.nelems() * sizeof(float), ark::gpuMemcpyHostToDevice);
+
+    // Associate the initialzied device buffer with a tensor produced from a
+    // placeholder operation
+    auto tns =
+        model.placeholder(shape, ark::FP32, {}, {}, {}, -1, "", d_ext_buffer);
+
+    // Copy tensor data from GPU to CPU
+    std::vector<float> res(shape.nelems(), 0.0f);
+    ark::gpuMemcpy(res.data(), d_ext_buffer, shape.nelems() * sizeof(float),
+                   ark::gpuMemcpyDeviceToHost);
+
+    for (auto i = 0; i < shape.nelems(); ++i) {
+        UNITTEST_EQ(res[i], i + 1);
+    }
+
+    cudaFree(d_ext_buffer);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    ark::init();
+    UNITTEST(test_ops_placeholder_value_contiguous);
+    return ark::unittest::SUCCESS;
+}
\ No newline at end of file

From 920807f2a22afc28cc80b3904cdfe343753a5cfa Mon Sep 17 00:00:00 2001
From: noli <nolawigerawork@gmail.com>
Date: Tue, 13 Aug 2024 10:28:46 +0000
Subject: [PATCH 72/79] fix test

---
 ark/ops/ops_placeholder_test.cpp | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/ark/ops/ops_placeholder_test.cpp b/ark/ops/ops_placeholder_test.cpp
index 37c047774..59f5e2dc0 100644
--- a/ark/ops/ops_placeholder_test.cpp
+++ b/ark/ops/ops_placeholder_test.cpp
@@ -1,13 +1,9 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <cstring>
-#include <numeric>
-
 #include "ark/executor.hpp"
 #include "gpu/gpu.hpp"
 #include "logging.hpp"
-#include "model/model_node.hpp"
 #include "model/model_op.hpp"
 #include "ops_test_common.hpp"
 
@@ -25,18 +21,25 @@ ark::unittest::State test_ops_placeholder_value_contiguous() {
     ark::gpuMemcpy(d_ext_buffer, h_ext_buffer.data(),
                    shape.nelems() * sizeof(float), ark::gpuMemcpyHostToDevice);
 
-    // Associate the initialzied device buffer with a tensor produced from a
+    // Associate the initialized device buffer with a tensor produced from a
     // placeholder operation
-    auto tns =
+    ark::Tensor tns =
         model.placeholder(shape, ark::FP32, {}, {}, {}, -1, "", d_ext_buffer);
 
+    ark::Tensor res = model.add(tns, 1.0);
+
+    ark::DefaultExecutor exe(model);
+
+    exe.launch();
+    exe.run(1);
+    exe.stop();
+
     // Copy tensor data from GPU to CPU
-    std::vector<float> res(shape.nelems(), 0.0f);
-    ark::gpuMemcpy(res.data(), d_ext_buffer, shape.nelems() * sizeof(float),
-                   ark::gpuMemcpyDeviceToHost);
+    std::vector<float> h_res(shape.nelems(), 0.0f);
+    exe.tensor_read(res, h_res);
 
     for (auto i = 0; i < shape.nelems(); ++i) {
-        UNITTEST_EQ(res[i], i + 1);
+        UNITTEST_EQ(h_res[i], i + 2);
     }
 
     cudaFree(d_ext_buffer);

From 45b14b886571aa35f41fa5fc51ad97c3dd0b4ac1 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 14 Aug 2024 10:07:28 +0000
Subject: [PATCH 73/79] fix imports & pytest

---
 .dockerignore                   |  1 +
 .github/workflows/codeql.yml    |  4 ++--
 .github/workflows/ut-cuda.yml   |  8 ++++++--
 python/CMakeLists.txt           |  5 ++++-
 python/ark/__init__.py          | 10 +---------
 python/ark/data_type.py         |  2 +-
 python/ark/error.py             | 16 ++++++++--------
 python/ark/init.py              |  2 +-
 python/ark/model.py             |  2 +-
 python/ark/planner.py           |  2 +-
 python/ark/runtime.py           |  2 +-
 python/ark/tensor.py            |  2 +-
 python/unittest/test.py         |  6 ------
 python/unittest/test_error.py   |  4 ++--
 python/unittest/test_model.py   |  5 ++---
 python/unittest/test_runtime.py |  8 ++++----
 python/unittest/test_tensor.py  |  3 +--
 17 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index e47f48873..60583dbf9 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -6,6 +6,7 @@ build/
 *.pyc
 *.pyo
 *.pyd
+.pytest_cache/
 
 # Git
 **/.git
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 509ac6d48..7ac2f1649 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -49,7 +49,7 @@ jobs:
     - name: Build
       run: |
         mkdir build && cd build
-        cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF ..
+        cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF ..
         make build ark_py
 
     - name: Perform CodeQL Analysis
@@ -95,7 +95,7 @@ jobs:
     - name: Build
       run: |
         mkdir build && cd build
-        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF ..
+        CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF ..
         make -j build ark_py
 
     - name: Perform CodeQL Analysis
diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml
index 363f1b771..3fa92605e 100644
--- a/.github/workflows/ut-cuda.yml
+++ b/.github/workflows/ut-cuda.yml
@@ -44,7 +44,7 @@ jobs:
       - name: Build
         run: |
           mkdir build && cd build
-          cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON ..
+          cmake -DCMAKE_BUILD_TYPE=Debug ..
           make -j ut ark_py
 
       - name: Run C++ UT
@@ -71,7 +71,11 @@ jobs:
       - name: Run Python UT
         run: |
           cd build
-          ARK_ROOT=$PWD pytest --cov=../python/ark --cov-report lcov:py_coverage.info --verbose ../python/unittest/test.py
+          PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \
+              --cov=../python/ark \
+              --cov-report lcov:py_coverage.info \
+              --verbose \
+              ../python/unittest/test.py
 
       - name: Report Coverage
         env:
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 2e160f8d1..597388e2d 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -18,7 +18,10 @@ FetchContent_MakeAvailable(pybind11)
 
 file(GLOB_RECURSE BIND_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 pybind11_add_module(ark_py ${BIND_SOURCES})
-set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core)
+set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ark)
+add_custom_command(TARGET ark_py POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/ark ${CMAKE_CURRENT_BINARY_DIR}/ark
+)
 target_link_libraries(ark_py PRIVATE ark_static)
 target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS})
 target_include_directories(ark_py PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../ark)
diff --git a/python/ark/__init__.py b/python/ark/__init__.py
index 939c4837f..1aebfa43f 100644
--- a/python/ark/__init__.py
+++ b/python/ark/__init__.py
@@ -1,15 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import sys
 import os
 
 if os.environ.get("ARK_ROOT", None) is None:
     os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__))
 
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-
-import _ark_core
+from . import _ark_core
 from .model import Model
 
 
@@ -21,11 +18,6 @@ def version():
     return __version__
 
 
-def srand(seed):
-    """Sets the seed for random number generation."""
-    _ark_core.srand(seed)
-
-
 def set_rank(rank):
     """Sets the rank of the current process."""
     Model.set_rank(rank)
diff --git a/python/ark/data_type.py b/python/ark/data_type.py
index 41c4201c3..8ab982106 100644
--- a/python/ark/data_type.py
+++ b/python/ark/data_type.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 import numpy
-import _ark_core
+from . import _ark_core
 
 try:
     import torch
diff --git a/python/ark/error.py b/python/ark/error.py
index 4ffe6a3f8..cec8ab137 100644
--- a/python/ark/error.py
+++ b/python/ark/error.py
@@ -1,14 +1,14 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from _ark_core import _BaseError as BaseError
-from _ark_core import _InternalError as InternalError
-from _ark_core import _InvalidUsageError as InvalidUsageError
-from _ark_core import _ModelError as ModelError
-from _ark_core import _PlanError as PlanError
-from _ark_core import _UnsupportedError as UnsupportedError
-from _ark_core import _SystemError as SystemError
-from _ark_core import _GpuError as GpuError
+from ._ark_core import _BaseError as BaseError
+from ._ark_core import _InternalError as InternalError
+from ._ark_core import _InvalidUsageError as InvalidUsageError
+from ._ark_core import _ModelError as ModelError
+from ._ark_core import _PlanError as PlanError
+from ._ark_core import _UnsupportedError as UnsupportedError
+from ._ark_core import _SystemError as SystemError
+from ._ark_core import _GpuError as GpuError
 
 __all__ = [
     "BaseError",
diff --git a/python/ark/init.py b/python/ark/init.py
index a4a67e85d..7daa0771b 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -1,7 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import _ark_core
+from . import _ark_core
 from .model import Model
 from .runtime import _RuntimeState
 
diff --git a/python/ark/model.py b/python/ark/model.py
index e6208fc16..87af88f49 100644
--- a/python/ark/model.py
+++ b/python/ark/model.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT license.
 
 from typing import NewType
-from _ark_core import _Model
+from ._ark_core import _Model
 
 _ModelState = NewType("_ModelState", None)
 
diff --git a/python/ark/planner.py b/python/ark/planner.py
index e7eb2e7ed..e5291bbce 100644
--- a/python/ark/planner.py
+++ b/python/ark/planner.py
@@ -5,7 +5,7 @@
 import json
 from typing import Callable, Dict, List, Any
 
-from _ark_core import _Planner, _PlannerContext
+from ._ark_core import _Planner, _PlannerContext
 from .model import Model
 
 
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 1490cdeb8..fa953a873 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -4,7 +4,7 @@
 import logging
 from enum import Enum
 
-from _ark_core import _Executor
+from ._ark_core import _Executor
 from .planner import Planner, Plan
 
 
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 9211f7d9d..45a54d169 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -4,7 +4,7 @@
 import numpy as np
 from typing import Callable, List, Union, Type
 
-from _ark_core import _Dims, _Tensor, _NullTensor
+from ._ark_core import _Dims, _Tensor, _NullTensor
 from .data_type import DataType
 from .runtime import Runtime
 from .model import Model
diff --git a/python/unittest/test.py b/python/unittest/test.py
index d56932b83..e8f22fdae 100644
--- a/python/unittest/test.py
+++ b/python/unittest/test.py
@@ -1,12 +1,6 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import sys
-import os
-
-sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + "/..")
-sys.path.insert(0, os.environ.get("ARK_ROOT", ".") + "/python")
-
 from test_error import *
 from test_model import *
 from test_runtime import *
diff --git a/python/unittest/test_error.py b/python/unittest/test_error.py
index 299e2675e..115dd1a15 100644
--- a/python/unittest/test_error.py
+++ b/python/unittest/test_error.py
@@ -1,11 +1,11 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import ark
+from unittest_common import ark, pytest_ark
 
 
+@pytest_ark()
 def test_error():
-    ark.init()
     try:
         ark.tensor([0])
     except ark.BaseError as e:
diff --git a/python/unittest/test_model.py b/python/unittest/test_model.py
index da8ae399a..d65191e54 100644
--- a/python/unittest/test_model.py
+++ b/python/unittest/test_model.py
@@ -1,13 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import ark
+from unittest_common import ark, pytest_ark
 import json
 
 
+@pytest_ark()
 def test_model():
-    ark.init()
-
     input_tensor = ark.tensor([64, 64], ark.fp16)
     other_tensor = ark.tensor([64, 64], ark.fp16)
     ark.add(input_tensor, other_tensor)
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index 356430d9a..dd8064d85 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -1,12 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-import ark
+from unittest_common import ark, pytest_ark
 import numpy as np
 
 
+@pytest_ark()
 def test_runtime_relaunch():
-    ark.init()
     with ark.Runtime.get_runtime() as rt:
         assert rt.launched() == False
         rt.launch()
@@ -18,8 +18,8 @@ def test_runtime_relaunch():
         assert rt.launched() == True
 
 
+@pytest_ark()
 def test_runtime_init():
-    ark.init()
     M, N = 64, 64
     input_tensor = ark.tensor([M, N], ark.fp16)
     other_tensor = ark.tensor([M, N], ark.fp16)
@@ -51,8 +51,8 @@ def test_runtime_init():
     runtime.reset()
 
 
+@pytest_ark()
 def test_runtime_reuse_plans():
-    ark.init()
     M, N = 64, 64
     input_tensor = ark.tensor([M, N], ark.fp16)
     other_tensor = ark.tensor([M, N], ark.fp16)
diff --git a/python/unittest/test_tensor.py b/python/unittest/test_tensor.py
index 1acad43ee..213264e3b 100644
--- a/python/unittest/test_tensor.py
+++ b/python/unittest/test_tensor.py
@@ -1,8 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from unittest_common import pytest_ark
-import ark
+from unittest_common import ark, pytest_ark
 
 
 @pytest_ark(need_torch=True)

From f7c6867ca0ffe7d5e1b47495d9a0286252270e5a Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 15 Aug 2024 09:02:54 +0000
Subject: [PATCH 74/79] fix codecov

---
 .github/workflows/ut-cuda.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml
index 3fa92605e..10b0679da 100644
--- a/.github/workflows/ut-cuda.yml
+++ b/.github/workflows/ut-cuda.yml
@@ -72,7 +72,7 @@ jobs:
         run: |
           cd build
           PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \
-              --cov=../python/ark \
+              --cov=python/ark \
               --cov-report lcov:py_coverage.info \
               --verbose \
               ../python/unittest/test.py

From 2d51327052d54f553776a49f92da703e65efbee5 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 15 Aug 2024 09:09:27 +0000
Subject: [PATCH 75/79] minor

---
 ark/cpu_timer.cpp | 16 ----------------
 ark/cpu_timer.h   |  4 ----
 2 files changed, 20 deletions(-)

diff --git a/ark/cpu_timer.cpp b/ark/cpu_timer.cpp
index c740de5f3..129ba7bd2 100644
--- a/ark/cpu_timer.cpp
+++ b/ark/cpu_timer.cpp
@@ -16,20 +16,4 @@ double cpu_timer(void) {
     return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec;
 }
 
-// Sleep in second.
-int cpu_timer_sleep(double sec) {
-    struct timespec tspec;
-    tspec.tv_sec = (time_t)sec;
-    tspec.tv_nsec = (long)((sec - tspec.tv_sec) * 1.0e9);
-    return nanosleep(&tspec, 0);
-}
-
-// Sleep in nanosecond.
-int cpu_ntimer_sleep(long nsec) {
-    struct timespec tspec;
-    tspec.tv_sec = 0;
-    tspec.tv_nsec = nsec;
-    return nanosleep(&tspec, 0);
-}
-
 }  // namespace ark
diff --git a/ark/cpu_timer.h b/ark/cpu_timer.h
index 52bf63d92..eaac94061 100644
--- a/ark/cpu_timer.h
+++ b/ark/cpu_timer.h
@@ -8,10 +8,6 @@ namespace ark {
 
 // Measure current time in second.
 double cpu_timer(void);
-// Sleep in second.
-int cpu_timer_sleep(double sec);
-// Sleep in nanosecond.
-int cpu_ntimer_sleep(long nsec);
 
 }  // namespace ark
 

From 7d62f0f8241bfc654237c2d9a405e4218f3128ed Mon Sep 17 00:00:00 2001
From: Noli Gerawork <ngerawor@andrew.cmu.edu>
Date: Thu, 15 Aug 2024 05:11:13 -0400
Subject: [PATCH 76/79] Add Placeholder Operator (#238)

- Separates externally allocated buffers from `ModelBuffer` by having
`ModelBufferManager` manage them instead.
- Adds the `placeholder` operation. `placeholder` is a virtual operation
that produces a `Tensor` with the added feature of providing a data
pointer (which can be null to support delayed binding) to an external
buffer.
---
 ark/api/executor.cpp             | 75 +++++++++++++++++++-------------
 ark/include/ark/executor.hpp     | 17 +++++---
 ark/include/ark/model.hpp        | 34 ++++++++++++++-
 ark/include/ark/tensor.hpp       | 11 +++++
 ark/model/model_op.cpp           |  2 +
 ark/model_buffer_manager.hpp     | 13 ++++--
 ark/ops/ops_placeholder.cpp      | 57 ++++++++++++++++++++++++
 ark/ops/ops_placeholder.hpp      | 23 ++++++++++
 ark/ops/ops_placeholder_test.cpp | 54 +++++++++++++++++++++++
 9 files changed, 244 insertions(+), 42 deletions(-)
 create mode 100644 ark/ops/ops_placeholder.cpp
 create mode 100644 ark/ops/ops_placeholder.hpp
 create mode 100644 ark/ops/ops_placeholder_test.cpp

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 162aaa1f0..7823c324c 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -26,6 +26,7 @@
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
 #include "model_buffer_manager.hpp"
+#include "unordered_map"
 #include "utils/utils_net.hpp"
 
 #if defined(ARK_CUDA)
@@ -143,7 +144,10 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 
 class Executor::Impl {
    public:
-    Impl() : plan_json_(), device_id_(-1) {};
+    Impl()
+        : plan_json_(),
+          device_id_(-1),
+          buffer_manager_(ModelBufferManager::get_instance()) {};
     ~Impl();
 
     int device_id() const { return device_id_; }
@@ -160,8 +164,10 @@ class Executor::Impl {
 
     void compile(const std::string &plan, int device_id,
                  const std::string &name);
-    void launch(Stream stream, bool loop_mode);
-    void run(int iter);
+    void launch(Stream stream, bool loop_mode,
+                const std::unordered_map<Tensor, void *> &external_tensors);
+    void run(int iter,
+             const std::unordered_map<Tensor, void *> &external_tensors);
     void wait(int64_t max_spin_count);
     float stop(int64_t max_spin_count);
     void barrier();
@@ -203,6 +209,7 @@ class Executor::Impl {
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
 
+    ModelBufferManager &buffer_manager_;
     std::vector<void *> external_buffers_;
     std::vector<std::string> external_args_;
     std::map<size_t, std::string> buffer_id_to_name_;
@@ -408,45 +415,40 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
     for (auto &kv : buffer_id_to_info) {
         auto &buf_info = kv.second;
         int r = buf_info->buffer->rank();
+        const size_t buf_id = buf_info->buffer->id();
         if (r != rank_ && r != -1) {
             // this is a remote buffer
             for (const auto &tag_info : buf_info->buffer->send_tags()) {
                 remote_rank_to_send_tag_to_buffer_id[buf_info->buffer->rank()]
-                                                    [tag_info.second] =
-                                                        buf_info->buffer->id();
+                                                    [tag_info.second] = buf_id;
             }
             for (const auto &tag_info : buf_info->buffer->recv_tags()) {
                 remote_rank_to_recv_tag_to_buffer_id[buf_info->buffer->rank()]
-                                                    [tag_info.second] =
-                                                        buf_info->buffer->id();
+                                                    [tag_info.second] = buf_id;
             }
             continue;
         }
-        if (buf_info->buffer->is_external()) {
+        if (buffer_manager_.is_external(buf_id)) {
             if (buf_info->buffer->device_id() != device_id_) {
                 ERR(InvalidUsageError,
                     "PyTorch tensor and model execution are on different GPUs");
             }
-            external_buffers_.push_back(buf_info->buffer->external_data());
+            external_buffers_.push_back(buffer_manager_.get_buffer(buf_id));
             const auto [it, inserted] = buffer_id_to_name_.try_emplace(
-                buf_info->buffer->id(),
-                "extern_buf_" + std::to_string(buf_info->buffer->id()));
+                buf_id, "extern_buf_" + std::to_string(buf_id));
             external_args_.push_back(it->second);
             continue;
         }
         // if we are adding a plan and come across a buffer from a previous
         // plan, we utilize the buffer offset from the previous plan
-        if (buffer_id_to_offset_.find(buf_info->buffer->id()) !=
-            buffer_id_to_offset_.end()) {
-            external_buffers_.push_back(
-                buffer_id_to_addr_[buf_info->buffer->id()]);
-            const std::string name =
-                "extern_buf_" + std::to_string(buf_info->buffer->id());
+        if (buffer_id_to_offset_.find(buf_id) != buffer_id_to_offset_.end()) {
+            external_buffers_.push_back(buffer_id_to_addr_[buf_id]);
+            const std::string name = "extern_buf_" + std::to_string(buf_id);
             external_args_.push_back(name);
-            buffer_id_to_name_[buf_info->buffer->id()] = name;
+            buffer_id_to_name_[buf_id] = name;
             continue;
         } else {
-            buffer_id_to_offset[buf_info->buffer->id()] = offset;
+            buffer_id_to_offset[buf_id] = offset;
             for (const auto &tag_info : buf_info->buffer->send_tags()) {
                 remote_rank_to_send_tags_and_offsets[tag_info.first]
                     .first.push_back(tag_info.second);
@@ -536,8 +538,9 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 1);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 2);
         for (int i = 0; i < len; ++i) {
-            if (!buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]
-                     ->buffer->is_external()) {
+            const size_t buf_id =
+                buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]->buffer->id();
+            if (!buffer_manager_.is_external(buf_id)) {
                 buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] =
                     offsets[i];
             }
@@ -556,8 +559,9 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 4);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 5);
         for (int i = 0; i < len; ++i) {
-            if (!buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]
-                     ->buffer->is_external()) {
+            const size_t buf_id =
+                buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]->buffer->id();
+            if (!buffer_manager_.is_external(buf_id)) {
                 buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] =
                     offsets[i];
             }
@@ -703,7 +707,9 @@ void Executor::Impl::compile(const std::string &plan, int device_id,
     kernel_->compile();
 }
 
-void Executor::Impl::launch(Stream stream, bool loop_mode) {
+void Executor::Impl::launch(
+    Stream stream, bool loop_mode,
+    const std::unordered_map<Tensor, void *> &external_tensors) {
     if ((kernel_ == nullptr) || !kernel_->is_compiled()) {
         ERR(InvalidUsageError, "Need to compile first before launch.");
     }
@@ -796,7 +802,8 @@ void Executor::Impl::launch(Stream stream, bool loop_mode) {
     is_launched_ = true;
 }
 
-void Executor::Impl::run(int iter) {
+void Executor::Impl::run(
+    int iter, const std::unordered_map<Tensor, void *> &external_tensors) {
     if (iter <= 0) return;
     if (loop_mode_) {
         while (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
@@ -888,7 +895,7 @@ void *Executor::Impl::tensor_address(const Tensor &tensor) const {
 void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
                                  Stream stream, bool is_d2d) const {
     GLOG(gpuSetDevice(device_id_));
-    if (tensor.ref()->buffer()->is_external()) {
+    if (buffer_manager_.is_external(tensor.ref()->buffer()->id())) {
         ERR(InvalidUsageError,
             "Reading data from a tensor preallocated by PyTorch is not "
             "supported. Use PyTorch's native methods.");
@@ -944,7 +951,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data,
                                   size_t bytes, Stream stream,
                                   bool is_d2d) const {
     GLOG(gpuSetDevice(device_id_));
-    if (tensor.ref()->buffer()->is_external()) {
+    if (buffer_manager_.is_external(tensor.ref()->buffer()->id())) {
         ERR(InvalidUsageError,
             "Writing data to a tensor preallocated by PyTorch is not "
             "supported. Use PyTorch's native methods.");
@@ -1019,11 +1026,16 @@ void Executor::compile(const std::string &plan, int device_id,
     impl_->compile(plan, device_id, name);
 }
 
-void Executor::launch(Stream stream, bool loop_mode) {
-    impl_->launch(stream, loop_mode);
+void Executor::launch(
+    Stream stream, bool loop_mode,
+    const std::unordered_map<Tensor, void *> &external_tensors) {
+    impl_->launch(stream, loop_mode, external_tensors);
 }
 
-void Executor::run(int iter) { impl_->run(iter); }
+void Executor::run(int iter,
+                   const std::unordered_map<Tensor, void *> &external_tensors) {
+    impl_->run(iter, external_tensors);
+}
 
 void Executor::wait(int64_t max_spin_count) { impl_->wait(max_spin_count); }
 
@@ -1071,7 +1083,8 @@ DefaultExecutor::DefaultExecutor(
 }
 
 void DefaultExecutor::launch() {
-    Executor::launch(reinterpret_cast<Stream>(impl_->stream_raw_), impl_->loop_mode_);
+    Executor::launch(reinterpret_cast<Stream>(impl_->stream_raw_),
+                     impl_->loop_mode_);
 }
 
 }  // namespace ark
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 8e6577cd2..8e5e5c852 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -9,6 +9,7 @@
 #include <ark/tensor.hpp>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 namespace ark {
@@ -45,10 +46,13 @@ class Executor {
                  const std::string &name = "executor");
 
     /// Launch the executor. This must be called after `compile()`.
-    void launch(Stream stream = nullptr, bool loop_mode = true);
+    void launch(
+        Stream stream = nullptr, bool loop_mode = true,
+        const std::unordered_map<Tensor, void *> &external_tensors = {});
 
     /// Run the executor for `iter` iterations.
-    void run(int iter);
+    void run(int iter,
+             const std::unordered_map<Tensor, void *> &external_tensors = {});
 
     /// Wait for the previous run to finish.
     void wait(int64_t max_spin_count = -1);
@@ -99,10 +103,11 @@ class Model;
 
 class DefaultExecutor : public Executor {
    public:
-    DefaultExecutor(
-        const Model &model, int device_id = -1, Stream stream = nullptr,
-        const std::vector<Planner::ConfigRule> &config_rules = {},
-        const std::string &name = "DefaultExecutor", bool loop_mode = true);
+    DefaultExecutor(const Model &model, int device_id = -1,
+                    Stream stream = nullptr,
+                    const std::vector<Planner::ConfigRule> &config_rules = {},
+                    const std::string &name = "DefaultExecutor",
+                    bool loop_mode = true);
 
     /// Launch the default executor.
     void launch();
diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp
index 3c4f22e22..08b8fe639 100644
--- a/ark/include/ark/model.hpp
+++ b/ark/include/ark/model.hpp
@@ -76,6 +76,39 @@ class Model : public ModelGraph {
                   const Dims &padded_shape = {}, int rank = -1,
                   const std::string &name = "");
 
+    ///
+    /// Returns a tensor object associated with an external buffer.
+    ///
+    /// @param shape Shape of the tensor, where the data of interest is.
+    /// @param dtype Type of the tensor data.
+    /// @param strides Strides of each dimension of the tensor, which may be
+    /// different from the shape. @p strides can be considered as the actual
+    /// shape of the underlying data buffer.
+    /// @param offsets Offsets of the tensor. The data of interest starts at
+    /// @p offsets and ends at @p offsets + @p padded_shape.
+    /// @param padded_shape Padded shape of the tensor. Padding is used to
+    /// reserve extra space for the tensor when computation requires it.
+    /// Data on the padded region is allowed to be accessed by computation,
+    /// but it is not considered as the data of interest. The padded region is
+    /// initialized to zero only once when the Executor is launched. The padded
+    /// shape should be greater than or equal to the @p shape, and the
+    /// @p strides should be greater than or equal to the padded shape. If the
+    /// @p strides are not provided, they are set to the padded shape. If the
+    /// padded shape is not provided, it is set to the @p shape.
+    /// @param rank Rank of the tensor. -1 means the rank of this model.
+    /// @param name Name of the tensor.
+    /// @param external_data Pointer to an external data buffer. If provided,
+    /// this buffer is registered with the ModelBufferManager and associated
+    /// with the tensor.
+    /// @return Pointer to a tensor object that references the external buffer.
+    ///
+    ///
+    Tensor placeholder(const Dims &shape, const DataType &data_type,
+                       const Dims &strides = {}, const Dims &offsets = {},
+                       const Dims &padded_shape = {}, int rank = -1,
+                       const std::string &name = "",
+                       void *external_data = nullptr);
+
     Tensor refer(Tensor input, const Dims &shape = {}, const Dims &strides = {},
                  const Dims &offsets = {}, const Dims &padded_shape = {},
                  const std::string &name = "");
@@ -254,7 +287,6 @@ class Model : public ModelGraph {
 
     Tensor local_all_reduce(Tensor input, int gpu_id, int gpu_num,
                             const std::string &name = "");
-
 };
 
 }  // namespace ark
diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp
index 5e463f99d..816738c07 100644
--- a/ark/include/ark/tensor.hpp
+++ b/ark/include/ark/tensor.hpp
@@ -54,6 +54,8 @@ class Tensor {
     const DataType &data_type() const;
 
     Dims torch_strides() const;
+
+    friend struct std::hash<Tensor>;
 };
 
 const Tensor NullTensor;
@@ -62,4 +64,13 @@ std::ostream &operator<<(std::ostream &os, const Tensor &tensor);
 
 }  // namespace ark
 
+namespace std {
+template <>
+struct hash<ark::Tensor> {
+    size_t operator()(const ark::Tensor &t) const {
+        return hash<size_t>()(t.id());
+    }
+};
+}  // namespace std
+
 #endif  // ARK_TENSOR_HPP
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index 5db8576e8..8f222b75d 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -16,6 +16,7 @@
 #include "ops/ops_math.hpp"
 #include "ops/ops_matmul.hpp"
 #include "ops/ops_noop.hpp"
+#include "ops/ops_placeholder.hpp"
 #include "ops/ops_reduce.hpp"
 #include "ops/ops_refer.hpp"
 #include "ops/ops_reshape.hpp"
@@ -78,6 +79,7 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) {
         MODEL_OP_TYPE_REGISTER(Sqrt);
         MODEL_OP_TYPE_REGISTER(Sub);
         MODEL_OP_TYPE_REGISTER(Tensor);
+        MODEL_OP_TYPE_REGISTER(Placeholder);
         MODEL_OP_TYPE_REGISTER(Transpose);
         MODEL_OP_TYPE_REGISTER(SendPacket);
         MODEL_OP_TYPE_REGISTER(RecvPacket);
diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp
index 4baaec7fe..3e82b05f5 100644
--- a/ark/model_buffer_manager.hpp
+++ b/ark/model_buffer_manager.hpp
@@ -8,7 +8,8 @@
 #include <unordered_map>
 
 namespace ark {
-// Manages externally allocated buffers not in the ARK memory space.
+// Manages externally allocated buffers (buffers corresponding to Tensors that
+// are the output of a `placeholder` operation) outside of ARK's memory space.
 class ModelBufferManager {
    public:
     static ModelBufferManager& get_instance() {
@@ -16,11 +17,11 @@ class ModelBufferManager {
         return instance;
     }
 
-    void register_buffer(size_t id, void* data, size_t size) {
+    void register_buffer(const size_t id, void* const data, const size_t size) {
         buffers_[id] = std::make_tuple(data, size);
     }
 
-    void* get_buffer(size_t id) {
+    void* get_buffer(const size_t id) const {
         auto it = buffers_.find(id);
         if (it != buffers_.end()) {
             return std::get<0>(it->second);
@@ -28,7 +29,7 @@ class ModelBufferManager {
         return nullptr;
     }
 
-    size_t get_buffer_size(size_t id) {
+    size_t get_buffer_size(const size_t id) const {
         auto it = buffers_.find(id);
         if (it != buffers_.end()) {
             return std::get<1>(it->second);
@@ -36,6 +37,10 @@ class ModelBufferManager {
         return 0;
     }
 
+    bool is_external(const size_t id) const {
+        return buffers_.find(id) != buffers_.end();
+    }
+
     const std::unordered_map<size_t, std::tuple<void*, size_t>>& get_buffers()
         const {
         return buffers_;
diff --git a/ark/ops/ops_placeholder.cpp b/ark/ops/ops_placeholder.cpp
new file mode 100644
index 000000000..fbac73902
--- /dev/null
+++ b/ark/ops/ops_placeholder.cpp
@@ -0,0 +1,57 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ops_placeholder.hpp"
+
+#include "logging.hpp"
+#include "model_buffer_manager.hpp"
+#include "ops_common.hpp"
+
+namespace ark {
+
+ModelOpPlaceholder::ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
+                                       ModelDataType data_type,
+                                       const Dims &strides, const Dims &offsets,
+                                       const Dims &padded_shape,
+                                       void *external_data)
+    : ModelOp("Placeholder", true) {
+    if (!buffer) {
+        buffer = std::make_shared<ModelBuffer>();
+    }
+    const std::vector<DimType> &shape_vec = shape.vector();
+    DataType dtype = ModelDataType(data_type);
+
+    size_t external_data_size =
+        std::accumulate(shape_vec.begin(), shape_vec.end(), 1,
+                        std::multiplies<int64_t>()) *
+        dtype.bytes();
+
+    ModelBufferManager::get_instance().register_buffer(
+        buffer->id(), external_data, external_data_size);
+
+    ModelTensorRef tensor = std::make_shared<ModelTensor>(
+        data_type, buffer, shape, strides, offsets, padded_shape);
+
+    result_tensors_.emplace_back(tensor);
+
+    verify();
+}
+
+Tensor Model::placeholder(const Dims &shape, const DataType &data_type,
+                          const Dims &strides, const Dims &offsets,
+                          const Dims &padded_shape, int rank,
+                          const std::string &name, void *external_data) {
+    if (rank != -1) {
+        if (rank == this->rank()) {
+            rank = -1;
+        } else if (rank < 0 || rank >= this->world_size()) {
+            ERR(ModelError, "Invalid rank %d", rank);
+        }
+    }
+    return impl_
+        ->create_op<ModelOpPlaceholder>(
+            name, std::make_shared<ModelBuffer>(rank), shape, data_type.ref(),
+            strides, offsets, padded_shape, external_data)
+        ->result_tensors()[0];
+}
+}  // namespace ark
\ No newline at end of file
diff --git a/ark/ops/ops_placeholder.hpp b/ark/ops/ops_placeholder.hpp
new file mode 100644
index 000000000..7fb53f983
--- /dev/null
+++ b/ark/ops/ops_placeholder.hpp
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_OPS_PLACEHOLDER_HPP_
+#define ARK_OPS_PLACEHOLDER_HPP_
+
+#include "ark/model.hpp"
+#include "model/model_op.hpp"
+
+namespace ark {
+
+class ModelOpPlaceholder : public ModelOp {
+   public:
+    ModelOpPlaceholder() = default;
+    ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
+                       ModelDataType data_type, const Dims &strides,
+                       const Dims &offsets, const Dims &padded_shape,
+                       void *external_data = nullptr);
+};
+
+}  // namespace ark
+
+#endif  // ARK_OPS_PLACEHOLDER_HPP_
\ No newline at end of file
diff --git a/ark/ops/ops_placeholder_test.cpp b/ark/ops/ops_placeholder_test.cpp
new file mode 100644
index 000000000..59f5e2dc0
--- /dev/null
+++ b/ark/ops/ops_placeholder_test.cpp
@@ -0,0 +1,54 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/executor.hpp"
+#include "gpu/gpu.hpp"
+#include "logging.hpp"
+#include "model/model_op.hpp"
+#include "ops_test_common.hpp"
+
+ark::unittest::State test_ops_placeholder_value_contiguous() {
+    ark::Model model;
+    ark::Dims shape{10, 1};
+
+    // Allocate GPU memory for the external buffer
+    float *d_ext_buffer = nullptr;
+    ark::gpuMalloc(&d_ext_buffer, shape.nelems() * sizeof(float));
+
+    // Initialize GPU Memory
+    std::vector<float> h_ext_buffer(shape.nelems());
+    std::iota(h_ext_buffer.begin(), h_ext_buffer.end(), 1.0f);
+    ark::gpuMemcpy(d_ext_buffer, h_ext_buffer.data(),
+                   shape.nelems() * sizeof(float), ark::gpuMemcpyHostToDevice);
+
+    // Associate the initialized device buffer with a tensor produced from a
+    // placeholder operation
+    ark::Tensor tns =
+        model.placeholder(shape, ark::FP32, {}, {}, {}, -1, "", d_ext_buffer);
+
+    ark::Tensor res = model.add(tns, 1.0);
+
+    ark::DefaultExecutor exe(model);
+
+    exe.launch();
+    exe.run(1);
+    exe.stop();
+
+    // Copy tensor data from GPU to CPU
+    std::vector<float> h_res(shape.nelems(), 0.0f);
+    exe.tensor_read(res, h_res);
+
+    for (auto i = 0; i < shape.nelems(); ++i) {
+        UNITTEST_EQ(h_res[i], i + 2);
+    }
+
+    cudaFree(d_ext_buffer);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    ark::init();
+    UNITTEST(test_ops_placeholder_value_contiguous);
+    return ark::unittest::SUCCESS;
+}
\ No newline at end of file

From 192a3d34c24065cfceb2b0fbdd36d57a30f68867 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 15 Aug 2024 09:47:13 +0000
Subject: [PATCH 77/79] minor updates

---
 ark/api/executor.cpp             | 61 +++++++++++++++-----------------
 ark/include/ark/executor.hpp     | 13 +++----
 ark/include/ark/tensor.hpp       |  2 +-
 ark/ops/ops_placeholder_test.cpp | 13 ++++---
 python/executor_py.cpp           | 19 +++++++---
 5 files changed, 60 insertions(+), 48 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 7823c324c..47a7a7519 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -144,10 +144,7 @@ static size_t tensor_stride_bytes(const Json &tensor) {
 
 class Executor::Impl {
    public:
-    Impl()
-        : plan_json_(),
-          device_id_(-1),
-          buffer_manager_(ModelBufferManager::get_instance()) {};
+    Impl() : plan_json_(), device_id_(-1){};
     ~Impl();
 
     int device_id() const { return device_id_; }
@@ -164,10 +161,12 @@ class Executor::Impl {
 
     void compile(const std::string &plan, int device_id,
                  const std::string &name);
-    void launch(Stream stream, bool loop_mode,
-                const std::unordered_map<Tensor, void *> &external_tensors);
-    void run(int iter,
-             const std::unordered_map<Tensor, void *> &external_tensors);
+    void launch(
+        Stream stream, bool loop_mode,
+        const std::unordered_map<const Tensor, const void *> &placeholder_data);
+    void run(
+        int iter,
+        const std::unordered_map<const Tensor, const void *> &placeholder_data);
     void wait(int64_t max_spin_count);
     float stop(int64_t max_spin_count);
     void barrier();
@@ -209,7 +208,6 @@ class Executor::Impl {
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
 
-    ModelBufferManager &buffer_manager_;
     std::vector<void *> external_buffers_;
     std::vector<std::string> external_args_;
     std::map<size_t, std::string> buffer_id_to_name_;
@@ -410,6 +408,8 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
     std::map<int, std::map<int, size_t>> remote_rank_to_send_tag_to_buffer_id;
     std::map<int, std::map<int, size_t>> remote_rank_to_recv_tag_to_buffer_id;
 
+    auto &buffer_manager = ModelBufferManager::get_instance();
+
     // TODO: improve memory planning
     size_t offset = 0;
     for (auto &kv : buffer_id_to_info) {
@@ -428,12 +428,12 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             }
             continue;
         }
-        if (buffer_manager_.is_external(buf_id)) {
+        if (buffer_manager.is_external(buf_id)) {
             if (buf_info->buffer->device_id() != device_id_) {
                 ERR(InvalidUsageError,
                     "PyTorch tensor and model execution are on different GPUs");
             }
-            external_buffers_.push_back(buffer_manager_.get_buffer(buf_id));
+            external_buffers_.push_back(buffer_manager.get_buffer(buf_id));
             const auto [it, inserted] = buffer_id_to_name_.try_emplace(
                 buf_id, "extern_buf_" + std::to_string(buf_id));
             external_args_.push_back(it->second);
@@ -540,7 +540,7 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         for (int i = 0; i < len; ++i) {
             const size_t buf_id =
                 buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]->buffer->id();
-            if (!buffer_manager_.is_external(buf_id)) {
+            if (!buffer_manager.is_external(buf_id)) {
                 buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] =
                     offsets[i];
             }
@@ -561,7 +561,7 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         for (int i = 0; i < len; ++i) {
             const size_t buf_id =
                 buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]->buffer->id();
-            if (!buffer_manager_.is_external(buf_id)) {
+            if (!buffer_manager.is_external(buf_id)) {
                 buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] =
                     offsets[i];
             }
@@ -709,7 +709,7 @@ void Executor::Impl::compile(const std::string &plan, int device_id,
 
 void Executor::Impl::launch(
     Stream stream, bool loop_mode,
-    const std::unordered_map<Tensor, void *> &external_tensors) {
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
     if ((kernel_ == nullptr) || !kernel_->is_compiled()) {
         ERR(InvalidUsageError, "Need to compile first before launch.");
     }
@@ -803,7 +803,8 @@ void Executor::Impl::launch(
 }
 
 void Executor::Impl::run(
-    int iter, const std::unordered_map<Tensor, void *> &external_tensors) {
+    int iter,
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
     if (iter <= 0) return;
     if (loop_mode_) {
         while (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
@@ -883,6 +884,10 @@ void Executor::Impl::barrier() {
 
 void *Executor::Impl::tensor_address(const Tensor &tensor) const {
     size_t buffer_id = tensor.ref()->buffer()->id();
+    auto &buffer_manager = ModelBufferManager::get_instance();
+    if (buffer_manager.is_external(buffer_id)) {
+        return buffer_manager.get_buffer(buffer_id);
+    }
     if (buffer_id_to_addr_.find(buffer_id) == buffer_id_to_addr_.end()) {
         ERR(InvalidUsageError, "Tensor has an unknown buffer ID ", buffer_id,
             ". This is likely caused by accessing a tensor that is optimized "
@@ -895,11 +900,6 @@ void *Executor::Impl::tensor_address(const Tensor &tensor) const {
 void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
                                  Stream stream, bool is_d2d) const {
     GLOG(gpuSetDevice(device_id_));
-    if (buffer_manager_.is_external(tensor.ref()->buffer()->id())) {
-        ERR(InvalidUsageError,
-            "Reading data from a tensor preallocated by PyTorch is not "
-            "supported. Use PyTorch's native methods.");
-    }
     std::shared_ptr<GpuStream> copy_stream;
     gpuStream copy_stream_raw;
     if (stream) {
@@ -951,11 +951,6 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data,
                                   size_t bytes, Stream stream,
                                   bool is_d2d) const {
     GLOG(gpuSetDevice(device_id_));
-    if (buffer_manager_.is_external(tensor.ref()->buffer()->id())) {
-        ERR(InvalidUsageError,
-            "Writing data to a tensor preallocated by PyTorch is not "
-            "supported. Use PyTorch's native methods.");
-    }
     std::shared_ptr<GpuStream> copy_stream;
     gpuStream copy_stream_raw;
     if (stream) {
@@ -1028,13 +1023,14 @@ void Executor::compile(const std::string &plan, int device_id,
 
 void Executor::launch(
     Stream stream, bool loop_mode,
-    const std::unordered_map<Tensor, void *> &external_tensors) {
-    impl_->launch(stream, loop_mode, external_tensors);
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
+    impl_->launch(stream, loop_mode, placeholder_data);
 }
 
-void Executor::run(int iter,
-                   const std::unordered_map<Tensor, void *> &external_tensors) {
-    impl_->run(iter, external_tensors);
+void Executor::run(
+    int iter,
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
+    impl_->run(iter, placeholder_data);
 }
 
 void Executor::wait(int64_t max_spin_count) { impl_->wait(max_spin_count); }
@@ -1082,9 +1078,10 @@ DefaultExecutor::DefaultExecutor(
     impl_->loop_mode_ = loop_mode;
 }
 
-void DefaultExecutor::launch() {
+void DefaultExecutor::launch(
+    const std::unordered_map<const Tensor, const void *> &placeholder_data) {
     Executor::launch(reinterpret_cast<Stream>(impl_->stream_raw_),
-                     impl_->loop_mode_);
+                     impl_->loop_mode_, placeholder_data);
 }
 
 }  // namespace ark
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 8e5e5c852..e71e087db 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -46,13 +46,13 @@ class Executor {
                  const std::string &name = "executor");
 
     /// Launch the executor. This must be called after `compile()`.
-    void launch(
-        Stream stream = nullptr, bool loop_mode = true,
-        const std::unordered_map<Tensor, void *> &external_tensors = {});
+    void launch(Stream stream = nullptr, bool loop_mode = true,
+                const std::unordered_map<const Tensor, const void *>
+                    &placeholder_data = {});
 
     /// Run the executor for `iter` iterations.
-    void run(int iter,
-             const std::unordered_map<Tensor, void *> &external_tensors = {});
+    void run(int iter, const std::unordered_map<const Tensor, const void *>
+                           &placeholder_data = {});
 
     /// Wait for the previous run to finish.
     void wait(int64_t max_spin_count = -1);
@@ -110,7 +110,8 @@ class DefaultExecutor : public Executor {
                     bool loop_mode = true);
 
     /// Launch the default executor.
-    void launch();
+    void launch(const std::unordered_map<const Tensor, const void *>
+                    &placeholder_data = {});
 };
 
 }  // namespace ark
diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp
index 816738c07..72ff9ff57 100644
--- a/ark/include/ark/tensor.hpp
+++ b/ark/include/ark/tensor.hpp
@@ -66,7 +66,7 @@ std::ostream &operator<<(std::ostream &os, const Tensor &tensor);
 
 namespace std {
 template <>
-struct hash<ark::Tensor> {
+struct hash<const ark::Tensor> {
     size_t operator()(const ark::Tensor &t) const {
         return hash<size_t>()(t.id());
     }
diff --git a/ark/ops/ops_placeholder_test.cpp b/ark/ops/ops_placeholder_test.cpp
index 59f5e2dc0..903d87593 100644
--- a/ark/ops/ops_placeholder_test.cpp
+++ b/ark/ops/ops_placeholder_test.cpp
@@ -13,13 +13,16 @@ ark::unittest::State test_ops_placeholder_value_contiguous() {
 
     // Allocate GPU memory for the external buffer
     float *d_ext_buffer = nullptr;
-    ark::gpuMalloc(&d_ext_buffer, shape.nelems() * sizeof(float));
+    UNITTEST_EQ(ark::gpuMalloc(&d_ext_buffer, shape.nelems() * sizeof(float)),
+                ark::gpuSuccess);
 
     // Initialize GPU Memory
     std::vector<float> h_ext_buffer(shape.nelems());
     std::iota(h_ext_buffer.begin(), h_ext_buffer.end(), 1.0f);
-    ark::gpuMemcpy(d_ext_buffer, h_ext_buffer.data(),
-                   shape.nelems() * sizeof(float), ark::gpuMemcpyHostToDevice);
+    UNITTEST_EQ(ark::gpuMemcpy(d_ext_buffer, h_ext_buffer.data(),
+                               shape.nelems() * sizeof(float),
+                               ark::gpuMemcpyHostToDevice),
+                ark::gpuSuccess);
 
     // Associate the initialized device buffer with a tensor produced from a
     // placeholder operation
@@ -34,6 +37,8 @@ ark::unittest::State test_ops_placeholder_value_contiguous() {
     exe.run(1);
     exe.stop();
 
+    UNITTEST_EQ(exe.tensor_address(tns), d_ext_buffer);
+
     // Copy tensor data from GPU to CPU
     std::vector<float> h_res(shape.nelems(), 0.0f);
     exe.tensor_read(res, h_res);
@@ -42,7 +47,7 @@ ark::unittest::State test_ops_placeholder_value_contiguous() {
         UNITTEST_EQ(h_res[i], i + 2);
     }
 
-    cudaFree(d_ext_buffer);
+    UNITTEST_EQ(ark::gpuFree(d_ext_buffer), ark::gpuSuccess);
 
     return ark::unittest::SUCCESS;
 }
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 5b4e7959f..dd53af51a 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -171,11 +171,20 @@ void register_executor(py::module &m) {
         .def("name", &ark::Executor::name)
         .def("compile", &ark::Executor::compile, py::arg("device_id"),
              py::arg("plan"), py::arg("name") = "executor")
-        .def("launch", [](ark::Executor *self, uintptr_t stream, bool loop_mode) {
-                 self->launch(reinterpret_cast<ark::Stream>(stream), loop_mode);
-             },
-             py::arg("stream") = 0, py::arg("loop_mode") = true)
-        .def("run", &ark::Executor::run, py::arg("iter"))
+        .def(
+            "launch",
+            [](ark::Executor *self, uintptr_t stream, bool loop_mode,
+               const std::unordered_map<const ark::Tensor, const void *>
+                   &placeholder_data) {
+                self->launch(reinterpret_cast<ark::Stream>(stream), loop_mode,
+                             placeholder_data);
+            },
+            py::arg("stream") = 0, py::arg("loop_mode") = true,
+            py::arg("placeholder_data") =
+                std::unordered_map<const ark::Tensor, const void *>())
+        .def("run", &ark::Executor::run, py::arg("iter"),
+             py::arg("placeholder_data") =
+                 std::unordered_map<const ark::Tensor, const void *>())
         .def("wait", &ark::Executor::wait, py::arg("max_spin_count") = -1)
         .def("stop", &ark::Executor::stop, py::arg("max_spin_count") = -1)
         .def("barrier", &ark::Executor::barrier)

From 977ce9e848cc514eacc0b0314d096e3e8a8259e7 Mon Sep 17 00:00:00 2001
From: noli <nolawigerawork@gmail.com>
Date: Thu, 15 Aug 2024 11:04:45 +0000
Subject: [PATCH 78/79] wip adds python binding and delayed buffer binding

---
 ark/api/executor.cpp               | 75 +++++++++++++------------
 ark/api/tensor.cpp                 | 12 ----
 ark/include/ark/executor.hpp       | 10 ++--
 ark/include/ark/tensor.hpp         |  6 +-
 ark/model/model_buffer.cpp         | 50 +----------------
 ark/model/model_buffer.hpp         | 14 -----
 ark/model_buffer_manager.hpp       | 31 +++++++++--
 ark/ops/ops_placeholder_test.cpp   |  4 +-
 python/ark/module.py               | 12 ++--
 python/ark/ops.py                  | 61 ++++++++++++++++++++-
 python/ark/runtime.py              | 19 ++++++-
 python/ark/tensor.py               | 52 +++++++-----------
 python/executor_py.cpp             | 22 ++++++--
 python/model_py.cpp                | 79 +++++++++++++++++++++++++++
 python/tensor_py.cpp               | 71 ------------------------
 python/unittest/test_conversion.py | 88 +++++++++++++++++++++++++-----
 16 files changed, 351 insertions(+), 255 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 7823c324c..d5be65a97 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -163,11 +163,10 @@ class Executor::Impl {
     const std::string &name() const { return name_; }
 
     void compile(const std::string &plan, int device_id,
-                 const std::string &name);
-    void launch(Stream stream, bool loop_mode,
-                const std::unordered_map<Tensor, void *> &external_tensors);
-    void run(int iter,
-             const std::unordered_map<Tensor, void *> &external_tensors);
+                 const std::string &name,
+                 const std::unordered_map<Tensor, void *> &external_tensors);
+    void launch(Stream stream, bool loop_mode);
+    void run(int iter);
     void wait(int64_t max_spin_count);
     float stop(int64_t max_spin_count);
     void barrier();
@@ -330,8 +329,8 @@ std::map<size_t, void *> Executor::Impl::init_buffer_addrs(
     if (!buffer_id_to_addr_.empty()) {
         buffer_id_to_addr = buffer_id_to_addr_;
     }
-    for (const auto &kv : buffer_id_to_offset) {
-        buffer_id_to_addr[kv.first] = buffer->ref(kv.second);
+    for (const auto &[id, offset] : buffer_id_to_offset) {
+        buffer_id_to_addr[id] = buffer->ref(offset);
     }
     return buffer_id_to_addr;
 }
@@ -428,15 +427,13 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             }
             continue;
         }
-        if (buffer_manager_.is_external(buf_id)) {
-            if (buf_info->buffer->device_id() != device_id_) {
-                ERR(InvalidUsageError,
-                    "PyTorch tensor and model execution are on different GPUs");
-            }
-            external_buffers_.push_back(buffer_manager_.get_buffer(buf_id));
-            const auto [it, inserted] = buffer_id_to_name_.try_emplace(
-                buf_id, "extern_buf_" + std::to_string(buf_id));
-            external_args_.push_back(it->second);
+        if (buffer_manager_.is_external(buf_id) &&
+            !buffer_manager_.is_staged(buf_id)) {
+            external_buffers_.push_back(
+                buffer_manager_.get_buffer_addr(buf_id));
+            const std::string name = "extern_buf_" + std::to_string(buf_id);
+            external_args_.push_back(name);
+            buffer_id_to_name_[buf_id] = name;
             continue;
         }
         // if we are adding a plan and come across a buffer from a previous
@@ -692,8 +689,9 @@ void Executor::Impl::init_channels(const std::set<int> &remote_ranks) {
     }
 }
 
-void Executor::Impl::compile(const std::string &plan, int device_id,
-                             const std::string &name) {
+void Executor::Impl::compile(
+    const std::string &plan, int device_id, const std::string &name,
+    const std::unordered_map<Tensor, void *> &external_tensors) {
     if (is_launched_) {
         ERR(InvalidUsageError, "Need to stop before re-compiling.");
         return;
@@ -704,12 +702,26 @@ void Executor::Impl::compile(const std::string &plan, int device_id,
     } catch (const ::nlohmann::json::parse_error &e) {
         ERR(InvalidUsageError, "Failed to parse the plan JSON: ", e.what());
     }
+    for (auto &[tns, addr] : external_tensors) {
+        const size_t buf_id = tns.ref()->buffer()->id();
+        if (buffer_manager_.is_staged(buf_id)) {
+            buffer_manager_.set_buffer_address(buf_id, addr);
+            external_buffers_.push_back(addr);
+            const std::string name = "extern_buf_" + std::to_string(buf_id);
+            external_args_.push_back(name);
+            buffer_id_to_name_[buf_id] = name;
+        } else {
+            ERR(InvalidUsageError,
+                "Cannot set the buffer address for tensor with buffer:", buf_id,
+                " the address is already bound. "
+                "Address setting is only allowed for delayed binding of "
+                "uninitialized buffers.");
+        }
+    }
     kernel_->compile();
 }
 
-void Executor::Impl::launch(
-    Stream stream, bool loop_mode,
-    const std::unordered_map<Tensor, void *> &external_tensors) {
+void Executor::Impl::launch(Stream stream, bool loop_mode) {
     if ((kernel_ == nullptr) || !kernel_->is_compiled()) {
         ERR(InvalidUsageError, "Need to compile first before launch.");
     }
@@ -802,8 +814,7 @@ void Executor::Impl::launch(
     is_launched_ = true;
 }
 
-void Executor::Impl::run(
-    int iter, const std::unordered_map<Tensor, void *> &external_tensors) {
+void Executor::Impl::run(int iter) {
     if (iter <= 0) return;
     if (loop_mode_) {
         while (atomicLoadRelaxed(flag_->ref<int>()) > 0) {
@@ -1021,22 +1032,18 @@ std::string Executor::plan() const { return impl_->plan(); }
 
 const std::string &Executor::name() const { return impl_->name(); }
 
-void Executor::compile(const std::string &plan, int device_id,
-                       const std::string &name) {
-    impl_->compile(plan, device_id, name);
-}
-
-void Executor::launch(
-    Stream stream, bool loop_mode,
+void Executor::compile(
+    const std::string &plan, int device_id, const std::string &name,
     const std::unordered_map<Tensor, void *> &external_tensors) {
-    impl_->launch(stream, loop_mode, external_tensors);
+    impl_->compile(plan, device_id, name, external_tensors);
 }
 
-void Executor::run(int iter,
-                   const std::unordered_map<Tensor, void *> &external_tensors) {
-    impl_->run(iter, external_tensors);
+void Executor::launch(Stream stream, bool loop_mode) {
+    impl_->launch(stream, loop_mode);
 }
 
+void Executor::run(int iter) { impl_->run(iter); }
+
 void Executor::wait(int64_t max_spin_count) { impl_->wait(max_spin_count); }
 
 float Executor::stop(int64_t max_spin_count) {
diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp
index 084ce6383..fc44b4a58 100644
--- a/ark/api/tensor.cpp
+++ b/ark/api/tensor.cpp
@@ -9,18 +9,6 @@
 
 namespace ark {
 
-Tensor::Tensor(void* data_ptr, int32_t device_id,
-               const std::vector<int64_t>& shape, const DataType& dtype) {
-    size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1,
-                                                std::multiplies<int64_t>()) *
-                                dtype.bytes();
-    auto buffer =
-        std::make_shared<ModelBuffer>(data_ptr, external_data_size, device_id);
-    auto tensor = std::make_shared<ModelTensor>(
-        dtype.ref(), buffer, Dims(shape), Dims(shape), Dims(), Dims());
-    ref_ = tensor;
-}
-
 size_t Tensor::id() const {
     if (ref_) {
         return ref_->id();
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 8e5e5c852..6b4235ae8 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -43,16 +43,14 @@ class Executor {
 
     /// Compile the model. This must be called before `launch()`.
     void compile(const std::string &plan, int device_id,
-                 const std::string &name = "executor");
+                 const std::string &name = "executor",
+                 const std::unordered_map<Tensor, void *> &external_tensors = {});
 
     /// Launch the executor. This must be called after `compile()`.
-    void launch(
-        Stream stream = nullptr, bool loop_mode = true,
-        const std::unordered_map<Tensor, void *> &external_tensors = {});
+    void launch(Stream stream = nullptr, bool loop_mode = true);
 
     /// Run the executor for `iter` iterations.
-    void run(int iter,
-             const std::unordered_map<Tensor, void *> &external_tensors = {});
+    void run(int iter);
 
     /// Wait for the previous run to finish.
     void wait(int64_t max_spin_count = -1);
diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp
index 816738c07..05dbb11fe 100644
--- a/ark/include/ark/tensor.hpp
+++ b/ark/include/ark/tensor.hpp
@@ -31,8 +31,6 @@ class Tensor {
     Tensor(ModelTensorRef ref) : ref_(ref) {}
     Tensor(const Tensor &other) = default;
     Tensor &operator=(const Tensor &other) = default;
-    Tensor(void *data_ptr, int32_t device_id, const std::vector<int64_t> &shape,
-           const DataType &dtype);
 
     bool operator==(const Tensor &other) const { return ref_ == other.ref_; }
     bool operator!=(const Tensor &other) const { return ref_ != other.ref_; }
@@ -67,8 +65,8 @@ std::ostream &operator<<(std::ostream &os, const Tensor &tensor);
 namespace std {
 template <>
 struct hash<ark::Tensor> {
-    size_t operator()(const ark::Tensor &t) const {
-        return hash<size_t>()(t.id());
+    size_t operator()(const ark::Tensor &t) const noexcept {
+        return t.id();
     }
 };
 }  // namespace std
diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp
index 5ce255ce5..9f494b7a4 100644
--- a/ark/model/model_buffer.cpp
+++ b/ark/model/model_buffer.cpp
@@ -24,23 +24,6 @@ ModelBuffer::ModelBuffer(size_t id, int rank,
     }
 }
 
-ModelBuffer::ModelBuffer(void *data, size_t size, int32_t device_id)
-    : rank_(-1),
-      external_data_(data),
-      external_data_size_(size),
-      device_id_(device_id),
-      is_external_(true) {
-    id_ = curr_id++;
-}
-
-ModelBuffer::ModelBuffer(size_t id, void *data, size_t size, int32_t device_id)
-    : id_(id),
-      rank_(-1),
-      external_data_(data),
-      external_data_size_(size),
-      device_id_(device_id),
-      is_external_(true) {}
-
 void ModelBuffer::tag_send(int remote_rank, int tag) {
     send_tags_.insert(TagInfo{remote_rank, tag});
 }
@@ -63,14 +46,6 @@ Json ModelBuffer::serialize() const {
     }
     j["SendTags"] = send_tags;
     j["RecvTags"] = recv_tags;
-    j["IsExternal"] = is_external_;
-    if (is_external_) {
-        ModelBufferManager::get_instance().register_buffer(id_, external_data_,
-                                                           external_data_size_);
-        j["ExternalDataSize"] = external_data_size_;
-        j["DeviceId"] = device_id_;
-    }
-    // external_data_ptr_ is not included in JSON
     return j;
 }
 
@@ -82,30 +57,7 @@ std::shared_ptr<ModelBuffer> ModelBuffer::deserialize(const Json &serialized) {
     } else if (!serialized.contains("SendTags")) {
         ERR(ModelError, "ModelBuffer deserialization failed: missing SendTags");
     } else if (!serialized.contains("RecvTags")) {
-        ERR(ModelError,
-            "ModelBuffer deserialization failed: missing RecvTags");
-    } else if (!serialized.contains("IsExternal")) {
-        ERR(ModelError,
-            "ModelBuffer deserialization failed: missing IsExternal");
-    }
-    if (serialized["IsExternal"]) {
-        if (!serialized.contains("ExternalDataSize")) {
-            ERR(ModelError,
-                "ModelBuffer deserialization failed: missing ExternalDataSize");
-        } else if (!serialized.contains("DeviceId")) {
-            ERR(ModelError,
-                "ModelBuffer deserialization failed: missing DeviceId");
-        }
-        void *data_ptr =
-            ModelBufferManager::get_instance().get_buffer(serialized["Id"]);
-        if (!data_ptr) {
-            ERR(ModelError,
-                "ModelBuffer deserialization failed: external buffer not found "
-                "in BufferManager");
-        }
-        return std::make_shared<ModelBuffer>(serialized["Id"], data_ptr,
-                                             serialized["ExternalDataSize"],
-                                             serialized["DeviceId"]);
+        ERR(ModelError, "ModelBuffer deserialization failed: missing RecvTags");
     }
     return std::make_shared<ModelBuffer>(serialized["Id"], serialized["Rank"],
                                          serialized["SendTags"],
diff --git a/ark/model/model_buffer.hpp b/ark/model/model_buffer.hpp
index e7f1045b2..342b08bb7 100644
--- a/ark/model/model_buffer.hpp
+++ b/ark/model/model_buffer.hpp
@@ -22,10 +22,6 @@ class ModelBuffer {
     ModelBuffer(size_t id, int rank, const std::vector<TagInfo> &send_tags,
                 const std::vector<TagInfo> &recv_tags);
 
-    // externally managed buffer
-    ModelBuffer(void *data, size_t size, int32_t device_id);
-    ModelBuffer(size_t id, void *data, size_t size, int32_t device_id);
-
     size_t id() const { return id_; }
 
     int rank() const { return rank_; }
@@ -48,22 +44,12 @@ class ModelBuffer {
 
     static std::shared_ptr<ModelBuffer> deserialize(const Json &serialized);
 
-    // external buffer management
-    size_t external_data_size() const { return external_data_size_; }
-    void *external_data() const { return external_data_; }
-    int32_t device_id() const { return device_id_; }
-    bool is_external() const { return is_external_; }
-
    private:
     static size_t curr_id;
     size_t id_;
     int rank_;
     std::set<TagInfo> send_tags_;
     std::set<TagInfo> recv_tags_;
-    void *external_data_ = nullptr;
-    size_t external_data_size_ = 0;
-    int32_t device_id_;
-    bool is_external_ = false;
 };
 
 }  // namespace ark
diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp
index 3e82b05f5..ab8d8df9c 100644
--- a/ark/model_buffer_manager.hpp
+++ b/ark/model_buffer_manager.hpp
@@ -7,6 +7,8 @@
 #include <tuple>
 #include <unordered_map>
 
+#include "logging.hpp"
+
 namespace ark {
 // Manages externally allocated buffers (buffers corresponding to Tensors that
 // are the output of a `placeholder` operation) outside of ARK's memory space.
@@ -17,19 +19,35 @@ class ModelBufferManager {
         return instance;
     }
 
-    void register_buffer(const size_t id, void* const data, const size_t size) {
+    void register_buffer(size_t id, void* const data, size_t size) {
         buffers_[id] = std::make_tuple(data, size);
     }
 
-    void* get_buffer(const size_t id) const {
+    void* get_buffer_addr(size_t id) const {
         auto it = buffers_.find(id);
         if (it != buffers_.end()) {
             return std::get<0>(it->second);
         }
+        ERR(InvalidUsageError, "Tensor with buffer ID: ", id,
+            " , is not registered in the ModelBufferManager. Be sure to "
+            "register the tensor as an external tensor first (pass the tensor "
+            "into a placeholder operation).");
         return nullptr;
     }
 
-    size_t get_buffer_size(const size_t id) const {
+    void set_buffer_address(size_t id, void* const new_address) {
+        void* curr_addr = get_buffer_addr(id);
+        if (curr_addr != nullptr) {
+            ERR(InvalidUsageError,
+                "Cannot set the buffer address for tensor with buffer: ", id,
+                " , the address is already bound. "
+                "Address setting is only allowed for delayed binding of "
+                "uninitialized buffers.");
+        }
+        std::get<0>(buffers_[id]) = new_address;
+    }
+
+    size_t get_buffer_size(size_t id) const {
         auto it = buffers_.find(id);
         if (it != buffers_.end()) {
             return std::get<1>(it->second);
@@ -37,10 +55,15 @@ class ModelBufferManager {
         return 0;
     }
 
-    bool is_external(const size_t id) const {
+    bool is_external(size_t id) const {
         return buffers_.find(id) != buffers_.end();
     }
 
+    bool is_staged(size_t id) const {
+        const void* curr_addr = get_buffer_addr(id);
+        return curr_addr == nullptr;
+    }
+
     const std::unordered_map<size_t, std::tuple<void*, size_t>>& get_buffers()
         const {
         return buffers_;
diff --git a/ark/ops/ops_placeholder_test.cpp b/ark/ops/ops_placeholder_test.cpp
index 59f5e2dc0..7610ee61d 100644
--- a/ark/ops/ops_placeholder_test.cpp
+++ b/ark/ops/ops_placeholder_test.cpp
@@ -7,7 +7,7 @@
 #include "model/model_op.hpp"
 #include "ops_test_common.hpp"
 
-ark::unittest::State test_ops_placeholder_value_contiguous() {
+ark::unittest::State test_ops_placeholder() {
     ark::Model model;
     ark::Dims shape{10, 1};
 
@@ -42,7 +42,7 @@ ark::unittest::State test_ops_placeholder_value_contiguous() {
         UNITTEST_EQ(h_res[i], i + 2);
     }
 
-    cudaFree(d_ext_buffer);
+    ark::gpuFree(d_ext_buffer);
 
     return ark::unittest::SUCCESS;
 }
diff --git a/python/ark/module.py b/python/ark/module.py
index 49d2ddf00..4809ea432 100644
--- a/python/ark/module.py
+++ b/python/ark/module.py
@@ -4,13 +4,13 @@
 import logging
 import numpy as np
 from typing import Any, Dict, Union
-from .tensor import Tensor, Parameter
+from .tensor import Parameter
 from .runtime import Runtime
-from .init import init
 from .model import Model
 
 try:
     import torch
+    from .ops import placeholder
 
     _no_torch = False
 except ImportError:
@@ -43,7 +43,7 @@ def __setattr__(self, __name: str, __value: Any) -> None:
         elif isinstance(__value, Parameter):
             self.register_parameter(__name, __value)
         elif not _no_torch and isinstance(__value, torch.nn.Parameter):
-            __value = Parameter(__value)
+            __value = Parameter(placeholder(torch_tensor=__value), True)
             self.register_parameter(__name, __value)
         super().__setattr__(__name, __value)
 
@@ -151,14 +151,14 @@ def forward(ctx, ark_module, *args, **kwargs):
         input_requires_grad = 0
         for arg in args:
             if isinstance(arg, torch.Tensor):
-                input_args.append(Tensor.from_torch(arg))
+                input_args.append(placeholder(torch_tensor=arg))
                 if arg.requires_grad:
                     input_requires_grad += 1
             else:
                 input_args.append(arg)
         for k, v in kwargs.items():
             if isinstance(v, torch.Tensor):
-                input_kwargs[k] = Tensor.from_torch(v)
+                input_kwargs[k] = placeholder(torch_tensor=v)
                 if v.requires_grad:
                     input_requires_grad += 1
             else:
@@ -180,7 +180,7 @@ def backward(ctx, *grad_outputs):
         PyTorch parameters.
         """
         Model.reset()
-        ark_grad_outputs = [Tensor.from_torch(grad) for grad in grad_outputs]
+        ark_grad_outputs = [placeholder(torch_tensor=grad) for grad in grad_outputs]
         grads = ctx.ark_module.backward(*ark_grad_outputs)
         grad_inputs, grad_weights = (
             grads[: ctx.num_inp_grad],
diff --git a/python/ark/ops.py b/python/ark/ops.py
index f8b75a70b..be145eb11 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -1,12 +1,21 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from typing import List, Iterable, Union
+from typing import List, Iterable, Union, Optional
 
 from .tensor import Dims, Tensor, Parameter, NullTensor
 from .data_type import DataType, fp32
 from .model import Model
 
+try:
+    import torch
+
+    _no_torch = False
+except ImportError:
+    from . import torch_mock as torch
+
+    _no_torch = True
+
 
 def _is_list_or_tuple(obj):
     return isinstance(obj, list) or isinstance(obj, tuple)
@@ -48,6 +57,55 @@ def _tensor(
     )
 
 
+def placeholder(
+    shape: Optional[Iterable[int]] = None,
+    dtype: Optional[DataType] = None,
+    torch_tensor: Optional[torch.Tensor] = None,
+    strides: Iterable[int] = [],
+    offsets: Iterable[int] = [],
+    padded_shape: Iterable[int] = [],
+    rank: int = -1,
+    name: str = "",
+) -> Tensor:
+    if torch_tensor is not None:
+        if any(
+            (arg is not None and arg != [])
+            for arg in [shape, dtype, strides, offsets, padded_shape]
+        ):
+            raise ValueError(
+                "shape, dtype, strides, offsets, and padded_shape should not "
+                "be provided as they are inferred from the torch tensor."
+            )
+        dl_tensor = torch.utils.dlpack.to_dlpack(torch_tensor)
+        return Tensor(Model.get_model().placeholder(
+            external_tensor=dl_tensor,
+            rank=rank,
+            name=name,
+        ))
+    if not _is_list_or_tuple(shape):
+        raise ValueError("shape should be a list or tuple of integers")
+    if not _is_list_or_tuple(strides):
+        raise ValueError("strides should be a list or tuple of integers")
+    if not _is_list_or_tuple(offsets):
+        raise ValueError("offsets should be a list or tuple of integers")
+    if not _is_list_or_tuple(padded_shape):
+        raise ValueError("padded_shape should be a list or tuple of integers")
+    # only support tensors with up to 4 dimensions
+    if any(len(arg) > 4 for arg in (shape, strides, offsets, padded_shape)):
+        raise ValueError("Only support tensors with up to 4 dimensions")
+    print(shape)
+    return Tensor(Model.get_model().placeholder(
+        Dims(shape),
+        dtype.ctype(),
+        Dims(strides),
+        Dims(offsets),
+        Dims(padded_shape),
+        rank,
+        name,
+        None,
+    ))
+
+
 def add(
     input: Union[Tensor, float],
     other: Union[Tensor, float],
@@ -630,6 +688,7 @@ def all_reduce(
 
 __all__ = [
     "tensor",
+    "placeholder",
     "parameter",
     "reshape",
     "identity",
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 1490cdeb8..2cbed8612 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -6,6 +6,15 @@
 
 from _ark_core import _Executor
 from .planner import Planner, Plan
+from typing import Dict
+try:
+    import torch
+
+    _no_torch = False
+except ImportError:
+    from . import torch_mock as torch
+
+    _no_torch = True
 
 
 class _RuntimeState:
@@ -73,6 +82,7 @@ def launch(
         device_id: int = 0,
         stream: int = 0,
         loop_mode: bool = True,
+        tensor_mappings: Dict = {}
     ):
         """
         Create an executor and schedule the ARK model. The scheduler will generate
@@ -87,6 +97,12 @@ def launch(
         if self.launched():
             # Stop the current running model
             self.stop()
+        
+        for ark_tensor in tensor_mappings:
+            torch_tensor = tensor_mappings[ark_tensor]
+            if not isinstance(torch_tensor, torch.Tensor):
+                raise ValueError("Must bind PyTorch tensor")
+            tensor_mappings[ark_tensor] = torch_tensor.data_ptr()
 
         # Recompile if the previous launch was not compiled with the same info
         # or if this is the first launch
@@ -94,8 +110,7 @@ def launch(
             plan_str != self.executor.plan()
             or device_id != self.executor.device_id()
         ):
-            self.executor.compile(plan_str, device_id)
-
+            self.executor.compile(plan_str, device_id, tensor_mappings)
         self.executor.launch(stream, loop_mode)
         self.state = Runtime.State.LaunchedNotRunning
 
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 9211f7d9d..348962c4e 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -7,7 +7,6 @@
 from _ark_core import _Dims, _Tensor, _NullTensor
 from .data_type import DataType
 from .runtime import Runtime
-from .model import Model
 
 try:
     import torch
@@ -45,6 +44,15 @@ def __init__(
         self._tensor = _tensor
         self.initializer: Initializer = initializer
         self.requires_grad = requires_grad
+    
+    def __hash__(self):
+        return self._tensor.id()
+    
+    def __eq__(self, other):
+        if not isinstance(other, Tensor):
+            return False
+        return self._tensor.id() == other._tensor.id()
+
 
     def shape(self) -> List[int]:
         """
@@ -132,13 +140,6 @@ def to_dlpack(self):
             )
         return rt.executor.tensor_to_dlpack(self._tensor)
 
-    @staticmethod
-    def from_dlpack(ext_tensor) -> "Tensor":
-        """
-        Copies the tensor from a DLPack tensor to the device.
-        """
-        return Tensor(_Tensor(ext_tensor))
-
     def to_torch(self) -> torch.Tensor:
         """
         Returns a torch tensor that shares the same memory with the device tensor.
@@ -151,22 +152,6 @@ def to_torch(self) -> torch.Tensor:
         torch_view.__ark_buffer__ = dl_capsule
         return torch_view
 
-    @staticmethod
-    def from_torch(tensor: torch.Tensor) -> "Tensor":
-        """
-        Returns an ARK tensor that shares the same memory with the torch tensor.
-        """
-        if _no_torch:
-            raise ImportError("torch is not available")
-        elif not tensor.is_contiguous():
-            raise ValueError("Torch tensor must be contiguous.")
-        elif tensor.device.type == "cpu":
-            raise ValueError("Torch tensor must be on a device.")
-        ark_tensor = Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor))
-        # Share ownership of the memory with the torch tensor
-        ark_tensor.__torch_buffer__ = tensor
-        return ark_tensor
-
     def copy(
         self, data: Union[np.ndarray, torch.Tensor], stream: int = 0
     ) -> "Tensor":
@@ -216,33 +201,36 @@ def initialize(self) -> "Tensor":
         return self
 
 
-class Parameter(Tensor, torch.nn.Parameter):
+class Parameter(Tensor):
     """
     A tensor as a parameter.
     """
 
     def __init__(
         self,
-        tensor: Union[_Tensor, "torch.nn.Parameter"],
+        tensor: _Tensor,
+        from_torch: bool,
     ):
         """
         Initializes a new instance of the Parameter class.
+        Args:
+            _tensor (_ark_core._Tensor): The underlying _Tensor object.
+            from_torch: Indicates if the Parameter is tied to a torch.nn.Paramter
         """
-        if not _no_torch and isinstance(tensor, torch.nn.Parameter):
-            ark_tensor = Tensor.from_torch(tensor)
-            core_tensor = ark_tensor._tensor
+        if not _no_torch and from_torch:
+            _tensor = tensor._tensor
             self.torch_param = tensor
             self.staged_tensor = None
             Tensor.__init__(
                 self,
-                core_tensor,
+                _tensor,
                 requires_grad=tensor.requires_grad,
             )
         elif isinstance(tensor, _Tensor):
-            core_tensor = tensor
+            _tensor = tensor
             self.torch_param = None
             self.staged_tensor = None
-            Tensor.__init__(self, core_tensor, requires_grad=False)
+            Tensor.__init__(self, _tensor, requires_grad=False)
         else:
             raise TypeError(
                 "tensor must be an ARK tensor or a torch.nn.Parameter"
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 5b4e7959f..08fc94883 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -8,6 +8,7 @@
 
 #include <ark/executor.hpp>
 #include <ark/model.hpp>
+#include <unordered_map>
 
 #include "gpu/gpu_memory.hpp"
 #include "logging.hpp"
@@ -134,7 +135,8 @@ DLTensor SharedTensor::dl_tensor() const {
 
 }  // namespace ark
 
-static py::capsule tensor_to_dlpack(ark::Executor &self, const ark::Tensor &tensor) {
+static py::capsule tensor_to_dlpack(ark::Executor &self,
+                                    const ark::Tensor &tensor) {
     auto shared_tensor = new ark::SharedTensor(self, tensor);
     DLManagedTensor *dl_managed_tensor = new DLManagedTensor();
     dl_managed_tensor->dl_tensor = shared_tensor->dl_tensor();
@@ -146,8 +148,9 @@ static py::capsule tensor_to_dlpack(ark::Executor &self, const ark::Tensor &tens
         }
     };
     const char *capsule_name = "dltensor";
-    PyObject *dl_capsule = PyCapsule_New(static_cast<void *>(dl_managed_tensor),
-                                         capsule_name, [](PyObject *capsule) {
+    PyObject *dl_capsule = PyCapsule_New(
+        static_cast<void *>(dl_managed_tensor), capsule_name,
+        [](PyObject *capsule) {
             const char *name = PyCapsule_GetName(capsule);
             auto *dl_managed_tensor = static_cast<DLManagedTensor *>(
                 PyCapsule_GetPointer(capsule, name));
@@ -169,8 +172,17 @@ void register_executor(py::module &m) {
              })
         .def("plan", &ark::Executor::plan)
         .def("name", &ark::Executor::name)
-        .def("compile", &ark::Executor::compile, py::arg("device_id"),
-             py::arg("plan"), py::arg("name") = "executor")
+        .def("compile", 
+            [](ark::Executor *self, int device_id, std::string &plan, const std::string &name,
+               const std::unordered_map<ark::Tensor, uintptr_t> &external_tensors) {
+                std::unordered_map<ark::Tensor, void *> tensor_map;
+                for (const auto &[tensor, ptr] : external_tensors) {
+                    tensor_map[tensor] = reinterpret_cast<void *>(ptr);
+                }
+                self->compile(plan, device_id, name, tensor_map);
+            },
+            py::arg("device_id"), py::arg("plan"), py::arg("name") = "executor",
+            py::arg("external_tensors") = std::unordered_map<ark::Tensor, uintptr_t>())
         .def("launch", [](ark::Executor *self, uintptr_t stream, bool loop_mode) {
                  self->launch(reinterpret_cast<ark::Stream>(stream), loop_mode);
              },
diff --git a/python/model_py.cpp b/python/model_py.cpp
index c224a3d5b..d1150e485 100644
--- a/python/model_py.cpp
+++ b/python/model_py.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
+#include <dlpack/dlpack.h>
 #include <pybind11/operators.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
@@ -8,8 +9,65 @@
 #include <ark/model.hpp>
 #include <ark/model_graph.hpp>
 
+#include "logging.hpp"
+
 namespace py = pybind11;
 
+struct DLTensorMetadata {
+    void *data_ptr;
+    int32_t device_id;
+    DLDeviceType device_type;
+    int32_t ndim;
+    DLDataType dtype;
+    std::vector<int64_t> shape;
+    std::vector<int64_t> strides;
+    uint64_t byte_offset;
+};
+
+static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor *dl_tensor) {
+    DLTensorMetadata metadata;
+    metadata.data_ptr = dl_tensor->dl_tensor.data;
+    metadata.device_id = dl_tensor->dl_tensor.device.device_id;
+    metadata.device_type = dl_tensor->dl_tensor.device.device_type;
+    metadata.ndim = dl_tensor->dl_tensor.ndim;
+    metadata.dtype = dl_tensor->dl_tensor.dtype;
+    metadata.shape.assign(
+        dl_tensor->dl_tensor.shape,
+        dl_tensor->dl_tensor.shape + dl_tensor->dl_tensor.ndim);
+    if (dl_tensor->dl_tensor.strides != nullptr) {
+        metadata.strides.assign(
+            dl_tensor->dl_tensor.strides,
+            dl_tensor->dl_tensor.strides + dl_tensor->dl_tensor.ndim);
+    }
+    metadata.byte_offset = dl_tensor->dl_tensor.byte_offset;
+    return metadata;
+}
+
+static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) {
+    if (dl_dtype.lanes != 1) {
+        ERR(ark::UnsupportedError, "unsupported data type");
+    }
+    ark::DataType ark_dtype;
+    if (dl_dtype.code == kDLFloat && dl_dtype.bits == 32) {
+        ark_dtype = ark::FP32;
+    } else if (dl_dtype.code == kDLFloat && dl_dtype.bits == 16) {
+        ark_dtype = ark::FP16;
+    } else if (dl_dtype.code == kDLBfloat && dl_dtype.bits == 16) {
+        ark_dtype = ark::BF16;
+    } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 32) {
+        ark_dtype = ark::INT32;
+    } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 32) {
+        ark_dtype = ark::UINT32;
+    } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 8) {
+        ark_dtype = ark::INT8;
+    } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 8) {
+        ark_dtype = ark::UINT8;
+    } else {
+        ERR(ark::UnsupportedError, "unsupported data type");
+    }
+    return ark_dtype;
+}
+
 void register_model(py::module &m) {
     py::class_<ark::Model, ark::ModelGraph>(m, "_Model")
         .def(py::init<int, int>(), py::arg("rank"), py::arg("world_size"))
@@ -112,6 +170,27 @@ void register_model(py::module &m) {
              py::arg("shape"), py::arg("data_type"), py::arg("strides"),
              py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"),
              py::arg("name"))
+        .def("placeholder",
+             py::overload_cast<const ark::Dims &, const ark::DataType &,
+                               const ark::Dims &, const ark::Dims &,
+                               const ark::Dims &, int, const std::string &,
+                               void *>(&ark::Model::placeholder),
+             py::arg("shape"), py::arg("data_type"), py::arg("strides"),
+             py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"),
+             py::arg("name"), py::arg("external_data"))
+        .def(
+            "placeholder",
+            [](ark::Model &self, py::capsule input, int rank,
+               const std::string &name) {
+                DLManagedTensor *dl_tensor =
+                    static_cast<DLManagedTensor *>(input.get_pointer());
+                DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor);
+                ark::DataType ark_dtype = from_dl_dtype(metadata.dtype);
+                ark::Dims shape(metadata.shape);
+                return self.placeholder(shape, ark_dtype, {}, {}, {}, rank,
+                                        name, metadata.data_ptr);
+            },
+            py::arg("external_tensor"), py::arg("rank"), py::arg("name"))
         .def("transpose", &ark::Model::transpose, py::arg("input"),
              py::arg("permutation"), py::arg("output"), py::arg("name"))
         .def("all_reduce", &ark::Model::all_reduce, py::arg("input"),
diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp
index 5abb35c66..5c28563de 100644
--- a/python/tensor_py.cpp
+++ b/python/tensor_py.cpp
@@ -1,87 +1,16 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
 
-#include <dlpack/dlpack.h>
 #include <pybind11/operators.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 
 #include <ark/tensor.hpp>
 
-#include "logging.hpp"
-
 namespace py = pybind11;
 
-struct DLTensorMetadata {
-    void* data_ptr;
-    int32_t device_id;
-    DLDeviceType device_type;
-    int32_t ndim;
-    DLDataType dtype;
-    std::vector<int64_t> shape;
-    std::vector<int64_t> strides;
-    uint64_t byte_offset;
-};
-
-static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor* dl_tensor) {
-    DLTensorMetadata metadata;
-    metadata.data_ptr = dl_tensor->dl_tensor.data;
-    metadata.device_id = dl_tensor->dl_tensor.device.device_id;
-    metadata.device_type = dl_tensor->dl_tensor.device.device_type;
-    metadata.ndim = dl_tensor->dl_tensor.ndim;
-    metadata.dtype = dl_tensor->dl_tensor.dtype;
-    metadata.shape.assign(
-        dl_tensor->dl_tensor.shape,
-        dl_tensor->dl_tensor.shape + dl_tensor->dl_tensor.ndim);
-    if (dl_tensor->dl_tensor.strides != nullptr) {
-        metadata.strides.assign(
-            dl_tensor->dl_tensor.strides,
-            dl_tensor->dl_tensor.strides + dl_tensor->dl_tensor.ndim);
-    }
-    metadata.byte_offset = dl_tensor->dl_tensor.byte_offset;
-    return metadata;
-}
-
-static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) {
-    if (dl_dtype.lanes != 1) {
-        ERR(ark::UnsupportedError, "unsupported data type");
-    }
-    ark::DataType ark_dtype;
-    if (dl_dtype.code == kDLFloat && dl_dtype.bits == 32) {
-        ark_dtype = ark::FP32;
-    } else if (dl_dtype.code == kDLFloat && dl_dtype.bits == 16) {
-        ark_dtype = ark::FP16;
-    } else if (dl_dtype.code == kDLBfloat && dl_dtype.bits == 16) {
-        ark_dtype = ark::BF16;
-    } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 32) {
-        ark_dtype = ark::INT32;
-    } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 32) {
-        ark_dtype = ark::UINT32;
-    } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 8) {
-        ark_dtype = ark::INT8;
-    } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 8) {
-        ark_dtype = ark::UINT8;
-    } else {
-        ERR(ark::UnsupportedError, "unsupported data type");
-    }
-    return ark_dtype;
-}
-
 void register_tensor(py::module& m) {
     py::class_<ark::Tensor>(m, "_Tensor")
-        .def(py::init([](py::capsule capsule) {
-            DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule;
-            if (!dl_tensor) {
-                ERR(ark::InvalidUsageError,
-                    "Capsule does not contain a DLManagedTensor");
-            }
-            DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor);
-            int32_t device_id = metadata.device_id;
-            void* data_ptr = metadata.data_ptr;
-            auto shape = metadata.shape;
-
-            return ark::Tensor(data_ptr, device_id, shape, from_dl_dtype(metadata.dtype));
-        }))
         .def("id", &ark::Tensor::id)
         .def("shape", &ark::Tensor::shape)
         .def("strides", &ark::Tensor::strides)
diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py
index 833b88662..83fb77b31 100644
--- a/python/unittest/test_conversion.py
+++ b/python/unittest/test_conversion.py
@@ -37,9 +37,9 @@ def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType):
     input_tensor.from_numpy(input_tensor_host)
     other_tensor.from_numpy(other_tensor_host)
 
-    input_view = input_tensor.get_torch_view()
-    other_view = other_tensor.get_torch_view()
-    output_view = output_tensor.get_torch_view()
+    input_view = input_tensor.to_torch()
+    other_view = other_tensor.to_torch()
+    output_view = output_tensor.to_torch()
 
     runtime.run()
 
@@ -50,7 +50,7 @@ def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType):
     output_tensor_host = output_tensor.to_numpy()
 
     runtime.stop()
-    runtime.delete_all_runtimes()
+    runtime.reset()
 
     assert np.allclose(input_tensor_host, input_view_numpy)
     assert np.allclose(other_tensor_host, other_view_numpy)
@@ -83,9 +83,9 @@ def test_ark_to_torch_aliasing(dtype: ark.DataType):
     input_tensor.from_numpy(input_tensor_host)
     other_tensor.from_numpy(other_tensor_host)
 
-    input_view = input_tensor.get_torch_view()
-    other_view = other_tensor.get_torch_view()
-    output_view = output_tensor.get_torch_view()
+    input_view = input_tensor.to_torch()
+    other_view = other_tensor.to_torch()
+    output_view = output_tensor.to_torch()
     # make changes to the views
     input_view[1, 1] = 20
     other_view[0, 0] = 30
@@ -105,7 +105,7 @@ def test_ark_to_torch_aliasing(dtype: ark.DataType):
     runtime.stop()
     runtime.reset()
 
-
+pytest.mark.skip()
 def test_conversion_torch():
     if _no_torch:
         pytest.skip("PyTorch not available")
@@ -149,8 +149,8 @@ def test_bin_op(dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims):
     input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
     other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
     expected_output = torch_op(input_tensor, other_tensor).cpu().numpy()
-    input_ark_view = ark.Tensor.from_torch(input_tensor)
-    other_ark_view = ark.Tensor.from_torch(other_tensor)
+    input_ark_view = ark.placeholder(torch_tensor=input_tensor)
+    other_ark_view = ark.placeholder(torch_tensor=other_tensor)
     output = ark_op(input_ark_view, other_ark_view)
     runtime = ark.Runtime()
     runtime.launch()
@@ -170,7 +170,7 @@ def test_unary_op(dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims):
     ark.init()
     input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
     expected_output = torch_op(input_tensor).cpu().numpy()
-    input_ark_view = ark.Tensor.from_torch(input_tensor)
+    input_ark_view = ark.placeholder(torch_tensor=input_tensor)
     output = ark_op(input_ark_view)
     runtime = ark.Runtime()
     runtime.launch()
@@ -189,8 +189,8 @@ def test_torch_to_ark_aliasing(dtype, tensor_dims):
     input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
     other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
 
-    input_ark_view = ark.Tensor.from_torch(input_tensor)
-    other_ark_view = ark.Tensor.from_torch(other_tensor)
+    input_ark_view = ark.placeholder(torch_tensor=input_tensor)
+    other_ark_view = ark.placeholder(torch_tensor=other_tensor)
 
     output = ark.add(input_ark_view, other_ark_view)
     # Perform in place operations
@@ -205,3 +205,65 @@ def test_torch_to_ark_aliasing(dtype, tensor_dims):
     runtime.stop()
     runtime.reset()
     assert np.allclose(output_host, expected_output)
+
+
+# Staged View Tests
+
+
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.add, torch.add, (2, 3))],
+)
+def test_bin_op_staged(
+    dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims
+):
+    ark.init()
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    expected_output = torch_op(input_tensor, other_tensor).cpu().numpy()
+    input_ark_view = ark.placeholder(
+        shape=tensor_dims, dtype=ark.DataType.from_torch(dtype)
+    )
+    other_ark_view = ark.placeholder(
+        shape=tensor_dims, dtype=ark.DataType.from_torch(dtype)
+    )
+    output = ark_op(input_ark_view, other_ark_view)
+    runtime = ark.Runtime()
+    tensor_mapping = {
+        input_ark_view: input_tensor,
+        other_ark_view: other_tensor,
+    }
+    runtime.launch(tensor_mappings=tensor_mapping)
+    runtime.run()
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)
+
+test_bin_op_staged(torch.float16, ark.add, torch.add, (2, 3))
+
+
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.exp, torch.exp, (3, 3))],
+)
+def test_unary_op_staged(
+    dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims
+):
+    ark.init()
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    expected_output = torch_op(input_tensor).cpu().numpy()
+    input_ark_view = ark.placeholder(
+        shape=tensor_dims, dtype=ark.DataType.from_torch(dtype)
+    )
+    output = ark_op(input_ark_view)
+    runtime = ark.Runtime()
+    tensor_mapping = {input_ark_view: input_tensor}
+    runtime.launch()
+    runtime.run(tensor_mappings=tensor_mapping)
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)
+
+test_unary_op_staged(torch.float16, ark.exp, torch.exp, (3, 3))

From d0a18361ef0db3f7447d83d4110f690e163957ac Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Thu, 15 Aug 2024 11:46:21 +0000
Subject: [PATCH 79/79] rename & remove unneeded code & python interface

---
 ark/api/executor.cpp             | 35 ++++++++--------
 ark/api/tensor.cpp               | 12 ------
 ark/codegen.cpp                  |  2 +-
 ark/codegen.hpp                  |  1 -
 ark/external_buffer_registry.cpp | 32 +++++++++++++++
 ark/external_buffer_registry.hpp | 31 ++++++++++++++
 ark/gpu/gpu.hpp                  |  4 ++
 ark/include/ark/model.hpp        |  8 ++--
 ark/include/ark/tensor.hpp       |  4 --
 ark/model/model_buffer.cpp       | 62 +++++-----------------------
 ark/model/model_buffer.hpp       | 22 +++-------
 ark/model_buffer_manager.hpp     | 62 ----------------------------
 ark/ops/ops_placeholder.cpp      | 28 +++++--------
 ark/ops/ops_placeholder.hpp      |  2 +-
 ark/ops/ops_placeholder_test.cpp |  2 +-
 python/ark/ops.py                | 62 +++++++++++-----------------
 python/ark/tensor.py             | 70 ++++++++++++++++++++++++++++++--
 python/model_py.cpp              | 24 +++++++----
 python/tensor_py.cpp             | 25 ++++++------
 19 files changed, 237 insertions(+), 251 deletions(-)
 create mode 100644 ark/external_buffer_registry.cpp
 create mode 100644 ark/external_buffer_registry.hpp
 delete mode 100644 ark/model_buffer_manager.hpp

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 47a7a7519..06f31e67d 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -15,6 +15,7 @@
 #include "ark/planner.hpp"
 #include "codegen.hpp"
 #include "env.h"
+#include "external_buffer_registry.hpp"
 #include "file_io.h"
 #include "gpu/gpu.hpp"
 #include "gpu/gpu_event.hpp"
@@ -25,8 +26,6 @@
 #include "model/model_buffer.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
-#include "model_buffer_manager.hpp"
-#include "unordered_map"
 #include "utils/utils_net.hpp"
 
 #if defined(ARK_CUDA)
@@ -408,7 +407,7 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
     std::map<int, std::map<int, size_t>> remote_rank_to_send_tag_to_buffer_id;
     std::map<int, std::map<int, size_t>> remote_rank_to_recv_tag_to_buffer_id;
 
-    auto &buffer_manager = ModelBufferManager::get_instance();
+    auto &ext_buf_reg = ExternalBufferRegistry::get_instance();
 
     // TODO: improve memory planning
     size_t offset = 0;
@@ -428,12 +427,16 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             }
             continue;
         }
-        if (buffer_manager.is_external(buf_id)) {
-            if (buf_info->buffer->device_id() != device_id_) {
+        void *ext_data = ext_buf_reg.get(buf_id);
+        if (ext_data) {
+            gpuPointerAttributes attr;
+            GLOG(gpuPointerGetAttributes(&attr, ext_data));
+            if (attr.device != device_id_) {
                 ERR(InvalidUsageError,
-                    "PyTorch tensor and model execution are on different GPUs");
+                    "External data provided is on a different GPU: ",
+                    attr.device, " vs ", device_id_);
             }
-            external_buffers_.push_back(buffer_manager.get_buffer(buf_id));
+            external_buffers_.push_back(ext_data);
             const auto [it, inserted] = buffer_id_to_name_.try_emplace(
                 buf_id, "extern_buf_" + std::to_string(buf_id));
             external_args_.push_back(it->second);
@@ -540,7 +543,8 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         for (int i = 0; i < len; ++i) {
             const size_t buf_id =
                 buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]->buffer->id();
-            if (!buffer_manager.is_external(buf_id)) {
+            void *buf_data = ext_buf_reg.get(buf_id);
+            if (buf_data == nullptr) {
                 buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] =
                     offsets[i];
             }
@@ -561,7 +565,8 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         for (int i = 0; i < len; ++i) {
             const size_t buf_id =
                 buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]->buffer->id();
-            if (!buffer_manager.is_external(buf_id)) {
+            void *buf_data = ext_buf_reg.get(buf_id);
+            if (buf_data == nullptr) {
                 buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] =
                     offsets[i];
             }
@@ -884,9 +889,10 @@ void Executor::Impl::barrier() {
 
 void *Executor::Impl::tensor_address(const Tensor &tensor) const {
     size_t buffer_id = tensor.ref()->buffer()->id();
-    auto &buffer_manager = ModelBufferManager::get_instance();
-    if (buffer_manager.is_external(buffer_id)) {
-        return buffer_manager.get_buffer(buffer_id);
+    auto &ext_buf_reg = ExternalBufferRegistry::get_instance();
+    void *ext_data = ext_buf_reg.get(buffer_id);
+    if (ext_data) {
+        return ext_data;
     }
     if (buffer_id_to_addr_.find(buffer_id) == buffer_id_to_addr_.end()) {
         ERR(InvalidUsageError, "Tensor has an unknown buffer ID ", buffer_id,
@@ -1041,10 +1047,7 @@ float Executor::stop(int64_t max_spin_count) {
 
 void Executor::barrier() { impl_->barrier(); }
 
-void Executor::destroy() {
-    ModelBufferManager::get_instance().clear_buffers();
-    impl_.reset(nullptr);
-}
+void Executor::destroy() { impl_.reset(nullptr); }
 
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp
index 084ce6383..fc44b4a58 100644
--- a/ark/api/tensor.cpp
+++ b/ark/api/tensor.cpp
@@ -9,18 +9,6 @@
 
 namespace ark {
 
-Tensor::Tensor(void* data_ptr, int32_t device_id,
-               const std::vector<int64_t>& shape, const DataType& dtype) {
-    size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1,
-                                                std::multiplies<int64_t>()) *
-                                dtype.bytes();
-    auto buffer =
-        std::make_shared<ModelBuffer>(data_ptr, external_data_size, device_id);
-    auto tensor = std::make_shared<ModelTensor>(
-        dtype.ref(), buffer, Dims(shape), Dims(shape), Dims(), Dims());
-    ref_ = tensor;
-}
-
 size_t Tensor::id() const {
     if (ref_) {
         return ref_->id();
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index 2bd36d679..4a1c1ed81 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -7,13 +7,13 @@
 
 #include "ark/data_type.hpp"
 #include "env.h"
+#include "external_buffer_registry.hpp"
 #include "file_io.h"
 #include "logging.hpp"
 #include "model/model_buffer.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_op.hpp"
 #include "model/model_tensor.hpp"
-#include "model_buffer_manager.hpp"
 #include "range.hpp"
 #include "utils/utils_math.hpp"
 
diff --git a/ark/codegen.hpp b/ark/codegen.hpp
index 8a4eed270..89d89080e 100644
--- a/ark/codegen.hpp
+++ b/ark/codegen.hpp
@@ -9,7 +9,6 @@
 #include <string>
 
 #include "model/model_json.hpp"
-#include "model_buffer_manager.hpp"
 
 namespace ark {
 
diff --git a/ark/external_buffer_registry.cpp b/ark/external_buffer_registry.cpp
new file mode 100644
index 000000000..450dd332b
--- /dev/null
+++ b/ark/external_buffer_registry.cpp
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "external_buffer_registry.hpp"
+
+#include "logging.hpp"
+
+namespace ark {
+
+ExternalBufferRegistry &ExternalBufferRegistry::get_instance() {
+    static ExternalBufferRegistry instance;
+    return instance;
+}
+
+void ExternalBufferRegistry::set(const size_t id, void *data) {
+    if (data == nullptr) {
+        ERR(InternalError, "data is nullptr.");
+    }
+    buffers_[id] = data;
+}
+
+void *ExternalBufferRegistry::get(const size_t id) const {
+    auto it = buffers_.find(id);
+    if (it != buffers_.end()) {
+        return it->second;
+    }
+    return nullptr;
+}
+
+void ExternalBufferRegistry::clear() { buffers_.clear(); }
+
+}  // namespace ark
diff --git a/ark/external_buffer_registry.hpp b/ark/external_buffer_registry.hpp
new file mode 100644
index 000000000..ab199bafc
--- /dev/null
+++ b/ark/external_buffer_registry.hpp
@@ -0,0 +1,31 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_EXTERNAL_BUFFER_REGISTRY_HPP_
+#define ARK_EXTERNAL_BUFFER_REGISTRY_HPP_
+
+#include <unordered_map>
+
+namespace ark {
+// Manages externally allocated buffers (buffers corresponding to Tensors that
+// are the output of a `placeholder` operation) outside of ARK's memory space.
+class ExternalBufferRegistry {
+   public:
+    static ExternalBufferRegistry &get_instance();
+
+    void set(const size_t id, void *data);
+
+    void *get(const size_t id) const;
+
+    void clear();
+
+   private:
+    // Maps buffer IDs to pointers and sizes.
+    std::unordered_map<size_t, void *> buffers_;
+    ExternalBufferRegistry() {}
+    ExternalBufferRegistry(const ExternalBufferRegistry &) = delete;
+    ExternalBufferRegistry &operator=(const ExternalBufferRegistry &) = delete;
+};
+}  // namespace ark
+
+#endif  // ARK_EXTERNAL_BUFFER_REGISTRY_HPP_
diff --git a/ark/gpu/gpu.hpp b/ark/gpu/gpu.hpp
index 531d6c7ee..8ff3b2843 100644
--- a/ark/gpu/gpu.hpp
+++ b/ark/gpu/gpu.hpp
@@ -53,6 +53,8 @@ ARK_GPU_DEFINE_TYPE_ALIAS(gpuModule, CUmodule, hipModule_t);
 ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunction, CUfunction, hipFunction_t);
 ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunctionAttribute, CUfunction_attribute,
                           hipFunction_attribute);
+ARK_GPU_DEFINE_TYPE_ALIAS(gpuPointerAttributes, cudaPointerAttributes,
+                          hipPointerAttributes);
 
 // runtime API
 ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuSuccess, cudaSuccess, hipSuccess);
@@ -126,6 +128,8 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops,
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString,
                           hipGetErrorString);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError);
+ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerGetAttributes, cudaPointerGetAttributes,
+                          hipPointerGetAttributes);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute,
                           hipDeviceGetAttribute);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize,
diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp
index 08b8fe639..e1b1f462b 100644
--- a/ark/include/ark/model.hpp
+++ b/ark/include/ark/model.hpp
@@ -97,17 +97,15 @@ class Model : public ModelGraph {
     /// padded shape is not provided, it is set to the @p shape.
     /// @param rank Rank of the tensor. -1 means the rank of this model.
     /// @param name Name of the tensor.
-    /// @param external_data Pointer to an external data buffer. If provided,
-    /// this buffer is registered with the ModelBufferManager and associated
+    /// @param data Address of data to pass through placeholder. If provided,
+    /// this buffer is registered with the ExternalBufferRegistry and associated
     /// with the tensor.
     /// @return Pointer to a tensor object that references the external buffer.
     ///
-    ///
     Tensor placeholder(const Dims &shape, const DataType &data_type,
                        const Dims &strides = {}, const Dims &offsets = {},
                        const Dims &padded_shape = {}, int rank = -1,
-                       const std::string &name = "",
-                       void *external_data = nullptr);
+                       void *data = nullptr, const std::string &name = "");
 
     Tensor refer(Tensor input, const Dims &shape = {}, const Dims &strides = {},
                  const Dims &offsets = {}, const Dims &padded_shape = {},
diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp
index 72ff9ff57..8d6582971 100644
--- a/ark/include/ark/tensor.hpp
+++ b/ark/include/ark/tensor.hpp
@@ -31,8 +31,6 @@ class Tensor {
     Tensor(ModelTensorRef ref) : ref_(ref) {}
     Tensor(const Tensor &other) = default;
     Tensor &operator=(const Tensor &other) = default;
-    Tensor(void *data_ptr, int32_t device_id, const std::vector<int64_t> &shape,
-           const DataType &dtype);
 
     bool operator==(const Tensor &other) const { return ref_ == other.ref_; }
     bool operator!=(const Tensor &other) const { return ref_ != other.ref_; }
@@ -54,8 +52,6 @@ class Tensor {
     const DataType &data_type() const;
 
     Dims torch_strides() const;
-
-    friend struct std::hash<Tensor>;
 };
 
 const Tensor NullTensor;
diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp
index 5ce255ce5..5e2409537 100644
--- a/ark/model/model_buffer.cpp
+++ b/ark/model/model_buffer.cpp
@@ -3,19 +3,22 @@
 
 #include "model_buffer.hpp"
 
+#include "external_buffer_registry.hpp"
 #include "logging.hpp"
-#include "model_buffer_manager.hpp"
 
 namespace ark {
 
 size_t ModelBuffer::curr_id = 0;
 
-ModelBuffer::ModelBuffer(int rank) : rank_(rank) { id_ = curr_id++; }
+ModelBuffer::ModelBuffer(int rank, bool is_external)
+    : rank_(rank), is_external_(is_external) {
+    id_ = curr_id++;
+}
 
-ModelBuffer::ModelBuffer(size_t id, int rank,
+ModelBuffer::ModelBuffer(size_t id, int rank, bool is_external,
                          const std::vector<TagInfo> &send_tags,
                          const std::vector<TagInfo> &recv_tags)
-    : id_(id), rank_(rank) {
+    : id_(id), rank_(rank), is_external_(is_external) {
     for (const auto &info : send_tags) {
         send_tags_.insert(info);
     }
@@ -24,23 +27,6 @@ ModelBuffer::ModelBuffer(size_t id, int rank,
     }
 }
 
-ModelBuffer::ModelBuffer(void *data, size_t size, int32_t device_id)
-    : rank_(-1),
-      external_data_(data),
-      external_data_size_(size),
-      device_id_(device_id),
-      is_external_(true) {
-    id_ = curr_id++;
-}
-
-ModelBuffer::ModelBuffer(size_t id, void *data, size_t size, int32_t device_id)
-    : id_(id),
-      rank_(-1),
-      external_data_(data),
-      external_data_size_(size),
-      device_id_(device_id),
-      is_external_(true) {}
-
 void ModelBuffer::tag_send(int remote_rank, int tag) {
     send_tags_.insert(TagInfo{remote_rank, tag});
 }
@@ -61,16 +47,9 @@ Json ModelBuffer::serialize() const {
     for (const auto &info : recv_tags_) {
         recv_tags.push_back({info.first, info.second});
     }
+    j["IsExternal"] = is_external_;
     j["SendTags"] = send_tags;
     j["RecvTags"] = recv_tags;
-    j["IsExternal"] = is_external_;
-    if (is_external_) {
-        ModelBufferManager::get_instance().register_buffer(id_, external_data_,
-                                                           external_data_size_);
-        j["ExternalDataSize"] = external_data_size_;
-        j["DeviceId"] = device_id_;
-    }
-    // external_data_ptr_ is not included in JSON
     return j;
 }
 
@@ -88,28 +67,9 @@ std::shared_ptr<ModelBuffer> ModelBuffer::deserialize(const Json &serialized) {
         ERR(ModelError,
             "ModelBuffer deserialization failed: missing IsExternal");
     }
-    if (serialized["IsExternal"]) {
-        if (!serialized.contains("ExternalDataSize")) {
-            ERR(ModelError,
-                "ModelBuffer deserialization failed: missing ExternalDataSize");
-        } else if (!serialized.contains("DeviceId")) {
-            ERR(ModelError,
-                "ModelBuffer deserialization failed: missing DeviceId");
-        }
-        void *data_ptr =
-            ModelBufferManager::get_instance().get_buffer(serialized["Id"]);
-        if (!data_ptr) {
-            ERR(ModelError,
-                "ModelBuffer deserialization failed: external buffer not found "
-                "in BufferManager");
-        }
-        return std::make_shared<ModelBuffer>(serialized["Id"], data_ptr,
-                                             serialized["ExternalDataSize"],
-                                             serialized["DeviceId"]);
-    }
-    return std::make_shared<ModelBuffer>(serialized["Id"], serialized["Rank"],
-                                         serialized["SendTags"],
-                                         serialized["RecvTags"]);
+    return std::make_shared<ModelBuffer>(
+        serialized["Id"], serialized["Rank"], serialized["IsExternal"],
+        serialized["SendTags"], serialized["RecvTags"]);
 }
 
 }  // namespace ark
diff --git a/ark/model/model_buffer.hpp b/ark/model/model_buffer.hpp
index e7f1045b2..8b66356b1 100644
--- a/ark/model/model_buffer.hpp
+++ b/ark/model/model_buffer.hpp
@@ -17,19 +17,18 @@ class ModelBuffer {
     // (remote_rank, tag)
     using TagInfo = std::pair<int, int>;
 
-    ModelBuffer(int rank = -1);
+    ModelBuffer(int rank = -1, bool is_external = false);
 
-    ModelBuffer(size_t id, int rank, const std::vector<TagInfo> &send_tags,
+    ModelBuffer(size_t id, int rank, bool is_external,
+                const std::vector<TagInfo> &send_tags,
                 const std::vector<TagInfo> &recv_tags);
 
-    // externally managed buffer
-    ModelBuffer(void *data, size_t size, int32_t device_id);
-    ModelBuffer(size_t id, void *data, size_t size, int32_t device_id);
-
     size_t id() const { return id_; }
 
     int rank() const { return rank_; }
 
+    bool is_external() const { return is_external_; }
+
     const std::set<TagInfo> &send_tags() const { return send_tags_; }
 
     const std::set<TagInfo> &recv_tags() const { return recv_tags_; }
@@ -48,22 +47,13 @@ class ModelBuffer {
 
     static std::shared_ptr<ModelBuffer> deserialize(const Json &serialized);
 
-    // external buffer management
-    size_t external_data_size() const { return external_data_size_; }
-    void *external_data() const { return external_data_; }
-    int32_t device_id() const { return device_id_; }
-    bool is_external() const { return is_external_; }
-
    private:
     static size_t curr_id;
     size_t id_;
     int rank_;
+    bool is_external_;
     std::set<TagInfo> send_tags_;
     std::set<TagInfo> recv_tags_;
-    void *external_data_ = nullptr;
-    size_t external_data_size_ = 0;
-    int32_t device_id_;
-    bool is_external_ = false;
 };
 
 }  // namespace ark
diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp
deleted file mode 100644
index 3e82b05f5..000000000
--- a/ark/model_buffer_manager.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) Microsoft Corporation.
-// Licensed under the MIT license.
-
-#ifndef ARK_MODEL_BUFFER_MANAGER_HPP_
-#define ARK_MODEL_BUFFER_MANAGER_HPP_
-
-#include <tuple>
-#include <unordered_map>
-
-namespace ark {
-// Manages externally allocated buffers (buffers corresponding to Tensors that
-// are the output of a `placeholder` operation) outside of ARK's memory space.
-class ModelBufferManager {
-   public:
-    static ModelBufferManager& get_instance() {
-        static ModelBufferManager instance;
-        return instance;
-    }
-
-    void register_buffer(const size_t id, void* const data, const size_t size) {
-        buffers_[id] = std::make_tuple(data, size);
-    }
-
-    void* get_buffer(const size_t id) const {
-        auto it = buffers_.find(id);
-        if (it != buffers_.end()) {
-            return std::get<0>(it->second);
-        }
-        return nullptr;
-    }
-
-    size_t get_buffer_size(const size_t id) const {
-        auto it = buffers_.find(id);
-        if (it != buffers_.end()) {
-            return std::get<1>(it->second);
-        }
-        return 0;
-    }
-
-    bool is_external(const size_t id) const {
-        return buffers_.find(id) != buffers_.end();
-    }
-
-    const std::unordered_map<size_t, std::tuple<void*, size_t>>& get_buffers()
-        const {
-        return buffers_;
-    }
-
-    void clear_buffers() { buffers_.clear(); }
-
-    bool is_empty() const { return buffers_.empty(); }
-
-   private:
-    // Maps buffer IDs to pointers and sizes.
-    std::unordered_map<size_t, std::tuple<void*, size_t>> buffers_;
-    ModelBufferManager() {}
-    ModelBufferManager(const ModelBufferManager&) = delete;
-    ModelBufferManager& operator=(const ModelBufferManager&) = delete;
-};
-}  // namespace ark
-
-#endif  // ARK_MODEL_BUFFER_MANAGER_HPP_
diff --git a/ark/ops/ops_placeholder.cpp b/ark/ops/ops_placeholder.cpp
index fbac73902..73c1c1b25 100644
--- a/ark/ops/ops_placeholder.cpp
+++ b/ark/ops/ops_placeholder.cpp
@@ -3,8 +3,8 @@
 
 #include "ops_placeholder.hpp"
 
+#include "external_buffer_registry.hpp"
 #include "logging.hpp"
-#include "model_buffer_manager.hpp"
 #include "ops_common.hpp"
 
 namespace ark {
@@ -12,22 +12,13 @@ namespace ark {
 ModelOpPlaceholder::ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
                                        ModelDataType data_type,
                                        const Dims &strides, const Dims &offsets,
-                                       const Dims &padded_shape,
-                                       void *external_data)
+                                       const Dims &padded_shape, void *data)
     : ModelOp("Placeholder", true) {
     if (!buffer) {
-        buffer = std::make_shared<ModelBuffer>();
+        buffer = std::make_shared<ModelBuffer>(-1, true);
     }
-    const std::vector<DimType> &shape_vec = shape.vector();
-    DataType dtype = ModelDataType(data_type);
 
-    size_t external_data_size =
-        std::accumulate(shape_vec.begin(), shape_vec.end(), 1,
-                        std::multiplies<int64_t>()) *
-        dtype.bytes();
-
-    ModelBufferManager::get_instance().register_buffer(
-        buffer->id(), external_data, external_data_size);
+    ExternalBufferRegistry::get_instance().set(buffer->id(), data);
 
     ModelTensorRef tensor = std::make_shared<ModelTensor>(
         data_type, buffer, shape, strides, offsets, padded_shape);
@@ -39,8 +30,8 @@ ModelOpPlaceholder::ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
 
 Tensor Model::placeholder(const Dims &shape, const DataType &data_type,
                           const Dims &strides, const Dims &offsets,
-                          const Dims &padded_shape, int rank,
-                          const std::string &name, void *external_data) {
+                          const Dims &padded_shape, int rank, void *data,
+                          const std::string &name) {
     if (rank != -1) {
         if (rank == this->rank()) {
             rank = -1;
@@ -50,8 +41,9 @@ Tensor Model::placeholder(const Dims &shape, const DataType &data_type,
     }
     return impl_
         ->create_op<ModelOpPlaceholder>(
-            name, std::make_shared<ModelBuffer>(rank), shape, data_type.ref(),
-            strides, offsets, padded_shape, external_data)
+            name, std::make_shared<ModelBuffer>(rank, true), shape,
+            data_type.ref(), strides, offsets, padded_shape, data)
         ->result_tensors()[0];
 }
-}  // namespace ark
\ No newline at end of file
+
+}  // namespace ark
diff --git a/ark/ops/ops_placeholder.hpp b/ark/ops/ops_placeholder.hpp
index 7fb53f983..91dd874ae 100644
--- a/ark/ops/ops_placeholder.hpp
+++ b/ark/ops/ops_placeholder.hpp
@@ -15,7 +15,7 @@ class ModelOpPlaceholder : public ModelOp {
     ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape,
                        ModelDataType data_type, const Dims &strides,
                        const Dims &offsets, const Dims &padded_shape,
-                       void *external_data = nullptr);
+                       void *data = nullptr);
 };
 
 }  // namespace ark
diff --git a/ark/ops/ops_placeholder_test.cpp b/ark/ops/ops_placeholder_test.cpp
index 903d87593..223872320 100644
--- a/ark/ops/ops_placeholder_test.cpp
+++ b/ark/ops/ops_placeholder_test.cpp
@@ -27,7 +27,7 @@ ark::unittest::State test_ops_placeholder_value_contiguous() {
     // Associate the initialized device buffer with a tensor produced from a
     // placeholder operation
     ark::Tensor tns =
-        model.placeholder(shape, ark::FP32, {}, {}, {}, -1, "", d_ext_buffer);
+        model.placeholder(shape, ark::FP32, {}, {}, {}, -1, d_ext_buffer);
 
     ark::Tensor res = model.add(tns, 1.0);
 
diff --git a/python/ark/ops.py b/python/ark/ops.py
index f8b75a70b..1e03cae97 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -3,7 +3,7 @@
 
 from typing import List, Iterable, Union
 
-from .tensor import Dims, Tensor, Parameter, NullTensor
+from .tensor import Dims, Tensor, Parameter, NullTensor, _cpp_tensor
 from .data_type import DataType, fp32
 from .model import Model
 
@@ -12,42 +12,6 @@ def _is_list_or_tuple(obj):
     return isinstance(obj, list) or isinstance(obj, tuple)
 
 
-def _tensor(
-    shape: Iterable[int],
-    dtype: DataType = fp32,
-    strides: Iterable[int] = [],
-    offsets: Iterable[int] = [],
-    padded_shape: Iterable[int] = [],
-    rank: int = -1,
-    name: str = "",
-) -> Tensor:
-    if not _is_list_or_tuple(shape):
-        raise ValueError("shape should be a list or tuple of integers")
-    if not _is_list_or_tuple(strides):
-        raise ValueError("strides should be a list or tuple of integers")
-    if not _is_list_or_tuple(offsets):
-        raise ValueError("offsets should be a list or tuple of integers")
-    if not _is_list_or_tuple(padded_shape):
-        raise ValueError("padded_shape should be a list or tuple of integers")
-    # only support tensors with up to 4 dimensions
-    if (
-        len(shape) > 4
-        or len(strides) > 4
-        or len(offsets) > 4
-        or len(padded_shape) > 4
-    ):
-        raise ValueError("Only support tensors with up to 4 dimensions")
-    return Model.get_model().tensor(
-        Dims(shape),
-        dtype.ctype(),
-        Dims(strides),
-        Dims(offsets),
-        Dims(padded_shape),
-        rank,
-        name,
-    )
-
-
 def add(
     input: Union[Tensor, float],
     other: Union[Tensor, float],
@@ -258,6 +222,24 @@ def noop(input: Tensor, name: str = "noop"):
     Model.get_model().noop(input._tensor, name)
 
 
+def placeholder(
+    shape: Iterable[int],
+    dtype: DataType = fp32,
+    strides: Iterable[int] = [],
+    offsets: Iterable[int] = [],
+    padded_shape: Iterable[int] = [],
+    rank: int = -1,
+    data: int = 0,
+    name: str = "placeholder",
+) -> Tensor:
+    """ """
+    return Tensor(
+        _cpp_tensor(
+            shape, dtype, strides, offsets, padded_shape, rank, data, name
+        )
+    )
+
+
 def reduce_max(
     input: Tensor,
     axis: int,
@@ -488,7 +470,9 @@ def tensor(
     tensor = ark.tensor([1, 2], dtype=ark.fp16)
     """
     return Tensor(
-        _tensor(shape, dtype, strides, offsets, padded_shape, rank, name)
+        _cpp_tensor(
+            shape, dtype, strides, offsets, padded_shape, rank, None, name
+        )
     )
 
 
@@ -554,7 +538,7 @@ def parameter(
     Construct a parameter with given shape and data type.
     """
     return Parameter(
-        _tensor(shape, dtype, strides, offsets, padded_shape, name)
+        _cpp_tensor(shape, dtype, strides, offsets, padded_shape, None, name)
     )
 
 
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 45a54d169..dec64682f 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -2,10 +2,10 @@
 # Licensed under the MIT license.
 
 import numpy as np
-from typing import Callable, List, Union, Type
+from typing import Callable, Iterable, List, Union, Type
 
 from ._ark_core import _Dims, _Tensor, _NullTensor
-from .data_type import DataType
+from .data_type import DataType, fp32
 from .runtime import Runtime
 from .model import Model
 
@@ -137,7 +137,8 @@ def from_dlpack(ext_tensor) -> "Tensor":
         """
         Copies the tensor from a DLPack tensor to the device.
         """
-        return Tensor(_Tensor(ext_tensor))
+        # return Tensor(_Tensor(ext_tensor))
+        raise NotImplementedError("from_dlpack is not implemented yet")
 
     def to_torch(self) -> torch.Tensor:
         """
@@ -162,7 +163,14 @@ def from_torch(tensor: torch.Tensor) -> "Tensor":
             raise ValueError("Torch tensor must be contiguous.")
         elif tensor.device.type == "cpu":
             raise ValueError("Torch tensor must be on a device.")
-        ark_tensor = Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor))
+        # TODO: support strides and offsets
+        ark_tensor = Tensor(
+            _cpp_tensor(
+                shape=list(tensor.shape),
+                dtype=DataType.from_torch(tensor.dtype),
+                data=tensor.data_ptr(),
+            )
+        )
         # Share ownership of the memory with the torch tensor
         ark_tensor.__torch_buffer__ = tensor
         return ark_tensor
@@ -263,3 +271,57 @@ def update_gradient(self, ark_tensor: Tensor):
         if ark_tensor is None or not isinstance(ark_tensor, Tensor):
             raise ValueError("cannot use non-ARK tensor to update ARK gradient")
         self.staged_tensor = ark_tensor
+
+
+def _is_list_or_tuple(obj):
+    return isinstance(obj, list) or isinstance(obj, tuple)
+
+
+def _cpp_tensor(
+    shape: Iterable[int],
+    dtype: DataType = fp32,
+    strides: Iterable[int] = [],
+    offsets: Iterable[int] = [],
+    padded_shape: Iterable[int] = [],
+    rank: int = -1,
+    data: int = None,
+    name: str = "",
+) -> Tensor:
+    if not _is_list_or_tuple(shape):
+        raise ValueError("shape should be a list or tuple of integers")
+    if not _is_list_or_tuple(strides):
+        raise ValueError("strides should be a list or tuple of integers")
+    if not _is_list_or_tuple(offsets):
+        raise ValueError("offsets should be a list or tuple of integers")
+    if not _is_list_or_tuple(padded_shape):
+        raise ValueError("padded_shape should be a list or tuple of integers")
+    # only support tensors with up to 4 dimensions
+    if (
+        len(shape) > 4
+        or len(strides) > 4
+        or len(offsets) > 4
+        or len(padded_shape) > 4
+    ):
+        raise ValueError("Only support tensors with up to 4 dimensions")
+    if data is not None:
+        cpp_tensor = Model.get_model().placeholder(
+            Dims(shape),
+            dtype.ctype(),
+            Dims(strides),
+            Dims(offsets),
+            Dims(padded_shape),
+            rank,
+            data,
+            name,
+        )
+    else:
+        cpp_tensor = Model.get_model().tensor(
+            Dims(shape),
+            dtype.ctype(),
+            Dims(strides),
+            Dims(offsets),
+            Dims(padded_shape),
+            rank,
+            name,
+        )
+    return cpp_tensor
diff --git a/python/model_py.cpp b/python/model_py.cpp
index c224a3d5b..76740ff1a 100644
--- a/python/model_py.cpp
+++ b/python/model_py.cpp
@@ -71,6 +71,19 @@ void register_model(py::module &m) {
              py::arg("input"), py::arg("other"), py::arg("output"),
              py::arg("name"))
         .def("noop", &ark::Model::noop, py::arg("input"), py::arg("name"))
+        .def(
+            "placeholder",
+            [](ark::Model &model, const ark::Dims &shape,
+               const ark::DataType &data_type, const ark::Dims &strides,
+               const ark::Dims &offsets, const ark::Dims &padded_shape,
+               int rank, uintptr_t data, const std::string &name) {
+                return model.placeholder(shape, data_type, strides, offsets,
+                                         padded_shape, rank,
+                                         reinterpret_cast<void *>(data), name);
+            },
+            py::arg("shape"), py::arg("data_type"), py::arg("strides"),
+            py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"),
+            py::arg("data"), py::arg("name"))
         .def("reduce_max", &ark::Model::reduce_max, py::arg("input"),
              py::arg("axis"), py::arg("keepdims"), py::arg("output"),
              py::arg("name"))
@@ -104,14 +117,9 @@ void register_model(py::module &m) {
                                const std::string &>(&ark::Model::sub),
              py::arg("input"), py::arg("other"), py::arg("output"),
              py::arg("name"))
-        .def("tensor",
-             py::overload_cast<const ark::Dims &, const ark::DataType &,
-                               const ark::Dims &, const ark::Dims &,
-                               const ark::Dims &, int, const std::string &>(
-                 &ark::Model::tensor),
-             py::arg("shape"), py::arg("data_type"), py::arg("strides"),
-             py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"),
-             py::arg("name"))
+        .def("tensor", &ark::Model::tensor, py::arg("shape"),
+             py::arg("data_type"), py::arg("strides"), py::arg("offsets"),
+             py::arg("padded_shape"), py::arg("rank"), py::arg("name"))
         .def("transpose", &ark::Model::transpose, py::arg("input"),
              py::arg("permutation"), py::arg("output"), py::arg("name"))
         .def("all_reduce", &ark::Model::all_reduce, py::arg("input"),
diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp
index 5abb35c66..74ca7f1af 100644
--- a/python/tensor_py.cpp
+++ b/python/tensor_py.cpp
@@ -69,19 +69,20 @@ static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) {
 
 void register_tensor(py::module& m) {
     py::class_<ark::Tensor>(m, "_Tensor")
-        .def(py::init([](py::capsule capsule) {
-            DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule;
-            if (!dl_tensor) {
-                ERR(ark::InvalidUsageError,
-                    "Capsule does not contain a DLManagedTensor");
-            }
-            DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor);
-            int32_t device_id = metadata.device_id;
-            void* data_ptr = metadata.data_ptr;
-            auto shape = metadata.shape;
+        // .def(py::init([](py::capsule capsule) {
+        //     DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule;
+        //     if (!dl_tensor) {
+        //         ERR(ark::InvalidUsageError,
+        //             "Capsule does not contain a DLManagedTensor");
+        //     }
+        //     DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor);
+        //     int32_t device_id = metadata.device_id;
+        //     void* data_ptr = metadata.data_ptr;
+        //     auto shape = metadata.shape;
 
-            return ark::Tensor(data_ptr, device_id, shape, from_dl_dtype(metadata.dtype));
-        }))
+        //     return ark::Tensor(data_ptr, device_id, shape,
+        //     from_dl_dtype(metadata.dtype));
+        // }))
         .def("id", &ark::Tensor::id)
         .def("shape", &ark::Tensor::shape)
         .def("strides", &ark::Tensor::strides)